@smake/eigen 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/eigen/Eigen/AccelerateSupport +52 -0
- package/eigen/Eigen/Cholesky +18 -20
- package/eigen/Eigen/CholmodSupport +28 -28
- package/eigen/Eigen/Core +187 -120
- package/eigen/Eigen/Eigenvalues +16 -13
- package/eigen/Eigen/Geometry +18 -18
- package/eigen/Eigen/Householder +9 -7
- package/eigen/Eigen/IterativeLinearSolvers +8 -4
- package/eigen/Eigen/Jacobi +14 -13
- package/eigen/Eigen/KLUSupport +23 -21
- package/eigen/Eigen/LU +15 -16
- package/eigen/Eigen/MetisSupport +12 -12
- package/eigen/Eigen/OrderingMethods +54 -51
- package/eigen/Eigen/PaStiXSupport +23 -21
- package/eigen/Eigen/PardisoSupport +17 -14
- package/eigen/Eigen/QR +18 -20
- package/eigen/Eigen/QtAlignedMalloc +5 -12
- package/eigen/Eigen/SPQRSupport +21 -14
- package/eigen/Eigen/SVD +23 -17
- package/eigen/Eigen/Sparse +1 -2
- package/eigen/Eigen/SparseCholesky +18 -15
- package/eigen/Eigen/SparseCore +18 -17
- package/eigen/Eigen/SparseLU +9 -9
- package/eigen/Eigen/SparseQR +16 -14
- package/eigen/Eigen/StdDeque +5 -2
- package/eigen/Eigen/StdList +5 -2
- package/eigen/Eigen/StdVector +5 -2
- package/eigen/Eigen/SuperLUSupport +30 -24
- package/eigen/Eigen/ThreadPool +80 -0
- package/eigen/Eigen/UmfPackSupport +19 -17
- package/eigen/Eigen/Version +14 -0
- package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
- package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/LDLT.h +366 -405
- package/eigen/Eigen/src/Cholesky/LLT.h +323 -367
- package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
- package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +585 -529
- package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/ArithmeticSequence.h +143 -317
- package/eigen/Eigen/src/Core/Array.h +329 -370
- package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
- package/eigen/Eigen/src/Core/ArrayWrapper.h +126 -170
- package/eigen/Eigen/src/Core/Assign.h +30 -40
- package/eigen/Eigen/src/Core/AssignEvaluator.h +651 -604
- package/eigen/Eigen/src/Core/Assign_MKL.h +125 -120
- package/eigen/Eigen/src/Core/BandMatrix.h +267 -282
- package/eigen/Eigen/src/Core/Block.h +371 -390
- package/eigen/Eigen/src/Core/CommaInitializer.h +85 -100
- package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
- package/eigen/Eigen/src/Core/CoreEvaluators.h +1214 -937
- package/eigen/Eigen/src/Core/CoreIterators.h +72 -63
- package/eigen/Eigen/src/Core/CwiseBinaryOp.h +112 -129
- package/eigen/Eigen/src/Core/CwiseNullaryOp.h +676 -702
- package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
- package/eigen/Eigen/src/Core/CwiseUnaryOp.h +55 -67
- package/eigen/Eigen/src/Core/CwiseUnaryView.h +127 -92
- package/eigen/Eigen/src/Core/DenseBase.h +630 -658
- package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -628
- package/eigen/Eigen/src/Core/DenseStorage.h +511 -590
- package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
- package/eigen/Eigen/src/Core/Diagonal.h +168 -207
- package/eigen/Eigen/src/Core/DiagonalMatrix.h +346 -317
- package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
- package/eigen/Eigen/src/Core/Dot.h +167 -217
- package/eigen/Eigen/src/Core/EigenBase.h +74 -85
- package/eigen/Eigen/src/Core/Fill.h +138 -0
- package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
- package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -113
- package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
- package/eigen/Eigen/src/Core/GeneralProduct.h +315 -261
- package/eigen/Eigen/src/Core/GenericPacketMath.h +1182 -520
- package/eigen/Eigen/src/Core/GlobalFunctions.h +193 -157
- package/eigen/Eigen/src/Core/IO.h +131 -156
- package/eigen/Eigen/src/Core/IndexedView.h +209 -125
- package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
- package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/Inverse.h +50 -59
- package/eigen/Eigen/src/Core/Map.h +123 -141
- package/eigen/Eigen/src/Core/MapBase.h +255 -282
- package/eigen/Eigen/src/Core/MathFunctions.h +1247 -1201
- package/eigen/Eigen/src/Core/MathFunctionsImpl.h +162 -99
- package/eigen/Eigen/src/Core/Matrix.h +463 -494
- package/eigen/Eigen/src/Core/MatrixBase.h +468 -470
- package/eigen/Eigen/src/Core/NestByValue.h +58 -52
- package/eigen/Eigen/src/Core/NoAlias.h +79 -86
- package/eigen/Eigen/src/Core/NumTraits.h +206 -206
- package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +163 -142
- package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
- package/eigen/Eigen/src/Core/PlainObjectBase.h +858 -972
- package/eigen/Eigen/src/Core/Product.h +246 -130
- package/eigen/Eigen/src/Core/ProductEvaluators.h +779 -671
- package/eigen/Eigen/src/Core/Random.h +153 -164
- package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
- package/eigen/Eigen/src/Core/RealView.h +250 -0
- package/eigen/Eigen/src/Core/Redux.h +334 -314
- package/eigen/Eigen/src/Core/Ref.h +259 -257
- package/eigen/Eigen/src/Core/Replicate.h +92 -104
- package/eigen/Eigen/src/Core/Reshaped.h +215 -271
- package/eigen/Eigen/src/Core/ReturnByValue.h +47 -55
- package/eigen/Eigen/src/Core/Reverse.h +133 -148
- package/eigen/Eigen/src/Core/Select.h +68 -140
- package/eigen/Eigen/src/Core/SelfAdjointView.h +254 -290
- package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
- package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
- package/eigen/Eigen/src/Core/Solve.h +88 -102
- package/eigen/Eigen/src/Core/SolveTriangular.h +126 -124
- package/eigen/Eigen/src/Core/SolverBase.h +132 -133
- package/eigen/Eigen/src/Core/StableNorm.h +113 -147
- package/eigen/Eigen/src/Core/StlIterators.h +404 -248
- package/eigen/Eigen/src/Core/Stride.h +90 -92
- package/eigen/Eigen/src/Core/Swap.h +70 -39
- package/eigen/Eigen/src/Core/Transpose.h +258 -295
- package/eigen/Eigen/src/Core/Transpositions.h +270 -333
- package/eigen/Eigen/src/Core/TriangularMatrix.h +642 -743
- package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
- package/eigen/Eigen/src/Core/VectorwiseOp.h +653 -704
- package/eigen/Eigen/src/Core/Visitor.h +464 -308
- package/eigen/Eigen/src/Core/arch/AVX/Complex.h +380 -187
- package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +65 -163
- package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2145 -638
- package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
- package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +253 -60
- package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +278 -228
- package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +48 -269
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1597 -754
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
- package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +229 -41
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +420 -184
- package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +40 -49
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2962 -2213
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +196 -212
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +713 -441
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2380 -1362
- package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
- package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +390 -224
- package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +78 -67
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1784 -799
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +167 -50
- package/eigen/Eigen/src/Core/arch/Default/Half.h +528 -379
- package/eigen/Eigen/src/Core/arch/Default/Settings.h +10 -12
- package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
- package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +41 -40
- package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +550 -523
- package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
- package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +27 -30
- package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +8 -8
- package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
- package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
- package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
- package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
- package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
- package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
- package/eigen/Eigen/src/Core/arch/MSA/Complex.h +54 -82
- package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +84 -92
- package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +51 -47
- package/eigen/Eigen/src/Core/arch/NEON/Complex.h +454 -306
- package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +175 -115
- package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +23 -30
- package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4366 -2857
- package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +616 -393
- package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
- package/eigen/Eigen/src/Core/arch/SSE/Complex.h +350 -198
- package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +38 -149
- package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +1791 -912
- package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
- package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +128 -40
- package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +10 -6
- package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +156 -234
- package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +6 -3
- package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +27 -32
- package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +119 -117
- package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +325 -419
- package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +15 -17
- package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +325 -181
- package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +94 -83
- package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +811 -458
- package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +121 -124
- package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +576 -370
- package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +194 -109
- package/eigen/Eigen/src/Core/functors/StlFunctors.h +95 -112
- package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
- package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1038 -749
- package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1883 -1375
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +312 -370
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +189 -176
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +84 -81
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +292 -337
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
- package/eigen/Eigen/src/Core/products/Parallelizer.h +207 -105
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +327 -388
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +138 -147
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
- package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
- package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -47
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -277
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
- package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +68 -94
- package/eigen/Eigen/src/Core/util/Assert.h +158 -0
- package/eigen/Eigen/src/Core/util/BlasUtil.h +342 -303
- package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +348 -317
- package/eigen/Eigen/src/Core/util/Constants.h +297 -262
- package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -90
- package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
- package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +449 -247
- package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
- package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
- package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +417 -116
- package/eigen/Eigen/src/Core/util/IntegralConstant.h +211 -204
- package/eigen/Eigen/src/Core/util/MKL_support.h +39 -37
- package/eigen/Eigen/src/Core/util/Macros.h +655 -773
- package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
- package/eigen/Eigen/src/Core/util/Memory.h +970 -748
- package/eigen/Eigen/src/Core/util/Meta.h +581 -633
- package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
- package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
- package/eigen/Eigen/src/Core/util/ReshapedHelper.h +17 -17
- package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
- package/eigen/Eigen/src/Core/util/StaticAssert.h +50 -166
- package/eigen/Eigen/src/Core/util/SymbolicIndex.h +377 -225
- package/eigen/Eigen/src/Core/util/XprHelper.h +784 -547
- package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
- package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
- package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
- package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
- package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
- package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +89 -105
- package/eigen/Eigen/src/Eigenvalues/RealQZ.h +537 -607
- package/eigen/Eigen/src/Eigenvalues/RealSchur.h +342 -381
- package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +541 -595
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
- package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +430 -462
- package/eigen/Eigen/src/Geometry/AlignedBox.h +226 -227
- package/eigen/Eigen/src/Geometry/AngleAxis.h +131 -133
- package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
- package/eigen/Eigen/src/Geometry/Homogeneous.h +285 -333
- package/eigen/Eigen/src/Geometry/Hyperplane.h +151 -160
- package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -146
- package/eigen/Eigen/src/Geometry/ParametrizedLine.h +127 -127
- package/eigen/Eigen/src/Geometry/Quaternion.h +566 -506
- package/eigen/Eigen/src/Geometry/Rotation2D.h +107 -105
- package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
- package/eigen/Eigen/src/Geometry/Scaling.h +113 -106
- package/eigen/Eigen/src/Geometry/Transform.h +858 -936
- package/eigen/Eigen/src/Geometry/Translation.h +94 -92
- package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
- package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +90 -104
- package/eigen/Eigen/src/Householder/BlockHouseholder.h +51 -46
- package/eigen/Eigen/src/Householder/Householder.h +102 -124
- package/eigen/Eigen/src/Householder/HouseholderSequence.h +412 -453
- package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +149 -162
- package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +124 -119
- package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +92 -104
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +251 -243
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +224 -228
- package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +178 -227
- package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +79 -84
- package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +54 -60
- package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Jacobi/Jacobi.h +252 -308
- package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/KLUSupport/KLUSupport.h +208 -227
- package/eigen/Eigen/src/LU/Determinant.h +50 -69
- package/eigen/Eigen/src/LU/FullPivLU.h +545 -596
- package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/LU/InverseImpl.h +206 -285
- package/eigen/Eigen/src/LU/PartialPivLU.h +390 -428
- package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
- package/eigen/Eigen/src/LU/arch/InverseSize4.h +72 -70
- package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
- package/eigen/Eigen/src/OrderingMethods/Amd.h +243 -265
- package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +831 -1004
- package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/OrderingMethods/Ordering.h +112 -119
- package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
- package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -430
- package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +479 -479
- package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
- package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +166 -153
- package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +495 -475
- package/eigen/Eigen/src/QR/HouseholderQR.h +394 -285
- package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
- package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +244 -264
- package/eigen/Eigen/src/SVD/BDCSVD.h +817 -713
- package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
- package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SVD/JacobiSVD.h +577 -543
- package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
- package/eigen/Eigen/src/SVD/SVDBase.h +242 -182
- package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +200 -235
- package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +765 -594
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +308 -94
- package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
- package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -252
- package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +134 -178
- package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCore/SparseAssign.h +149 -140
- package/eigen/Eigen/src/SparseCore/SparseBlock.h +403 -440
- package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
- package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +525 -303
- package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +555 -339
- package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
- package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +169 -197
- package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
- package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
- package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
- package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
- package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1603 -1245
- package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -350
- package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
- package/eigen/Eigen/src/SparseCore/SparseProduct.h +94 -97
- package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
- package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
- package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +370 -416
- package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
- package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
- package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
- package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
- package/eigen/Eigen/src/SparseCore/SparseUtil.h +138 -115
- package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
- package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
- package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
- package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseLU/SparseLU.h +756 -710
- package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
- package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
- package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
- package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +245 -301
- package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
- package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
- package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +89 -100
- package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
- package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
- package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +124 -132
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
- package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
- package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
- package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
- package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseQR/SparseQR.h +450 -502
- package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -93
- package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
- package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
- package/eigen/Eigen/src/StlSupport/details.h +48 -50
- package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -730
- package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
- package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
- package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
- package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
- package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
- package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
- package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
- package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
- package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
- package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
- package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
- package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
- package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +428 -464
- package/eigen/Eigen/src/misc/Image.h +41 -43
- package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/misc/Kernel.h +39 -41
- package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
- package/eigen/Eigen/src/misc/blas.h +83 -426
- package/eigen/Eigen/src/misc/lapacke.h +9972 -16179
- package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
- package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
- package/eigen/Eigen/src/plugins/{BlockMethods.h → BlockMethods.inc} +434 -506
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
- package/eigen/Eigen/src/plugins/{CommonCwiseUnaryOps.h → CommonCwiseUnaryOps.inc} +58 -68
- package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
- package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
- package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
- package/package.json +1 -1
- package/eigen/COPYING.APACHE +0 -203
- package/eigen/COPYING.BSD +0 -26
- package/eigen/COPYING.GPL +0 -674
- package/eigen/COPYING.LGPL +0 -502
- package/eigen/COPYING.MINPACK +0 -51
- package/eigen/COPYING.MPL2 +0 -373
- package/eigen/COPYING.README +0 -18
- package/eigen/Eigen/src/Core/BooleanRedux.h +0 -162
- package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -258
- package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +0 -120
- package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +0 -694
- package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
- package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
- package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
- package/eigen/Eigen/src/misc/lapack.h +0 -152
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -358
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -696
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
- package/eigen/Eigen/src/plugins/IndexedViewMethods.h +0 -262
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -95
- package/eigen/Eigen/src/plugins/ReshapedMethods.h +0 -149
- package/eigen/README.md +0 -5
|
@@ -10,6 +10,9 @@
|
|
|
10
10
|
#ifndef EIGEN_PACKET_MATH_GPU_H
|
|
11
11
|
#define EIGEN_PACKET_MATH_GPU_H
|
|
12
12
|
|
|
13
|
+
// IWYU pragma: private
|
|
14
|
+
#include "../../InternalHeaderCheck.h"
|
|
15
|
+
|
|
13
16
|
namespace Eigen {
|
|
14
17
|
|
|
15
18
|
namespace internal {
|
|
@@ -28,29 +31,43 @@ namespace internal {
|
|
|
28
31
|
#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
|
|
29
32
|
#endif
|
|
30
33
|
|
|
34
|
+
// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
|
|
35
|
+
// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
|
|
36
|
+
// of the functions, while the latter can only deal with one of them.
|
|
37
|
+
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
|
|
38
|
+
#define EIGEN_HAS_GPU_DEVICE_FUNCTIONS 1
|
|
39
|
+
#else
|
|
40
|
+
#define EIGEN_HAS_GPU_DEVICE_FUNCTIONS 0
|
|
41
|
+
#endif
|
|
42
|
+
|
|
31
43
|
// Make sure this is only available when targeting a GPU: we don't want to
|
|
32
44
|
// introduce conflicts between these packet_traits definitions and the ones
|
|
33
45
|
// we'll use on the host side (SSE, AVX, ...)
|
|
34
46
|
#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
|
|
35
47
|
|
|
36
|
-
template<>
|
|
37
|
-
|
|
48
|
+
template <>
|
|
49
|
+
struct is_arithmetic<float4> {
|
|
50
|
+
enum { value = true };
|
|
51
|
+
};
|
|
52
|
+
template <>
|
|
53
|
+
struct is_arithmetic<double2> {
|
|
54
|
+
enum { value = true };
|
|
55
|
+
};
|
|
38
56
|
|
|
39
|
-
template<>
|
|
40
|
-
{
|
|
57
|
+
template <>
|
|
58
|
+
struct packet_traits<float> : default_packet_traits {
|
|
41
59
|
typedef float4 type;
|
|
42
60
|
typedef float4 half;
|
|
43
61
|
enum {
|
|
44
62
|
Vectorizable = 1,
|
|
45
63
|
AlignedOnScalar = 1,
|
|
46
|
-
size=4,
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
HasExp = 1,
|
|
64
|
+
size = 4,
|
|
65
|
+
|
|
66
|
+
HasDiv = 1,
|
|
67
|
+
HasSin = 0,
|
|
68
|
+
HasCos = 0,
|
|
69
|
+
HasLog = 1,
|
|
70
|
+
HasExp = 1,
|
|
54
71
|
HasSqrt = 1,
|
|
55
72
|
HasRsqrt = 1,
|
|
56
73
|
HasLGamma = 1,
|
|
@@ -69,22 +86,22 @@ template<> struct packet_traits<float> : default_packet_traits
|
|
|
69
86
|
|
|
70
87
|
HasBlend = 0,
|
|
71
88
|
HasFloor = 1,
|
|
89
|
+
HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS
|
|
72
90
|
};
|
|
73
91
|
};
|
|
74
92
|
|
|
75
|
-
template<>
|
|
76
|
-
{
|
|
93
|
+
template <>
|
|
94
|
+
struct packet_traits<double> : default_packet_traits {
|
|
77
95
|
typedef double2 type;
|
|
78
96
|
typedef double2 half;
|
|
79
97
|
enum {
|
|
80
98
|
Vectorizable = 1,
|
|
81
99
|
AlignedOnScalar = 1,
|
|
82
|
-
size=2,
|
|
83
|
-
HasHalfPacket = 0,
|
|
100
|
+
size = 2,
|
|
84
101
|
|
|
85
|
-
HasDiv
|
|
86
|
-
HasLog
|
|
87
|
-
HasExp
|
|
102
|
+
HasDiv = 1,
|
|
103
|
+
HasLog = 1,
|
|
104
|
+
HasExp = 1,
|
|
88
105
|
HasSqrt = 1,
|
|
89
106
|
HasRsqrt = 1,
|
|
90
107
|
HasLGamma = 1,
|
|
@@ -100,365 +117,440 @@ template<> struct packet_traits<double> : default_packet_traits
|
|
|
100
117
|
HasGammaSampleDerAlpha = 1,
|
|
101
118
|
HasIGammac = 1,
|
|
102
119
|
HasBetaInc = 1,
|
|
103
|
-
|
|
104
120
|
HasBlend = 0,
|
|
105
|
-
HasFloor = 1,
|
|
106
121
|
};
|
|
107
122
|
};
|
|
108
123
|
|
|
124
|
+
template <>
|
|
125
|
+
struct unpacket_traits<float4> {
|
|
126
|
+
typedef float type;
|
|
127
|
+
enum {
|
|
128
|
+
size = 4,
|
|
129
|
+
alignment = Aligned16,
|
|
130
|
+
vectorizable = true,
|
|
131
|
+
masked_load_available = false,
|
|
132
|
+
masked_store_available = false
|
|
133
|
+
};
|
|
134
|
+
typedef float4 half;
|
|
135
|
+
};
|
|
136
|
+
template <>
|
|
137
|
+
struct unpacket_traits<double2> {
|
|
138
|
+
typedef double type;
|
|
139
|
+
enum {
|
|
140
|
+
size = 2,
|
|
141
|
+
alignment = Aligned16,
|
|
142
|
+
vectorizable = true,
|
|
143
|
+
masked_load_available = false,
|
|
144
|
+
masked_store_available = false
|
|
145
|
+
};
|
|
146
|
+
typedef double2 half;
|
|
147
|
+
};
|
|
109
148
|
|
|
110
|
-
template<>
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
|
|
149
|
+
template <>
|
|
150
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
|
|
114
151
|
return make_float4(from, from, from, from);
|
|
115
152
|
}
|
|
116
|
-
template<>
|
|
153
|
+
template <>
|
|
154
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
|
|
117
155
|
return make_double2(from, from);
|
|
118
156
|
}
|
|
119
157
|
|
|
120
|
-
|
|
121
|
-
// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
|
|
122
|
-
// of the functions, while the latter can only deal with one of them.
|
|
123
|
-
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
|
|
124
|
-
namespace {
|
|
158
|
+
#if EIGEN_HAS_GPU_DEVICE_FUNCTIONS
|
|
125
159
|
|
|
126
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
|
|
127
|
-
const float& b) {
|
|
160
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, const float& b) {
|
|
128
161
|
return __int_as_float(__float_as_int(a) & __float_as_int(b));
|
|
129
162
|
}
|
|
130
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a,
|
|
131
|
-
|
|
132
|
-
return __longlong_as_double(__double_as_longlong(a) &
|
|
133
|
-
__double_as_longlong(b));
|
|
163
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a, const double& b) {
|
|
164
|
+
return __longlong_as_double(__double_as_longlong(a) & __double_as_longlong(b));
|
|
134
165
|
}
|
|
135
166
|
|
|
136
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a,
|
|
137
|
-
const float& b) {
|
|
167
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a, const float& b) {
|
|
138
168
|
return __int_as_float(__float_as_int(a) | __float_as_int(b));
|
|
139
169
|
}
|
|
140
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a,
|
|
141
|
-
|
|
142
|
-
return __longlong_as_double(__double_as_longlong(a) |
|
|
143
|
-
__double_as_longlong(b));
|
|
170
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a, const double& b) {
|
|
171
|
+
return __longlong_as_double(__double_as_longlong(a) | __double_as_longlong(b));
|
|
144
172
|
}
|
|
145
173
|
|
|
146
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a,
|
|
147
|
-
const float& b) {
|
|
174
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a, const float& b) {
|
|
148
175
|
return __int_as_float(__float_as_int(a) ^ __float_as_int(b));
|
|
149
176
|
}
|
|
150
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a,
|
|
151
|
-
|
|
152
|
-
return __longlong_as_double(__double_as_longlong(a) ^
|
|
153
|
-
__double_as_longlong(b));
|
|
177
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a, const double& b) {
|
|
178
|
+
return __longlong_as_double(__double_as_longlong(a) ^ __double_as_longlong(b));
|
|
154
179
|
}
|
|
155
180
|
|
|
156
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a,
|
|
157
|
-
const float& b) {
|
|
181
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a, const float& b) {
|
|
158
182
|
return __int_as_float(__float_as_int(a) & ~__float_as_int(b));
|
|
159
183
|
}
|
|
160
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a,
|
|
161
|
-
|
|
162
|
-
return __longlong_as_double(__double_as_longlong(a) &
|
|
163
|
-
~__double_as_longlong(b));
|
|
184
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a, const double& b) {
|
|
185
|
+
return __longlong_as_double(__double_as_longlong(a) & ~__double_as_longlong(b));
|
|
164
186
|
}
|
|
165
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a,
|
|
166
|
-
const float& b) {
|
|
187
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a, const float& b) {
|
|
167
188
|
return __int_as_float(a == b ? 0xffffffffu : 0u);
|
|
168
189
|
}
|
|
169
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a,
|
|
170
|
-
const double& b) {
|
|
190
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a, const double& b) {
|
|
171
191
|
return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull);
|
|
172
192
|
}
|
|
173
193
|
|
|
174
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a,
|
|
175
|
-
const float& b) {
|
|
194
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a, const float& b) {
|
|
176
195
|
return __int_as_float(a < b ? 0xffffffffu : 0u);
|
|
177
196
|
}
|
|
178
|
-
|
|
179
|
-
|
|
197
|
+
|
|
198
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a, const double& b) {
|
|
180
199
|
return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull);
|
|
181
200
|
}
|
|
182
201
|
|
|
183
|
-
|
|
202
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float le_mask(const float& a, const float& b) {
|
|
203
|
+
return __int_as_float(a <= b ? 0xffffffffu : 0u);
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double le_mask(const double& a, const double& b) {
|
|
207
|
+
return __longlong_as_double(a <= b ? 0xffffffffffffffffull : 0ull);
|
|
208
|
+
}
|
|
184
209
|
|
|
185
210
|
template <>
|
|
186
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a,
|
|
187
|
-
|
|
188
|
-
return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y),
|
|
189
|
-
bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
|
|
211
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a, const float4& b) {
|
|
212
|
+
return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y), bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
|
|
190
213
|
}
|
|
191
214
|
template <>
|
|
192
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a,
|
|
193
|
-
const double2& b) {
|
|
215
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a, const double2& b) {
|
|
194
216
|
return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y));
|
|
195
217
|
}
|
|
196
218
|
|
|
197
219
|
template <>
|
|
198
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a,
|
|
199
|
-
|
|
200
|
-
return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y),
|
|
201
|
-
bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
|
|
220
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a, const float4& b) {
|
|
221
|
+
return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y), bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
|
|
202
222
|
}
|
|
203
223
|
template <>
|
|
204
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a,
|
|
205
|
-
const double2& b) {
|
|
224
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a, const double2& b) {
|
|
206
225
|
return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y));
|
|
207
226
|
}
|
|
208
227
|
|
|
209
228
|
template <>
|
|
210
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a,
|
|
211
|
-
|
|
212
|
-
return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y),
|
|
213
|
-
bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
|
|
229
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a, const float4& b) {
|
|
230
|
+
return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y), bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
|
|
214
231
|
}
|
|
215
232
|
template <>
|
|
216
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a,
|
|
217
|
-
const double2& b) {
|
|
233
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a, const double2& b) {
|
|
218
234
|
return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y));
|
|
219
235
|
}
|
|
220
236
|
|
|
221
237
|
template <>
|
|
222
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a,
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w));
|
|
238
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a, const float4& b) {
|
|
239
|
+
return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y), bitwise_andnot(a.z, b.z),
|
|
240
|
+
bitwise_andnot(a.w, b.w));
|
|
226
241
|
}
|
|
227
242
|
template <>
|
|
228
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
|
|
229
|
-
pandnot<double2>(const double2& a, const double2& b) {
|
|
243
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pandnot<double2>(const double2& a, const double2& b) {
|
|
230
244
|
return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y));
|
|
231
245
|
}
|
|
232
246
|
|
|
233
247
|
template <>
|
|
234
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a,
|
|
235
|
-
|
|
236
|
-
return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z),
|
|
237
|
-
eq_mask(a.w, b.w));
|
|
248
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a, const float4& b) {
|
|
249
|
+
return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z), eq_mask(a.w, b.w));
|
|
238
250
|
}
|
|
239
251
|
template <>
|
|
240
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt<float4>(const float4& a,
|
|
241
|
-
|
|
242
|
-
return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z),
|
|
243
|
-
lt_mask(a.w, b.w));
|
|
252
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt<float4>(const float4& a, const float4& b) {
|
|
253
|
+
return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z), lt_mask(a.w, b.w));
|
|
244
254
|
}
|
|
245
255
|
template <>
|
|
246
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
|
247
|
-
|
|
256
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_le<float4>(const float4& a, const float4& b) {
|
|
257
|
+
return make_float4(le_mask(a.x, b.x), le_mask(a.y, b.y), le_mask(a.z, b.z), le_mask(a.w, b.w));
|
|
258
|
+
}
|
|
259
|
+
template <>
|
|
260
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_eq<double2>(const double2& a, const double2& b) {
|
|
248
261
|
return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
|
|
249
262
|
}
|
|
250
263
|
template <>
|
|
251
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
|
|
252
|
-
pcmp_lt<double2>(const double2& a, const double2& b) {
|
|
264
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_lt<double2>(const double2& a, const double2& b) {
|
|
253
265
|
return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y));
|
|
254
266
|
}
|
|
255
|
-
|
|
267
|
+
template <>
|
|
268
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_le<double2>(const double2& a, const double2& b) {
|
|
269
|
+
return make_double2(le_mask(a.x, b.x), le_mask(a.y, b.y));
|
|
270
|
+
}
|
|
271
|
+
#endif // EIGEN_HAS_GPU_DEVICE_FUNCTIONS
|
|
256
272
|
|
|
257
|
-
template<>
|
|
258
|
-
|
|
273
|
+
template <>
|
|
274
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
|
|
275
|
+
return make_float4(a, a + 1, a + 2, a + 3);
|
|
259
276
|
}
|
|
260
|
-
template<>
|
|
261
|
-
|
|
277
|
+
template <>
|
|
278
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
|
|
279
|
+
return make_double2(a, a + 1);
|
|
262
280
|
}
|
|
263
281
|
|
|
264
|
-
template<>
|
|
265
|
-
|
|
282
|
+
template <>
|
|
283
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
|
|
284
|
+
return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
|
|
266
285
|
}
|
|
267
|
-
template<>
|
|
268
|
-
|
|
286
|
+
template <>
|
|
287
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
|
|
288
|
+
return make_double2(a.x + b.x, a.y + b.y);
|
|
269
289
|
}
|
|
270
290
|
|
|
271
|
-
template<>
|
|
272
|
-
|
|
291
|
+
template <>
|
|
292
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
|
|
293
|
+
return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
|
|
273
294
|
}
|
|
274
|
-
template<>
|
|
275
|
-
|
|
295
|
+
template <>
|
|
296
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
|
|
297
|
+
return make_double2(a.x - b.x, a.y - b.y);
|
|
276
298
|
}
|
|
277
299
|
|
|
278
|
-
template<>
|
|
300
|
+
template <>
|
|
301
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
|
|
279
302
|
return make_float4(-a.x, -a.y, -a.z, -a.w);
|
|
280
303
|
}
|
|
281
|
-
template<>
|
|
304
|
+
template <>
|
|
305
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
|
|
282
306
|
return make_double2(-a.x, -a.y);
|
|
283
307
|
}
|
|
284
308
|
|
|
285
|
-
template<>
|
|
286
|
-
|
|
309
|
+
template <>
|
|
310
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) {
|
|
311
|
+
return a;
|
|
312
|
+
}
|
|
313
|
+
template <>
|
|
314
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) {
|
|
315
|
+
return a;
|
|
316
|
+
}
|
|
287
317
|
|
|
288
|
-
template<>
|
|
289
|
-
|
|
318
|
+
template <>
|
|
319
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
|
|
320
|
+
return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
|
|
290
321
|
}
|
|
291
|
-
template<>
|
|
292
|
-
|
|
322
|
+
template <>
|
|
323
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
|
|
324
|
+
return make_double2(a.x * b.x, a.y * b.y);
|
|
293
325
|
}
|
|
294
326
|
|
|
295
|
-
template<>
|
|
296
|
-
|
|
327
|
+
template <>
|
|
328
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
|
|
329
|
+
return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
|
|
297
330
|
}
|
|
298
|
-
template<>
|
|
299
|
-
|
|
331
|
+
template <>
|
|
332
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
|
|
333
|
+
return make_double2(a.x / b.x, a.y / b.y);
|
|
300
334
|
}
|
|
301
335
|
|
|
302
|
-
template<>
|
|
336
|
+
template <>
|
|
337
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
|
|
303
338
|
return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
|
|
304
339
|
}
|
|
305
|
-
template<>
|
|
340
|
+
template <>
|
|
341
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
|
|
306
342
|
return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
|
|
307
343
|
}
|
|
308
344
|
|
|
309
|
-
template<>
|
|
345
|
+
template <>
|
|
346
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
|
|
310
347
|
return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
|
|
311
348
|
}
|
|
312
|
-
template<>
|
|
349
|
+
template <>
|
|
350
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
|
|
313
351
|
return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
|
|
314
352
|
}
|
|
315
353
|
|
|
316
|
-
template<>
|
|
354
|
+
template <>
|
|
355
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
|
|
317
356
|
return *reinterpret_cast<const float4*>(from);
|
|
318
357
|
}
|
|
319
358
|
|
|
320
|
-
template<>
|
|
359
|
+
template <>
|
|
360
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
|
|
321
361
|
return *reinterpret_cast<const double2*>(from);
|
|
322
362
|
}
|
|
323
363
|
|
|
324
|
-
template<>
|
|
364
|
+
template <>
|
|
365
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
|
|
325
366
|
return make_float4(from[0], from[1], from[2], from[3]);
|
|
326
367
|
}
|
|
327
|
-
template<>
|
|
368
|
+
template <>
|
|
369
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
|
|
328
370
|
return make_double2(from[0], from[1]);
|
|
329
371
|
}
|
|
330
372
|
|
|
331
|
-
template<>
|
|
373
|
+
template <>
|
|
374
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
|
|
332
375
|
return make_float4(from[0], from[0], from[1], from[1]);
|
|
333
376
|
}
|
|
334
|
-
template<>
|
|
377
|
+
template <>
|
|
378
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
|
|
335
379
|
return make_double2(from[0], from[0]);
|
|
336
380
|
}
|
|
337
381
|
|
|
338
|
-
template<>
|
|
382
|
+
template <>
|
|
383
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) {
|
|
339
384
|
*reinterpret_cast<float4*>(to) = from;
|
|
340
385
|
}
|
|
341
386
|
|
|
342
|
-
template<>
|
|
387
|
+
template <>
|
|
388
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
|
|
343
389
|
*reinterpret_cast<double2*>(to) = from;
|
|
344
390
|
}
|
|
345
391
|
|
|
346
|
-
template<>
|
|
392
|
+
template <>
|
|
393
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) {
|
|
347
394
|
to[0] = from.x;
|
|
348
395
|
to[1] = from.y;
|
|
349
396
|
to[2] = from.z;
|
|
350
397
|
to[3] = from.w;
|
|
351
398
|
}
|
|
352
399
|
|
|
353
|
-
template<>
|
|
400
|
+
template <>
|
|
401
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
|
|
354
402
|
to[0] = from.x;
|
|
355
403
|
to[1] = from.y;
|
|
356
404
|
}
|
|
357
405
|
|
|
358
|
-
template<>
|
|
406
|
+
template <>
|
|
359
407
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
|
|
360
408
|
#if defined(EIGEN_GPU_HAS_LDG)
|
|
361
|
-
return __ldg(
|
|
409
|
+
return __ldg(reinterpret_cast<const float4*>(from));
|
|
362
410
|
#else
|
|
363
411
|
return make_float4(from[0], from[1], from[2], from[3]);
|
|
364
412
|
#endif
|
|
365
413
|
}
|
|
366
|
-
template<>
|
|
414
|
+
template <>
|
|
367
415
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
|
|
368
416
|
#if defined(EIGEN_GPU_HAS_LDG)
|
|
369
|
-
return __ldg(
|
|
417
|
+
return __ldg(reinterpret_cast<const double2*>(from));
|
|
370
418
|
#else
|
|
371
419
|
return make_double2(from[0], from[1]);
|
|
372
420
|
#endif
|
|
373
421
|
}
|
|
374
422
|
|
|
375
|
-
template<>
|
|
423
|
+
template <>
|
|
376
424
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
|
|
377
425
|
#if defined(EIGEN_GPU_HAS_LDG)
|
|
378
|
-
return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
|
|
426
|
+
return make_float4(__ldg(from + 0), __ldg(from + 1), __ldg(from + 2), __ldg(from + 3));
|
|
379
427
|
#else
|
|
380
428
|
return make_float4(from[0], from[1], from[2], from[3]);
|
|
381
429
|
#endif
|
|
382
430
|
}
|
|
383
|
-
template<>
|
|
431
|
+
template <>
|
|
384
432
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
|
|
385
433
|
#if defined(EIGEN_GPU_HAS_LDG)
|
|
386
|
-
return make_double2(__ldg(from+0), __ldg(from+1));
|
|
434
|
+
return make_double2(__ldg(from + 0), __ldg(from + 1));
|
|
387
435
|
#else
|
|
388
436
|
return make_double2(from[0], from[1]);
|
|
389
437
|
#endif
|
|
390
438
|
}
|
|
391
439
|
|
|
392
|
-
template<>
|
|
393
|
-
|
|
440
|
+
template <>
|
|
441
|
+
EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
|
|
442
|
+
return make_float4(from[0 * stride], from[1 * stride], from[2 * stride], from[3 * stride]);
|
|
394
443
|
}
|
|
395
444
|
|
|
396
|
-
template<>
|
|
397
|
-
|
|
445
|
+
template <>
|
|
446
|
+
EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
|
|
447
|
+
return make_double2(from[0 * stride], from[1 * stride]);
|
|
398
448
|
}
|
|
399
449
|
|
|
400
|
-
template<>
|
|
401
|
-
|
|
402
|
-
to[stride*
|
|
403
|
-
to[stride*
|
|
404
|
-
to[stride*
|
|
450
|
+
template <>
|
|
451
|
+
EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
|
|
452
|
+
to[stride * 0] = from.x;
|
|
453
|
+
to[stride * 1] = from.y;
|
|
454
|
+
to[stride * 2] = from.z;
|
|
455
|
+
to[stride * 3] = from.w;
|
|
405
456
|
}
|
|
406
|
-
template<>
|
|
407
|
-
|
|
408
|
-
to[stride*
|
|
457
|
+
template <>
|
|
458
|
+
EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
|
|
459
|
+
to[stride * 0] = from.x;
|
|
460
|
+
to[stride * 1] = from.y;
|
|
409
461
|
}
|
|
410
462
|
|
|
411
|
-
template<>
|
|
463
|
+
template <>
|
|
464
|
+
EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
|
|
412
465
|
return a.x;
|
|
413
466
|
}
|
|
414
|
-
template<>
|
|
467
|
+
template <>
|
|
468
|
+
EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
|
|
415
469
|
return a.x;
|
|
416
470
|
}
|
|
417
471
|
|
|
418
|
-
template<>
|
|
472
|
+
template <>
|
|
473
|
+
EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) {
|
|
419
474
|
return a.x + a.y + a.z + a.w;
|
|
420
475
|
}
|
|
421
|
-
template<>
|
|
476
|
+
template <>
|
|
477
|
+
EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
|
|
422
478
|
return a.x + a.y;
|
|
423
479
|
}
|
|
424
480
|
|
|
425
|
-
template<>
|
|
481
|
+
template <>
|
|
482
|
+
EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) {
|
|
426
483
|
return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
|
|
427
484
|
}
|
|
428
|
-
template<>
|
|
485
|
+
template <>
|
|
486
|
+
EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
|
|
429
487
|
return fmax(a.x, a.y);
|
|
430
488
|
}
|
|
431
489
|
|
|
432
|
-
template<>
|
|
490
|
+
template <>
|
|
491
|
+
EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) {
|
|
433
492
|
return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
|
|
434
493
|
}
|
|
435
|
-
template<>
|
|
494
|
+
template <>
|
|
495
|
+
EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
|
|
436
496
|
return fmin(a.x, a.y);
|
|
437
497
|
}
|
|
438
498
|
|
|
439
|
-
template<>
|
|
499
|
+
template <>
|
|
500
|
+
EIGEN_DEVICE_FUNC inline float predux_mul<float4>(const float4& a) {
|
|
440
501
|
return a.x * a.y * a.z * a.w;
|
|
441
502
|
}
|
|
442
|
-
template<>
|
|
503
|
+
template <>
|
|
504
|
+
EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
|
|
443
505
|
return a.x * a.y;
|
|
444
506
|
}
|
|
445
507
|
|
|
446
|
-
template<>
|
|
508
|
+
template <>
|
|
509
|
+
EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
|
|
447
510
|
return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
|
|
448
511
|
}
|
|
449
|
-
template<>
|
|
512
|
+
template <>
|
|
513
|
+
EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
|
|
450
514
|
return make_double2(fabs(a.x), fabs(a.y));
|
|
451
515
|
}
|
|
452
516
|
|
|
453
|
-
template<>
|
|
517
|
+
template <>
|
|
518
|
+
EIGEN_DEVICE_FUNC inline float4 pfloor<float4>(const float4& a) {
|
|
454
519
|
return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
|
|
455
520
|
}
|
|
456
|
-
template<>
|
|
521
|
+
template <>
|
|
522
|
+
EIGEN_DEVICE_FUNC inline double2 pfloor<double2>(const double2& a) {
|
|
457
523
|
return make_double2(floor(a.x), floor(a.y));
|
|
458
524
|
}
|
|
459
525
|
|
|
460
|
-
|
|
461
|
-
|
|
526
|
+
template <>
|
|
527
|
+
EIGEN_DEVICE_FUNC inline float4 pceil<float4>(const float4& a) {
|
|
528
|
+
return make_float4(ceilf(a.x), ceilf(a.y), ceilf(a.z), ceilf(a.w));
|
|
529
|
+
}
|
|
530
|
+
template <>
|
|
531
|
+
EIGEN_DEVICE_FUNC inline double2 pceil<double2>(const double2& a) {
|
|
532
|
+
return make_double2(ceil(a.x), ceil(a.y));
|
|
533
|
+
}
|
|
534
|
+
|
|
535
|
+
template <>
|
|
536
|
+
EIGEN_DEVICE_FUNC inline float4 print<float4>(const float4& a) {
|
|
537
|
+
return make_float4(rintf(a.x), rintf(a.y), rintf(a.z), rintf(a.w));
|
|
538
|
+
}
|
|
539
|
+
template <>
|
|
540
|
+
EIGEN_DEVICE_FUNC inline double2 print<double2>(const double2& a) {
|
|
541
|
+
return make_double2(rint(a.x), rint(a.y));
|
|
542
|
+
}
|
|
543
|
+
|
|
544
|
+
template <>
|
|
545
|
+
EIGEN_DEVICE_FUNC inline float4 ptrunc<float4>(const float4& a) {
|
|
546
|
+
return make_float4(truncf(a.x), truncf(a.y), truncf(a.z), truncf(a.w));
|
|
547
|
+
}
|
|
548
|
+
template <>
|
|
549
|
+
EIGEN_DEVICE_FUNC inline double2 ptrunc<double2>(const double2& a) {
|
|
550
|
+
return make_double2(trunc(a.x), trunc(a.y));
|
|
551
|
+
}
|
|
552
|
+
|
|
553
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<float4, 4>& kernel) {
|
|
462
554
|
float tmp = kernel.packet[0].y;
|
|
463
555
|
kernel.packet[0].y = kernel.packet[1].x;
|
|
464
556
|
kernel.packet[1].x = tmp;
|
|
@@ -484,89 +576,82 @@ ptranspose(PacketBlock<float4,4>& kernel) {
|
|
|
484
576
|
kernel.packet[3].z = tmp;
|
|
485
577
|
}
|
|
486
578
|
|
|
487
|
-
EIGEN_DEVICE_FUNC inline void
|
|
488
|
-
ptranspose(PacketBlock<double2,2>& kernel) {
|
|
579
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<double2, 2>& kernel) {
|
|
489
580
|
double tmp = kernel.packet[0].y;
|
|
490
581
|
kernel.packet[0].y = kernel.packet[1].x;
|
|
491
582
|
kernel.packet[1].x = tmp;
|
|
492
583
|
}
|
|
493
584
|
|
|
494
|
-
#endif
|
|
585
|
+
#endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
|
|
495
586
|
|
|
496
|
-
//
|
|
497
|
-
//
|
|
498
|
-
|
|
587
|
+
// Half-packet functions are not available on the host for CUDA 9.0-9.2, only
|
|
588
|
+
// on device. There is no benefit to using them on the host anyways, since they are
|
|
589
|
+
// emulated.
|
|
590
|
+
#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
|
|
499
591
|
|
|
500
592
|
typedef ulonglong2 Packet4h2;
|
|
501
|
-
template<>
|
|
502
|
-
|
|
593
|
+
template <>
|
|
594
|
+
struct unpacket_traits<Packet4h2> {
|
|
595
|
+
typedef Eigen::half type;
|
|
596
|
+
enum {
|
|
597
|
+
size = 8,
|
|
598
|
+
alignment = Aligned16,
|
|
599
|
+
vectorizable = true,
|
|
600
|
+
masked_load_available = false,
|
|
601
|
+
masked_store_available = false
|
|
602
|
+
};
|
|
603
|
+
typedef Packet4h2 half;
|
|
604
|
+
};
|
|
605
|
+
template <>
|
|
606
|
+
struct is_arithmetic<Packet4h2> {
|
|
607
|
+
enum { value = true };
|
|
608
|
+
};
|
|
503
609
|
|
|
504
|
-
template<>
|
|
505
|
-
|
|
610
|
+
template <>
|
|
611
|
+
struct unpacket_traits<half2> {
|
|
612
|
+
typedef Eigen::half type;
|
|
613
|
+
enum {
|
|
614
|
+
size = 2,
|
|
615
|
+
alignment = Aligned16,
|
|
616
|
+
vectorizable = true,
|
|
617
|
+
masked_load_available = false,
|
|
618
|
+
masked_store_available = false
|
|
619
|
+
};
|
|
620
|
+
typedef half2 half;
|
|
621
|
+
};
|
|
622
|
+
template <>
|
|
623
|
+
struct is_arithmetic<half2> {
|
|
624
|
+
enum { value = true };
|
|
625
|
+
};
|
|
506
626
|
|
|
507
|
-
template<>
|
|
508
|
-
{
|
|
627
|
+
template <>
|
|
628
|
+
struct packet_traits<Eigen::half> : default_packet_traits {
|
|
509
629
|
typedef Packet4h2 type;
|
|
510
630
|
typedef Packet4h2 half;
|
|
511
631
|
enum {
|
|
512
632
|
Vectorizable = 1,
|
|
513
633
|
AlignedOnScalar = 1,
|
|
514
|
-
size=8,
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
HasLog1p = 1
|
|
634
|
+
size = 8,
|
|
635
|
+
HasAdd = 1,
|
|
636
|
+
HasSub = 1,
|
|
637
|
+
HasMul = 1,
|
|
638
|
+
HasDiv = 1,
|
|
639
|
+
HasSqrt = 1,
|
|
640
|
+
HasRsqrt = 1,
|
|
641
|
+
HasExp = 1,
|
|
642
|
+
HasExpm1 = 1,
|
|
643
|
+
HasLog = 1,
|
|
644
|
+
HasLog1p = 1
|
|
526
645
|
};
|
|
527
646
|
};
|
|
528
647
|
|
|
529
|
-
|
|
530
|
-
// This is equivalent to make_half2, which is undocumented and doesn't seem to always exist.
|
|
531
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 combine_half(const __half& a, const __half& b) {
|
|
532
|
-
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
|
533
|
-
return __halves2half2(a, b);
|
|
534
|
-
#else
|
|
535
|
-
// Round-about way since __halves2half2 is a __device__ function.
|
|
536
|
-
return __floats2half2_rn(__half2float(a), __half2float(b));
|
|
537
|
-
#endif
|
|
538
|
-
}
|
|
539
|
-
|
|
540
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_low(const half2& a) {
|
|
541
|
-
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
|
542
|
-
return __low2half(a);
|
|
543
|
-
#else
|
|
544
|
-
return __float2half(__low2float(a));
|
|
545
|
-
#endif
|
|
546
|
-
}
|
|
547
|
-
|
|
548
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_high(const half2& a) {
|
|
549
|
-
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
|
550
|
-
return __high2half(a);
|
|
551
|
-
#else
|
|
552
|
-
return __float2half(__high2float(a));
|
|
553
|
-
#endif
|
|
554
|
-
}
|
|
555
|
-
} // namespace
|
|
556
|
-
|
|
557
|
-
template<>
|
|
648
|
+
template <>
|
|
558
649
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
|
|
559
|
-
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
|
560
650
|
return __half2half2(from);
|
|
561
|
-
#else
|
|
562
|
-
const float f = __half2float(from);
|
|
563
|
-
return __floats2half2_rn(f, f);
|
|
564
|
-
#endif
|
|
565
651
|
}
|
|
566
652
|
|
|
567
653
|
template <>
|
|
568
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
|
|
569
|
-
pset1<Packet4h2>(const Eigen::half& from) {
|
|
654
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pset1<Packet4h2>(const Eigen::half& from) {
|
|
570
655
|
Packet4h2 r;
|
|
571
656
|
half2* p_alias = reinterpret_cast<half2*>(&r);
|
|
572
657
|
p_alias[0] = pset1<half2>(from);
|
|
@@ -576,74 +661,61 @@ pset1<Packet4h2>(const Eigen::half& from) {
|
|
|
576
661
|
return r;
|
|
577
662
|
}
|
|
578
663
|
|
|
579
|
-
// We now need this visible on both host and device.
|
|
580
|
-
// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
|
|
581
664
|
namespace {
|
|
582
665
|
|
|
583
666
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
|
|
584
667
|
return *reinterpret_cast<const half2*>(from);
|
|
585
668
|
}
|
|
586
669
|
|
|
587
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) {
|
|
588
|
-
return combine_half(from[0], from[1]);
|
|
589
|
-
}
|
|
670
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { return __halves2half2(from[0], from[1]); }
|
|
590
671
|
|
|
591
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half*
|
|
592
|
-
return
|
|
672
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) {
|
|
673
|
+
return __halves2half2(from[0], from[0]);
|
|
593
674
|
}
|
|
594
675
|
|
|
595
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,
|
|
596
|
-
const half2& from) {
|
|
676
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) {
|
|
597
677
|
*reinterpret_cast<half2*>(to) = from;
|
|
598
678
|
}
|
|
599
679
|
|
|
600
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to,
|
|
601
|
-
|
|
602
|
-
to[
|
|
603
|
-
to[1] = get_half2_high(from);
|
|
680
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) {
|
|
681
|
+
to[0] = __low2half(from);
|
|
682
|
+
to[1] = __high2half(from);
|
|
604
683
|
}
|
|
605
684
|
|
|
606
|
-
|
|
607
|
-
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(
|
|
608
|
-
const Eigen::half* from) {
|
|
685
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half* from) {
|
|
609
686
|
#if defined(EIGEN_GPU_HAS_LDG)
|
|
610
687
|
// Input is guaranteed to be properly aligned.
|
|
611
688
|
return __ldg(reinterpret_cast<const half2*>(from));
|
|
612
689
|
#else
|
|
613
|
-
return
|
|
690
|
+
return __halves2half2(*(from + 0), *(from + 1));
|
|
614
691
|
#endif
|
|
615
692
|
}
|
|
616
693
|
|
|
617
|
-
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(
|
|
618
|
-
const Eigen::half* from) {
|
|
694
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(const Eigen::half* from) {
|
|
619
695
|
#if defined(EIGEN_GPU_HAS_LDG)
|
|
620
|
-
return __halves2half2(__ldg(from+0), __ldg(from+1));
|
|
696
|
+
return __halves2half2(__ldg(from + 0), __ldg(from + 1));
|
|
621
697
|
#else
|
|
622
|
-
return
|
|
698
|
+
return __halves2half2(*(from + 0), *(from + 1));
|
|
623
699
|
#endif
|
|
624
700
|
}
|
|
625
701
|
|
|
626
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from,
|
|
627
|
-
|
|
628
|
-
return combine_half(from[0*stride], from[1*stride]);
|
|
702
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) {
|
|
703
|
+
return __halves2half2(from[0 * stride], from[1 * stride]);
|
|
629
704
|
}
|
|
630
705
|
|
|
631
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(
|
|
632
|
-
|
|
633
|
-
to[stride*
|
|
634
|
-
to[stride*1] = get_half2_high(from);
|
|
706
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) {
|
|
707
|
+
to[stride * 0] = __low2half(from);
|
|
708
|
+
to[stride * 1] = __high2half(from);
|
|
635
709
|
}
|
|
636
710
|
|
|
637
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) {
|
|
638
|
-
return get_half2_low(a);
|
|
639
|
-
}
|
|
711
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { return __low2half(a); }
|
|
640
712
|
|
|
641
713
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) {
|
|
642
|
-
half a1 =
|
|
643
|
-
half a2 =
|
|
714
|
+
half a1 = __low2half(a);
|
|
715
|
+
half a2 = __high2half(a);
|
|
644
716
|
half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
|
|
645
717
|
half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
|
|
646
|
-
return
|
|
718
|
+
return __halves2half2(result1, result2);
|
|
647
719
|
}
|
|
648
720
|
|
|
649
721
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) {
|
|
@@ -656,14 +728,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) {
|
|
|
656
728
|
return pset1<half2>(false_half);
|
|
657
729
|
}
|
|
658
730
|
|
|
659
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
|
|
660
|
-
|
|
661
|
-
__half
|
|
662
|
-
__half
|
|
663
|
-
__half
|
|
664
|
-
|
|
665
|
-
kernel.packet[
|
|
666
|
-
kernel.packet[1] = combine_half(a2, b2);
|
|
731
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<half2, 2>& kernel) {
|
|
732
|
+
__half a1 = __low2half(kernel.packet[0]);
|
|
733
|
+
__half a2 = __high2half(kernel.packet[0]);
|
|
734
|
+
__half b1 = __low2half(kernel.packet[1]);
|
|
735
|
+
__half b2 = __high2half(kernel.packet[1]);
|
|
736
|
+
kernel.packet[0] = __halves2half2(a1, b1);
|
|
737
|
+
kernel.packet[1] = __halves2half2(a2, b2);
|
|
667
738
|
}
|
|
668
739
|
|
|
669
740
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
|
|
@@ -671,92 +742,95 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
|
|
|
671
742
|
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
|
|
672
743
|
#else
|
|
673
744
|
float f = __half2float(a) + 1.0f;
|
|
674
|
-
return
|
|
745
|
+
return __halves2half2(a, __float2half(f));
|
|
675
746
|
#endif
|
|
676
747
|
}
|
|
677
748
|
|
|
678
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask,
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
half
|
|
682
|
-
half
|
|
683
|
-
|
|
684
|
-
half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a);
|
|
685
|
-
return combine_half(result_low, result_high);
|
|
749
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) {
|
|
750
|
+
half mask_low = __low2half(mask);
|
|
751
|
+
half mask_high = __high2half(mask);
|
|
752
|
+
half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a);
|
|
753
|
+
half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a);
|
|
754
|
+
return __halves2half2(result_low, result_high);
|
|
686
755
|
}
|
|
687
756
|
|
|
688
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a,
|
|
689
|
-
const half2& b) {
|
|
757
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, const half2& b) {
|
|
690
758
|
half true_half = half_impl::raw_uint16_to_half(0xffffu);
|
|
691
759
|
half false_half = half_impl::raw_uint16_to_half(0x0000u);
|
|
692
|
-
half a1 =
|
|
693
|
-
half a2 =
|
|
694
|
-
half b1 =
|
|
695
|
-
half b2 =
|
|
760
|
+
half a1 = __low2half(a);
|
|
761
|
+
half a2 = __high2half(a);
|
|
762
|
+
half b1 = __low2half(b);
|
|
763
|
+
half b2 = __high2half(b);
|
|
696
764
|
half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
|
|
697
765
|
half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
|
|
698
|
-
return
|
|
766
|
+
return __halves2half2(eq1, eq2);
|
|
699
767
|
}
|
|
700
768
|
|
|
701
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a,
|
|
702
|
-
const half2& b) {
|
|
769
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a, const half2& b) {
|
|
703
770
|
half true_half = half_impl::raw_uint16_to_half(0xffffu);
|
|
704
771
|
half false_half = half_impl::raw_uint16_to_half(0x0000u);
|
|
705
|
-
half a1 =
|
|
706
|
-
half a2 =
|
|
707
|
-
half b1 =
|
|
708
|
-
half b2 =
|
|
772
|
+
half a1 = __low2half(a);
|
|
773
|
+
half a2 = __high2half(a);
|
|
774
|
+
half b1 = __low2half(b);
|
|
775
|
+
half b2 = __high2half(b);
|
|
709
776
|
half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
|
|
710
777
|
half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
|
|
711
|
-
return
|
|
778
|
+
return __halves2half2(eq1, eq2);
|
|
712
779
|
}
|
|
713
780
|
|
|
714
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2
|
|
715
|
-
|
|
716
|
-
half
|
|
717
|
-
half
|
|
718
|
-
half
|
|
719
|
-
half
|
|
781
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_le(const half2& a, const half2& b) {
|
|
782
|
+
half true_half = half_impl::raw_uint16_to_half(0xffffu);
|
|
783
|
+
half false_half = half_impl::raw_uint16_to_half(0x0000u);
|
|
784
|
+
half a1 = __low2half(a);
|
|
785
|
+
half a2 = __high2half(a);
|
|
786
|
+
half b1 = __low2half(b);
|
|
787
|
+
half b2 = __high2half(b);
|
|
788
|
+
half eq1 = __half2float(a1) <= __half2float(b1) ? true_half : false_half;
|
|
789
|
+
half eq2 = __half2float(a2) <= __half2float(b2) ? true_half : false_half;
|
|
790
|
+
return __halves2half2(eq1, eq2);
|
|
791
|
+
}
|
|
792
|
+
|
|
793
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, const half2& b) {
|
|
794
|
+
half a1 = __low2half(a);
|
|
795
|
+
half a2 = __high2half(a);
|
|
796
|
+
half b1 = __low2half(b);
|
|
797
|
+
half b2 = __high2half(b);
|
|
720
798
|
half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
|
|
721
799
|
half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
|
|
722
|
-
return
|
|
800
|
+
return __halves2half2(result1, result2);
|
|
723
801
|
}
|
|
724
802
|
|
|
725
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a,
|
|
726
|
-
|
|
727
|
-
half
|
|
728
|
-
half
|
|
729
|
-
half
|
|
730
|
-
half b2 = get_half2_high(b);
|
|
803
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, const half2& b) {
|
|
804
|
+
half a1 = __low2half(a);
|
|
805
|
+
half a2 = __high2half(a);
|
|
806
|
+
half b1 = __low2half(b);
|
|
807
|
+
half b2 = __high2half(b);
|
|
731
808
|
half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
|
|
732
809
|
half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
|
|
733
|
-
return
|
|
810
|
+
return __halves2half2(result1, result2);
|
|
734
811
|
}
|
|
735
812
|
|
|
736
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a,
|
|
737
|
-
|
|
738
|
-
half
|
|
739
|
-
half
|
|
740
|
-
half
|
|
741
|
-
half b2 = get_half2_high(b);
|
|
813
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, const half2& b) {
|
|
814
|
+
half a1 = __low2half(a);
|
|
815
|
+
half a2 = __high2half(a);
|
|
816
|
+
half b1 = __low2half(b);
|
|
817
|
+
half b2 = __high2half(b);
|
|
742
818
|
half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
|
|
743
819
|
half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
|
|
744
|
-
return
|
|
820
|
+
return __halves2half2(result1, result2);
|
|
745
821
|
}
|
|
746
822
|
|
|
747
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a,
|
|
748
|
-
|
|
749
|
-
half
|
|
750
|
-
half
|
|
751
|
-
half
|
|
752
|
-
half b2 = get_half2_high(b);
|
|
823
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2& b) {
|
|
824
|
+
half a1 = __low2half(a);
|
|
825
|
+
half a2 = __high2half(a);
|
|
826
|
+
half b1 = __low2half(b);
|
|
827
|
+
half b2 = __high2half(b);
|
|
753
828
|
half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
|
|
754
829
|
half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
|
|
755
|
-
return
|
|
830
|
+
return __halves2half2(result1, result2);
|
|
756
831
|
}
|
|
757
832
|
|
|
758
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a,
|
|
759
|
-
const half2& b) {
|
|
833
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) {
|
|
760
834
|
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
|
|
761
835
|
return __hadd2(a, b);
|
|
762
836
|
#else
|
|
@@ -770,8 +844,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a,
|
|
|
770
844
|
#endif
|
|
771
845
|
}
|
|
772
846
|
|
|
773
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a,
|
|
774
|
-
const half2& b) {
|
|
847
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) {
|
|
775
848
|
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
|
|
776
849
|
return __hsub2(a, b);
|
|
777
850
|
#else
|
|
@@ -797,8 +870,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
|
|
|
797
870
|
|
|
798
871
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
|
|
799
872
|
|
|
800
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a,
|
|
801
|
-
const half2& b) {
|
|
873
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) {
|
|
802
874
|
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
|
|
803
875
|
return __hmul2(a, b);
|
|
804
876
|
#else
|
|
@@ -812,11 +884,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a,
|
|
|
812
884
|
#endif
|
|
813
885
|
}
|
|
814
886
|
|
|
815
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a,
|
|
816
|
-
const half2& b,
|
|
817
|
-
const half2& c) {
|
|
887
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) {
|
|
818
888
|
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
|
|
819
|
-
|
|
889
|
+
return __hfma2(a, b, c);
|
|
820
890
|
#else
|
|
821
891
|
float a1 = __low2float(a);
|
|
822
892
|
float a2 = __high2float(a);
|
|
@@ -830,8 +900,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a,
|
|
|
830
900
|
#endif
|
|
831
901
|
}
|
|
832
902
|
|
|
833
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a,
|
|
834
|
-
const half2& b) {
|
|
903
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) {
|
|
835
904
|
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
|
|
836
905
|
return __h2div(a, b);
|
|
837
906
|
#else
|
|
@@ -845,26 +914,24 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a,
|
|
|
845
914
|
#endif
|
|
846
915
|
}
|
|
847
916
|
|
|
848
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a,
|
|
849
|
-
const half2& b) {
|
|
917
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) {
|
|
850
918
|
float a1 = __low2float(a);
|
|
851
919
|
float a2 = __high2float(a);
|
|
852
920
|
float b1 = __low2float(b);
|
|
853
921
|
float b2 = __high2float(b);
|
|
854
|
-
__half r1 = a1 < b1 ?
|
|
855
|
-
__half r2 = a2 < b2 ?
|
|
856
|
-
return
|
|
922
|
+
__half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
|
|
923
|
+
__half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
|
|
924
|
+
return __halves2half2(r1, r2);
|
|
857
925
|
}
|
|
858
926
|
|
|
859
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a,
|
|
860
|
-
const half2& b) {
|
|
927
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) {
|
|
861
928
|
float a1 = __low2float(a);
|
|
862
929
|
float a2 = __high2float(a);
|
|
863
930
|
float b1 = __low2float(b);
|
|
864
931
|
float b2 = __high2float(b);
|
|
865
|
-
__half r1 = a1 > b1 ?
|
|
866
|
-
__half r2 = a2 > b2 ?
|
|
867
|
-
return
|
|
932
|
+
__half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
|
|
933
|
+
__half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
|
|
934
|
+
return __halves2half2(r1, r2);
|
|
868
935
|
}
|
|
869
936
|
|
|
870
937
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
|
|
@@ -885,7 +952,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
|
|
|
885
952
|
#else
|
|
886
953
|
float a1 = __low2float(a);
|
|
887
954
|
float a2 = __high2float(a);
|
|
888
|
-
return a1 > a2 ?
|
|
955
|
+
return a1 > a2 ? __low2half(a) : __high2half(a);
|
|
889
956
|
#endif
|
|
890
957
|
}
|
|
891
958
|
|
|
@@ -897,7 +964,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
|
|
|
897
964
|
#else
|
|
898
965
|
float a1 = __low2float(a);
|
|
899
966
|
float a2 = __high2float(a);
|
|
900
|
-
return a1 < a2 ?
|
|
967
|
+
return a1 < a2 ? __low2half(a) : __high2half(a);
|
|
901
968
|
#endif
|
|
902
969
|
}
|
|
903
970
|
|
|
@@ -927,28 +994,15 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) {
|
|
|
927
994
|
return __floats2half2_rn(r1, r2);
|
|
928
995
|
}
|
|
929
996
|
|
|
930
|
-
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) ||
|
|
931
|
-
defined(EIGEN_HIP_DEVICE_COMPILE)
|
|
997
|
+
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || defined(EIGEN_HIP_DEVICE_COMPILE)
|
|
932
998
|
|
|
933
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
|
934
|
-
half2 plog(const half2& a) {
|
|
935
|
-
return h2log(a);
|
|
936
|
-
}
|
|
999
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { return h2log(a); }
|
|
937
1000
|
|
|
938
|
-
|
|
939
|
-
half2 pexp(const half2& a) {
|
|
940
|
-
return h2exp(a);
|
|
941
|
-
}
|
|
1001
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(a); }
|
|
942
1002
|
|
|
943
|
-
|
|
944
|
-
half2 psqrt(const half2& a) {
|
|
945
|
-
return h2sqrt(a);
|
|
946
|
-
}
|
|
1003
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return h2sqrt(a); }
|
|
947
1004
|
|
|
948
|
-
|
|
949
|
-
half2 prsqrt(const half2& a) {
|
|
950
|
-
return h2rsqrt(a);
|
|
951
|
-
}
|
|
1005
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { return h2rsqrt(a); }
|
|
952
1006
|
|
|
953
1007
|
#else
|
|
954
1008
|
|
|
@@ -984,18 +1038,16 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) {
|
|
|
984
1038
|
return __floats2half2_rn(r1, r2);
|
|
985
1039
|
}
|
|
986
1040
|
#endif
|
|
987
|
-
}
|
|
1041
|
+
} // namespace
|
|
988
1042
|
|
|
989
1043
|
template <>
|
|
990
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
|
|
991
|
-
pload<Packet4h2>(const Eigen::half* from) {
|
|
1044
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pload<Packet4h2>(const Eigen::half* from) {
|
|
992
1045
|
return *reinterpret_cast<const Packet4h2*>(from);
|
|
993
1046
|
}
|
|
994
1047
|
|
|
995
1048
|
// unaligned load;
|
|
996
1049
|
template <>
|
|
997
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
|
|
998
|
-
ploadu<Packet4h2>(const Eigen::half* from) {
|
|
1050
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ploadu<Packet4h2>(const Eigen::half* from) {
|
|
999
1051
|
Packet4h2 r;
|
|
1000
1052
|
half2* p_alias = reinterpret_cast<half2*>(&r);
|
|
1001
1053
|
p_alias[0] = ploadu(from + 0);
|
|
@@ -1006,8 +1058,7 @@ ploadu<Packet4h2>(const Eigen::half* from) {
|
|
|
1006
1058
|
}
|
|
1007
1059
|
|
|
1008
1060
|
template <>
|
|
1009
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
|
|
1010
|
-
ploaddup<Packet4h2>(const Eigen::half* from) {
|
|
1061
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ploaddup<Packet4h2>(const Eigen::half* from) {
|
|
1011
1062
|
Packet4h2 r;
|
|
1012
1063
|
half2* p_alias = reinterpret_cast<half2*>(&r);
|
|
1013
1064
|
p_alias[0] = ploaddup(from + 0);
|
|
@@ -1018,24 +1069,21 @@ ploaddup<Packet4h2>(const Eigen::half* from) {
|
|
|
1018
1069
|
}
|
|
1019
1070
|
|
|
1020
1071
|
template <>
|
|
1021
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(
|
|
1022
|
-
Eigen::half* to, const Packet4h2& from) {
|
|
1072
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h2& from) {
|
|
1023
1073
|
*reinterpret_cast<Packet4h2*>(to) = from;
|
|
1024
1074
|
}
|
|
1025
1075
|
|
|
1026
1076
|
template <>
|
|
1027
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(
|
|
1028
|
-
Eigen::half* to, const Packet4h2& from) {
|
|
1077
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h2& from) {
|
|
1029
1078
|
const half2* from_alias = reinterpret_cast<const half2*>(&from);
|
|
1030
|
-
pstoreu(to + 0,from_alias[0]);
|
|
1031
|
-
pstoreu(to + 2,from_alias[1]);
|
|
1032
|
-
pstoreu(to + 4,from_alias[2]);
|
|
1033
|
-
pstoreu(to + 6,from_alias[3]);
|
|
1079
|
+
pstoreu(to + 0, from_alias[0]);
|
|
1080
|
+
pstoreu(to + 2, from_alias[1]);
|
|
1081
|
+
pstoreu(to + 4, from_alias[2]);
|
|
1082
|
+
pstoreu(to + 6, from_alias[3]);
|
|
1034
1083
|
}
|
|
1035
1084
|
|
|
1036
1085
|
template <>
|
|
1037
|
-
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
|
|
1038
|
-
ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
|
|
1086
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
|
|
1039
1087
|
#if defined(EIGEN_GPU_HAS_LDG)
|
|
1040
1088
|
Packet4h2 r;
|
|
1041
1089
|
r = __ldg(reinterpret_cast<const Packet4h2*>(from));
|
|
@@ -1052,8 +1100,7 @@ ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
|
|
|
1052
1100
|
}
|
|
1053
1101
|
|
|
1054
1102
|
template <>
|
|
1055
|
-
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
|
|
1056
|
-
ploadt_ro<Packet4h2, Unaligned>(const Eigen::half* from) {
|
|
1103
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Unaligned>(const Eigen::half* from) {
|
|
1057
1104
|
Packet4h2 r;
|
|
1058
1105
|
half2* r_alias = reinterpret_cast<half2*>(&r);
|
|
1059
1106
|
r_alias[0] = ploadt_ro_unaligned(from + 0);
|
|
@@ -1064,20 +1111,19 @@ ploadt_ro<Packet4h2, Unaligned>(const Eigen::half* from) {
|
|
|
1064
1111
|
}
|
|
1065
1112
|
|
|
1066
1113
|
template <>
|
|
1067
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
|
|
1068
|
-
pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
|
|
1114
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
|
|
1069
1115
|
Packet4h2 r;
|
|
1070
1116
|
half2* p_alias = reinterpret_cast<half2*>(&r);
|
|
1071
|
-
p_alias[0] =
|
|
1072
|
-
p_alias[1] =
|
|
1073
|
-
p_alias[2] =
|
|
1074
|
-
p_alias[3] =
|
|
1117
|
+
p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]);
|
|
1118
|
+
p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]);
|
|
1119
|
+
p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]);
|
|
1120
|
+
p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]);
|
|
1075
1121
|
return r;
|
|
1076
1122
|
}
|
|
1077
1123
|
|
|
1078
1124
|
template <>
|
|
1079
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h2>(
|
|
1080
|
-
|
|
1125
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h2>(Eigen::half* to, const Packet4h2& from,
|
|
1126
|
+
Index stride) {
|
|
1081
1127
|
const half2* from_alias = reinterpret_cast<const half2*>(&from);
|
|
1082
1128
|
pscatter(to + stride * 0, from_alias[0], stride);
|
|
1083
1129
|
pscatter(to + stride * 2, from_alias[1], stride);
|
|
@@ -1086,14 +1132,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h2>(
|
|
|
1086
1132
|
}
|
|
1087
1133
|
|
|
1088
1134
|
template <>
|
|
1089
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h2>(
|
|
1090
|
-
const Packet4h2& a) {
|
|
1135
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h2>(const Packet4h2& a) {
|
|
1091
1136
|
return pfirst(*(reinterpret_cast<const half2*>(&a)));
|
|
1092
1137
|
}
|
|
1093
1138
|
|
|
1094
1139
|
template <>
|
|
1095
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(
|
|
1096
|
-
const Packet4h2& a) {
|
|
1140
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(const Packet4h2& a) {
|
|
1097
1141
|
Packet4h2 r;
|
|
1098
1142
|
half2* p_alias = reinterpret_cast<half2*>(&r);
|
|
1099
1143
|
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
|
@@ -1105,8 +1149,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(
|
|
|
1105
1149
|
}
|
|
1106
1150
|
|
|
1107
1151
|
template <>
|
|
1108
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>(
|
|
1109
|
-
const Packet4h2& /*a*/) {
|
|
1152
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>(const Packet4h2& /*a*/) {
|
|
1110
1153
|
half true_half = half_impl::raw_uint16_to_half(0xffffu);
|
|
1111
1154
|
return pset1<Packet4h2>(true_half);
|
|
1112
1155
|
}
|
|
@@ -1117,9 +1160,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero<Packet4h2>(const Packet4h2
|
|
|
1117
1160
|
return pset1<Packet4h2>(false_half);
|
|
1118
1161
|
}
|
|
1119
1162
|
|
|
1120
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double(
|
|
1121
|
-
|
|
1122
|
-
|
|
1163
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double(double* d_row0, double* d_row1, double* d_row2,
|
|
1164
|
+
double* d_row3, double* d_row4, double* d_row5,
|
|
1165
|
+
double* d_row6, double* d_row7) {
|
|
1123
1166
|
double d_tmp;
|
|
1124
1167
|
d_tmp = d_row0[1];
|
|
1125
1168
|
d_row0[1] = d_row4[0];
|
|
@@ -1138,8 +1181,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double(
|
|
|
1138
1181
|
d_row7[0] = d_tmp;
|
|
1139
1182
|
}
|
|
1140
1183
|
|
|
1141
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(
|
|
1142
|
-
|
|
1184
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(half2* f_row0, half2* f_row1, half2* f_row2,
|
|
1185
|
+
half2* f_row3) {
|
|
1143
1186
|
half2 f_tmp;
|
|
1144
1187
|
f_tmp = f_row0[1];
|
|
1145
1188
|
f_row0[1] = f_row2[0];
|
|
@@ -1150,18 +1193,16 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(
|
|
|
1150
1193
|
f_row3[0] = f_tmp;
|
|
1151
1194
|
}
|
|
1152
1195
|
|
|
1153
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
|
|
1154
|
-
|
|
1155
|
-
__half
|
|
1156
|
-
__half
|
|
1157
|
-
__half
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
f1 = combine_half(a2, b2);
|
|
1196
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half(half2& f0, half2& f1) {
|
|
1197
|
+
__half a1 = __low2half(f0);
|
|
1198
|
+
__half a2 = __high2half(f0);
|
|
1199
|
+
__half b1 = __low2half(f1);
|
|
1200
|
+
__half b2 = __high2half(f1);
|
|
1201
|
+
f0 = __halves2half2(a1, b1);
|
|
1202
|
+
f1 = __halves2half2(a2, b2);
|
|
1161
1203
|
}
|
|
1162
1204
|
|
|
1163
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
|
|
1164
|
-
ptranspose(PacketBlock<Packet4h2,8>& kernel) {
|
|
1205
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4h2, 8>& kernel) {
|
|
1165
1206
|
double* d_row0 = reinterpret_cast<double*>(&kernel.packet[0]);
|
|
1166
1207
|
double* d_row1 = reinterpret_cast<double*>(&kernel.packet[1]);
|
|
1167
1208
|
double* d_row2 = reinterpret_cast<double*>(&kernel.packet[2]);
|
|
@@ -1170,9 +1211,7 @@ ptranspose(PacketBlock<Packet4h2,8>& kernel) {
|
|
|
1170
1211
|
double* d_row5 = reinterpret_cast<double*>(&kernel.packet[5]);
|
|
1171
1212
|
double* d_row6 = reinterpret_cast<double*>(&kernel.packet[6]);
|
|
1172
1213
|
double* d_row7 = reinterpret_cast<double*>(&kernel.packet[7]);
|
|
1173
|
-
ptranspose_double(d_row0, d_row1, d_row2, d_row3,
|
|
1174
|
-
d_row4, d_row5, d_row6, d_row7);
|
|
1175
|
-
|
|
1214
|
+
ptranspose_double(d_row0, d_row1, d_row2, d_row3, d_row4, d_row5, d_row6, d_row7);
|
|
1176
1215
|
|
|
1177
1216
|
half2* f_row0 = reinterpret_cast<half2*>(d_row0);
|
|
1178
1217
|
half2* f_row1 = reinterpret_cast<half2*>(d_row1);
|
|
@@ -1213,23 +1252,18 @@ ptranspose(PacketBlock<Packet4h2,8>& kernel) {
|
|
|
1213
1252
|
ptranspose_half(f_row0[1], f_row1[1]);
|
|
1214
1253
|
ptranspose_half(f_row2[0], f_row3[0]);
|
|
1215
1254
|
ptranspose_half(f_row2[1], f_row3[1]);
|
|
1216
|
-
|
|
1217
1255
|
}
|
|
1218
1256
|
|
|
1219
1257
|
template <>
|
|
1220
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
|
|
1221
|
-
plset<Packet4h2>(const Eigen::half& a) {
|
|
1258
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(const Eigen::half& a) {
|
|
1222
1259
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
|
1223
1260
|
|
|
1224
1261
|
Packet4h2 r;
|
|
1225
1262
|
half2* p_alias = reinterpret_cast<half2*>(&r);
|
|
1226
1263
|
p_alias[0] = __halves2half2(a, __hadd(a, __float2half(1.0f)));
|
|
1227
|
-
p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)),
|
|
1228
|
-
|
|
1229
|
-
p_alias[
|
|
1230
|
-
__hadd(a, __float2half(5.0f)));
|
|
1231
|
-
p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)),
|
|
1232
|
-
__hadd(a, __float2half(7.0f)));
|
|
1264
|
+
p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)), __hadd(a, __float2half(3.0f)));
|
|
1265
|
+
p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)), __hadd(a, __float2half(5.0f)));
|
|
1266
|
+
p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)), __hadd(a, __float2half(7.0f)));
|
|
1233
1267
|
return r;
|
|
1234
1268
|
#elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
|
|
1235
1269
|
Packet4h2 r;
|
|
@@ -1237,8 +1271,8 @@ plset<Packet4h2>(const Eigen::half& a) {
|
|
|
1237
1271
|
|
|
1238
1272
|
half2 b = pset1<half2>(a);
|
|
1239
1273
|
half2 c;
|
|
1240
|
-
half2 half_offset0 = __halves2half2(__float2half(0.0f),__float2half(2.0f));
|
|
1241
|
-
half2 half_offset1 = __halves2half2(__float2half(4.0f),__float2half(6.0f));
|
|
1274
|
+
half2 half_offset0 = __halves2half2(__float2half(0.0f), __float2half(2.0f));
|
|
1275
|
+
half2 half_offset1 = __halves2half2(__float2half(4.0f), __float2half(6.0f));
|
|
1242
1276
|
|
|
1243
1277
|
c = __hadd2(b, half_offset0);
|
|
1244
1278
|
r_alias[0] = plset(__low2half(c));
|
|
@@ -1254,18 +1288,17 @@ plset<Packet4h2>(const Eigen::half& a) {
|
|
|
1254
1288
|
float f = __half2float(a);
|
|
1255
1289
|
Packet4h2 r;
|
|
1256
1290
|
half2* p_alias = reinterpret_cast<half2*>(&r);
|
|
1257
|
-
p_alias[0] =
|
|
1258
|
-
p_alias[1] =
|
|
1259
|
-
p_alias[2] =
|
|
1260
|
-
p_alias[3] =
|
|
1291
|
+
p_alias[0] = __halves2half2(a, __float2half(f + 1.0f));
|
|
1292
|
+
p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
|
|
1293
|
+
p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
|
|
1294
|
+
p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
|
|
1261
1295
|
return r;
|
|
1262
1296
|
#endif
|
|
1263
1297
|
}
|
|
1264
1298
|
|
|
1265
1299
|
template <>
|
|
1266
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
|
|
1267
|
-
|
|
1268
|
-
const Packet4h2& b) {
|
|
1300
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pselect<Packet4h2>(const Packet4h2& mask, const Packet4h2& a,
|
|
1301
|
+
const Packet4h2& b) {
|
|
1269
1302
|
Packet4h2 r;
|
|
1270
1303
|
half2* r_alias = reinterpret_cast<half2*>(&r);
|
|
1271
1304
|
const half2* mask_alias = reinterpret_cast<const half2*>(&mask);
|
|
@@ -1279,8 +1312,7 @@ pselect<Packet4h2>(const Packet4h2& mask, const Packet4h2& a,
|
|
|
1279
1312
|
}
|
|
1280
1313
|
|
|
1281
1314
|
template <>
|
|
1282
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
|
|
1283
|
-
pcmp_eq<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
|
|
1315
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_eq<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
|
|
1284
1316
|
Packet4h2 r;
|
|
1285
1317
|
half2* r_alias = reinterpret_cast<half2*>(&r);
|
|
1286
1318
|
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
|
@@ -1293,8 +1325,33 @@ pcmp_eq<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
|
|
|
1293
1325
|
}
|
|
1294
1326
|
|
|
1295
1327
|
template <>
|
|
1296
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
|
|
1297
|
-
|
|
1328
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_lt<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
|
|
1329
|
+
Packet4h2 r;
|
|
1330
|
+
half2* r_alias = reinterpret_cast<half2*>(&r);
|
|
1331
|
+
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
|
1332
|
+
const half2* b_alias = reinterpret_cast<const half2*>(&b);
|
|
1333
|
+
r_alias[0] = pcmp_lt(a_alias[0], b_alias[0]);
|
|
1334
|
+
r_alias[1] = pcmp_lt(a_alias[1], b_alias[1]);
|
|
1335
|
+
r_alias[2] = pcmp_lt(a_alias[2], b_alias[2]);
|
|
1336
|
+
r_alias[3] = pcmp_lt(a_alias[3], b_alias[3]);
|
|
1337
|
+
return r;
|
|
1338
|
+
}
|
|
1339
|
+
|
|
1340
|
+
template <>
|
|
1341
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_le<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
|
|
1342
|
+
Packet4h2 r;
|
|
1343
|
+
half2* r_alias = reinterpret_cast<half2*>(&r);
|
|
1344
|
+
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
|
1345
|
+
const half2* b_alias = reinterpret_cast<const half2*>(&b);
|
|
1346
|
+
r_alias[0] = pcmp_le(a_alias[0], b_alias[0]);
|
|
1347
|
+
r_alias[1] = pcmp_le(a_alias[1], b_alias[1]);
|
|
1348
|
+
r_alias[2] = pcmp_le(a_alias[2], b_alias[2]);
|
|
1349
|
+
r_alias[3] = pcmp_le(a_alias[3], b_alias[3]);
|
|
1350
|
+
return r;
|
|
1351
|
+
}
|
|
1352
|
+
|
|
1353
|
+
template <>
|
|
1354
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
|
|
1298
1355
|
Packet4h2 r;
|
|
1299
1356
|
half2* r_alias = reinterpret_cast<half2*>(&r);
|
|
1300
1357
|
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
|
@@ -1307,8 +1364,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand<Packet4h2>(
|
|
|
1307
1364
|
}
|
|
1308
1365
|
|
|
1309
1366
|
template <>
|
|
1310
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>(
|
|
1311
|
-
const Packet4h2& a, const Packet4h2& b) {
|
|
1367
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
|
|
1312
1368
|
Packet4h2 r;
|
|
1313
1369
|
half2* r_alias = reinterpret_cast<half2*>(&r);
|
|
1314
1370
|
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
|
@@ -1321,8 +1377,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>(
|
|
|
1321
1377
|
}
|
|
1322
1378
|
|
|
1323
1379
|
template <>
|
|
1324
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>(
|
|
1325
|
-
const Packet4h2& a, const Packet4h2& b) {
|
|
1380
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
|
|
1326
1381
|
Packet4h2 r;
|
|
1327
1382
|
half2* r_alias = reinterpret_cast<half2*>(&r);
|
|
1328
1383
|
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
|
@@ -1335,8 +1390,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>(
|
|
|
1335
1390
|
}
|
|
1336
1391
|
|
|
1337
1392
|
template <>
|
|
1338
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
|
|
1339
|
-
pandnot<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
|
|
1393
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pandnot<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
|
|
1340
1394
|
Packet4h2 r;
|
|
1341
1395
|
half2* r_alias = reinterpret_cast<half2*>(&r);
|
|
1342
1396
|
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
|
@@ -1349,8 +1403,7 @@ pandnot<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
|
|
|
1349
1403
|
}
|
|
1350
1404
|
|
|
1351
1405
|
template <>
|
|
1352
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>(
|
|
1353
|
-
const Packet4h2& a, const Packet4h2& b) {
|
|
1406
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
|
|
1354
1407
|
Packet4h2 r;
|
|
1355
1408
|
half2* r_alias = reinterpret_cast<half2*>(&r);
|
|
1356
1409
|
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
|
@@ -1363,8 +1416,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>(
|
|
|
1363
1416
|
}
|
|
1364
1417
|
|
|
1365
1418
|
template <>
|
|
1366
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub<Packet4h2>(
|
|
1367
|
-
const Packet4h2& a, const Packet4h2& b) {
|
|
1419
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
|
|
1368
1420
|
Packet4h2 r;
|
|
1369
1421
|
half2* r_alias = reinterpret_cast<half2*>(&r);
|
|
1370
1422
|
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
|
@@ -1394,8 +1446,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pconj(const Packet4h2& a) {
|
|
|
1394
1446
|
}
|
|
1395
1447
|
|
|
1396
1448
|
template <>
|
|
1397
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>(
|
|
1398
|
-
const Packet4h2& a, const Packet4h2& b) {
|
|
1449
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
|
|
1399
1450
|
Packet4h2 r;
|
|
1400
1451
|
half2* r_alias = reinterpret_cast<half2*>(&r);
|
|
1401
1452
|
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
|
@@ -1408,8 +1459,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>(
|
|
|
1408
1459
|
}
|
|
1409
1460
|
|
|
1410
1461
|
template <>
|
|
1411
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>(
|
|
1412
|
-
|
|
1462
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>(const Packet4h2& a, const Packet4h2& b,
|
|
1463
|
+
const Packet4h2& c) {
|
|
1413
1464
|
Packet4h2 r;
|
|
1414
1465
|
half2* r_alias = reinterpret_cast<half2*>(&r);
|
|
1415
1466
|
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
|
@@ -1423,8 +1474,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>(
|
|
|
1423
1474
|
}
|
|
1424
1475
|
|
|
1425
1476
|
template <>
|
|
1426
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>(
|
|
1427
|
-
const Packet4h2& a, const Packet4h2& b) {
|
|
1477
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
|
|
1428
1478
|
Packet4h2 r;
|
|
1429
1479
|
half2* r_alias = reinterpret_cast<half2*>(&r);
|
|
1430
1480
|
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
|
@@ -1437,8 +1487,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>(
|
|
|
1437
1487
|
}
|
|
1438
1488
|
|
|
1439
1489
|
template <>
|
|
1440
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>(
|
|
1441
|
-
const Packet4h2& a, const Packet4h2& b) {
|
|
1490
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
|
|
1442
1491
|
Packet4h2 r;
|
|
1443
1492
|
half2* r_alias = reinterpret_cast<half2*>(&r);
|
|
1444
1493
|
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
|
@@ -1451,8 +1500,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>(
|
|
|
1451
1500
|
}
|
|
1452
1501
|
|
|
1453
1502
|
template <>
|
|
1454
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>(
|
|
1455
|
-
const Packet4h2& a, const Packet4h2& b) {
|
|
1503
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
|
|
1456
1504
|
Packet4h2 r;
|
|
1457
1505
|
half2* r_alias = reinterpret_cast<half2*>(&r);
|
|
1458
1506
|
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
|
@@ -1465,64 +1513,53 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>(
|
|
|
1465
1513
|
}
|
|
1466
1514
|
|
|
1467
1515
|
template <>
|
|
1468
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<Packet4h2>(
|
|
1469
|
-
const Packet4h2& a) {
|
|
1516
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<Packet4h2>(const Packet4h2& a) {
|
|
1470
1517
|
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
|
1471
1518
|
|
|
1472
|
-
return predux(a_alias[0]) + predux(a_alias[1]) +
|
|
1473
|
-
predux(a_alias[2]) + predux(a_alias[3]);
|
|
1519
|
+
return predux(a_alias[0]) + predux(a_alias[1]) + predux(a_alias[2]) + predux(a_alias[3]);
|
|
1474
1520
|
}
|
|
1475
1521
|
|
|
1476
1522
|
template <>
|
|
1477
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(
|
|
1478
|
-
const Packet4h2& a) {
|
|
1523
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(const Packet4h2& a) {
|
|
1479
1524
|
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
|
1480
|
-
half2 m0 =
|
|
1481
|
-
|
|
1482
|
-
|
|
1483
|
-
predux_max(a_alias[3]));
|
|
1484
|
-
__half first = predux_max(m0);
|
|
1525
|
+
half2 m0 = __halves2half2(predux_max(a_alias[0]), predux_max(a_alias[1]));
|
|
1526
|
+
half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3]));
|
|
1527
|
+
__half first = predux_max(m0);
|
|
1485
1528
|
__half second = predux_max(m1);
|
|
1486
1529
|
#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
|
|
1487
1530
|
return (__hgt(first, second) ? first : second);
|
|
1488
1531
|
#else
|
|
1489
|
-
float ffirst
|
|
1532
|
+
float ffirst = __half2float(first);
|
|
1490
1533
|
float fsecond = __half2float(second);
|
|
1491
|
-
return (ffirst > fsecond)? first: second;
|
|
1534
|
+
return (ffirst > fsecond) ? first : second;
|
|
1492
1535
|
#endif
|
|
1493
1536
|
}
|
|
1494
1537
|
|
|
1495
1538
|
template <>
|
|
1496
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(
|
|
1497
|
-
const Packet4h2& a) {
|
|
1539
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(const Packet4h2& a) {
|
|
1498
1540
|
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
|
1499
|
-
half2 m0 =
|
|
1500
|
-
|
|
1501
|
-
|
|
1502
|
-
predux_min(a_alias[3]));
|
|
1503
|
-
__half first = predux_min(m0);
|
|
1541
|
+
half2 m0 = __halves2half2(predux_min(a_alias[0]), predux_min(a_alias[1]));
|
|
1542
|
+
half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3]));
|
|
1543
|
+
__half first = predux_min(m0);
|
|
1504
1544
|
__half second = predux_min(m1);
|
|
1505
1545
|
#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
|
|
1506
1546
|
return (__hlt(first, second) ? first : second);
|
|
1507
1547
|
#else
|
|
1508
|
-
float ffirst
|
|
1548
|
+
float ffirst = __half2float(first);
|
|
1509
1549
|
float fsecond = __half2float(second);
|
|
1510
|
-
return (ffirst < fsecond)? first: second;
|
|
1550
|
+
return (ffirst < fsecond) ? first : second;
|
|
1511
1551
|
#endif
|
|
1512
1552
|
}
|
|
1513
1553
|
|
|
1514
1554
|
// likely overflow/underflow
|
|
1515
1555
|
template <>
|
|
1516
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4h2>(
|
|
1517
|
-
const Packet4h2& a) {
|
|
1556
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4h2>(const Packet4h2& a) {
|
|
1518
1557
|
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
|
1519
|
-
return predux_mul(pmul(pmul(a_alias[0], a_alias[1]),
|
|
1520
|
-
pmul(a_alias[2], a_alias[3])));
|
|
1558
|
+
return predux_mul(pmul(pmul(a_alias[0], a_alias[1]), pmul(a_alias[2], a_alias[3])));
|
|
1521
1559
|
}
|
|
1522
1560
|
|
|
1523
1561
|
template <>
|
|
1524
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
|
|
1525
|
-
plog1p<Packet4h2>(const Packet4h2& a) {
|
|
1562
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog1p<Packet4h2>(const Packet4h2& a) {
|
|
1526
1563
|
Packet4h2 r;
|
|
1527
1564
|
half2* r_alias = reinterpret_cast<half2*>(&r);
|
|
1528
1565
|
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
|
@@ -1534,8 +1571,7 @@ plog1p<Packet4h2>(const Packet4h2& a) {
|
|
|
1534
1571
|
}
|
|
1535
1572
|
|
|
1536
1573
|
template <>
|
|
1537
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
|
|
1538
|
-
pexpm1<Packet4h2>(const Packet4h2& a) {
|
|
1574
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexpm1<Packet4h2>(const Packet4h2& a) {
|
|
1539
1575
|
Packet4h2 r;
|
|
1540
1576
|
half2* r_alias = reinterpret_cast<half2*>(&r);
|
|
1541
1577
|
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
|
@@ -1583,8 +1619,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psqrt<Packet4h2>(const Packet4h2
|
|
|
1583
1619
|
}
|
|
1584
1620
|
|
|
1585
1621
|
template <>
|
|
1586
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
|
|
1587
|
-
prsqrt<Packet4h2>(const Packet4h2& a) {
|
|
1622
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 prsqrt<Packet4h2>(const Packet4h2& a) {
|
|
1588
1623
|
Packet4h2 r;
|
|
1589
1624
|
half2* r_alias = reinterpret_cast<half2*>(&r);
|
|
1590
1625
|
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
|
@@ -1597,9 +1632,8 @@ prsqrt<Packet4h2>(const Packet4h2& a) {
|
|
|
1597
1632
|
|
|
1598
1633
|
// The following specialized padd, pmul, pdiv, pmin, pmax, pset1 are needed for
|
|
1599
1634
|
// the implementation of GPU half reduction.
|
|
1600
|
-
template<>
|
|
1601
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a,
|
|
1602
|
-
const half2& b) {
|
|
1635
|
+
template <>
|
|
1636
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
|
|
1603
1637
|
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
|
|
1604
1638
|
return __hadd2(a, b);
|
|
1605
1639
|
#else
|
|
@@ -1613,9 +1647,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a,
|
|
|
1613
1647
|
#endif
|
|
1614
1648
|
}
|
|
1615
1649
|
|
|
1616
|
-
template<>
|
|
1617
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a,
|
|
1618
|
-
const half2& b) {
|
|
1650
|
+
template <>
|
|
1651
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
|
|
1619
1652
|
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
|
|
1620
1653
|
return __hmul2(a, b);
|
|
1621
1654
|
#else
|
|
@@ -1629,9 +1662,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a,
|
|
|
1629
1662
|
#endif
|
|
1630
1663
|
}
|
|
1631
1664
|
|
|
1632
|
-
template<>
|
|
1633
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a,
|
|
1634
|
-
const half2& b) {
|
|
1665
|
+
template <>
|
|
1666
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
|
|
1635
1667
|
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
|
|
1636
1668
|
return __h2div(a, b);
|
|
1637
1669
|
#else
|
|
@@ -1645,41 +1677,36 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a,
|
|
|
1645
1677
|
#endif
|
|
1646
1678
|
}
|
|
1647
1679
|
|
|
1648
|
-
template<>
|
|
1649
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a,
|
|
1650
|
-
const half2& b) {
|
|
1680
|
+
template <>
|
|
1681
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
|
|
1651
1682
|
float a1 = __low2float(a);
|
|
1652
1683
|
float a2 = __high2float(a);
|
|
1653
1684
|
float b1 = __low2float(b);
|
|
1654
1685
|
float b2 = __high2float(b);
|
|
1655
|
-
__half r1 = a1 < b1 ?
|
|
1656
|
-
__half r2 = a2 < b2 ?
|
|
1657
|
-
return
|
|
1686
|
+
__half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
|
|
1687
|
+
__half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
|
|
1688
|
+
return __halves2half2(r1, r2);
|
|
1658
1689
|
}
|
|
1659
1690
|
|
|
1660
|
-
template<>
|
|
1661
|
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a,
|
|
1662
|
-
const half2& b) {
|
|
1691
|
+
template <>
|
|
1692
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
|
|
1663
1693
|
float a1 = __low2float(a);
|
|
1664
1694
|
float a2 = __high2float(a);
|
|
1665
1695
|
float b1 = __low2float(b);
|
|
1666
1696
|
float b2 = __high2float(b);
|
|
1667
|
-
__half r1 = a1 > b1 ?
|
|
1668
|
-
__half r2 = a2 > b2 ?
|
|
1669
|
-
return
|
|
1697
|
+
__half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
|
|
1698
|
+
__half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
|
|
1699
|
+
return __halves2half2(r1, r2);
|
|
1670
1700
|
}
|
|
1671
1701
|
|
|
1672
|
-
|
|
1673
|
-
|
|
1674
|
-
#endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
|
|
1702
|
+
#endif // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
|
|
1675
1703
|
|
|
1676
1704
|
#undef EIGEN_GPU_HAS_LDG
|
|
1677
1705
|
#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
|
|
1678
1706
|
#undef EIGEN_GPU_HAS_FP16_ARITHMETIC
|
|
1679
1707
|
|
|
1680
|
-
}
|
|
1681
|
-
|
|
1682
|
-
} // end namespace Eigen
|
|
1708
|
+
} // end namespace internal
|
|
1683
1709
|
|
|
1710
|
+
} // end namespace Eigen
|
|
1684
1711
|
|
|
1685
|
-
#endif
|
|
1712
|
+
#endif // EIGEN_PACKET_MATH_GPU_H
|