@smake/eigen 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/eigen/Eigen/AccelerateSupport +52 -0
- package/eigen/Eigen/Cholesky +18 -20
- package/eigen/Eigen/CholmodSupport +28 -28
- package/eigen/Eigen/Core +187 -120
- package/eigen/Eigen/Eigenvalues +16 -13
- package/eigen/Eigen/Geometry +18 -18
- package/eigen/Eigen/Householder +9 -7
- package/eigen/Eigen/IterativeLinearSolvers +8 -4
- package/eigen/Eigen/Jacobi +14 -13
- package/eigen/Eigen/KLUSupport +23 -21
- package/eigen/Eigen/LU +15 -16
- package/eigen/Eigen/MetisSupport +12 -12
- package/eigen/Eigen/OrderingMethods +54 -51
- package/eigen/Eigen/PaStiXSupport +23 -21
- package/eigen/Eigen/PardisoSupport +17 -14
- package/eigen/Eigen/QR +18 -20
- package/eigen/Eigen/QtAlignedMalloc +5 -12
- package/eigen/Eigen/SPQRSupport +21 -14
- package/eigen/Eigen/SVD +23 -17
- package/eigen/Eigen/Sparse +1 -2
- package/eigen/Eigen/SparseCholesky +18 -15
- package/eigen/Eigen/SparseCore +18 -17
- package/eigen/Eigen/SparseLU +9 -9
- package/eigen/Eigen/SparseQR +16 -14
- package/eigen/Eigen/StdDeque +5 -2
- package/eigen/Eigen/StdList +5 -2
- package/eigen/Eigen/StdVector +5 -2
- package/eigen/Eigen/SuperLUSupport +30 -24
- package/eigen/Eigen/ThreadPool +80 -0
- package/eigen/Eigen/UmfPackSupport +19 -17
- package/eigen/Eigen/Version +14 -0
- package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
- package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/LDLT.h +366 -405
- package/eigen/Eigen/src/Cholesky/LLT.h +323 -367
- package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
- package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +585 -529
- package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/ArithmeticSequence.h +143 -317
- package/eigen/Eigen/src/Core/Array.h +329 -370
- package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
- package/eigen/Eigen/src/Core/ArrayWrapper.h +126 -170
- package/eigen/Eigen/src/Core/Assign.h +30 -40
- package/eigen/Eigen/src/Core/AssignEvaluator.h +651 -604
- package/eigen/Eigen/src/Core/Assign_MKL.h +125 -120
- package/eigen/Eigen/src/Core/BandMatrix.h +267 -282
- package/eigen/Eigen/src/Core/Block.h +371 -390
- package/eigen/Eigen/src/Core/CommaInitializer.h +85 -100
- package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
- package/eigen/Eigen/src/Core/CoreEvaluators.h +1214 -937
- package/eigen/Eigen/src/Core/CoreIterators.h +72 -63
- package/eigen/Eigen/src/Core/CwiseBinaryOp.h +112 -129
- package/eigen/Eigen/src/Core/CwiseNullaryOp.h +676 -702
- package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
- package/eigen/Eigen/src/Core/CwiseUnaryOp.h +55 -67
- package/eigen/Eigen/src/Core/CwiseUnaryView.h +127 -92
- package/eigen/Eigen/src/Core/DenseBase.h +630 -658
- package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -628
- package/eigen/Eigen/src/Core/DenseStorage.h +511 -590
- package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
- package/eigen/Eigen/src/Core/Diagonal.h +168 -207
- package/eigen/Eigen/src/Core/DiagonalMatrix.h +346 -317
- package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
- package/eigen/Eigen/src/Core/Dot.h +167 -217
- package/eigen/Eigen/src/Core/EigenBase.h +74 -85
- package/eigen/Eigen/src/Core/Fill.h +138 -0
- package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
- package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -113
- package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
- package/eigen/Eigen/src/Core/GeneralProduct.h +315 -261
- package/eigen/Eigen/src/Core/GenericPacketMath.h +1182 -520
- package/eigen/Eigen/src/Core/GlobalFunctions.h +193 -157
- package/eigen/Eigen/src/Core/IO.h +131 -156
- package/eigen/Eigen/src/Core/IndexedView.h +209 -125
- package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
- package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/Inverse.h +50 -59
- package/eigen/Eigen/src/Core/Map.h +123 -141
- package/eigen/Eigen/src/Core/MapBase.h +255 -282
- package/eigen/Eigen/src/Core/MathFunctions.h +1247 -1201
- package/eigen/Eigen/src/Core/MathFunctionsImpl.h +162 -99
- package/eigen/Eigen/src/Core/Matrix.h +463 -494
- package/eigen/Eigen/src/Core/MatrixBase.h +468 -470
- package/eigen/Eigen/src/Core/NestByValue.h +58 -52
- package/eigen/Eigen/src/Core/NoAlias.h +79 -86
- package/eigen/Eigen/src/Core/NumTraits.h +206 -206
- package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +163 -142
- package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
- package/eigen/Eigen/src/Core/PlainObjectBase.h +858 -972
- package/eigen/Eigen/src/Core/Product.h +246 -130
- package/eigen/Eigen/src/Core/ProductEvaluators.h +779 -671
- package/eigen/Eigen/src/Core/Random.h +153 -164
- package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
- package/eigen/Eigen/src/Core/RealView.h +250 -0
- package/eigen/Eigen/src/Core/Redux.h +334 -314
- package/eigen/Eigen/src/Core/Ref.h +259 -257
- package/eigen/Eigen/src/Core/Replicate.h +92 -104
- package/eigen/Eigen/src/Core/Reshaped.h +215 -271
- package/eigen/Eigen/src/Core/ReturnByValue.h +47 -55
- package/eigen/Eigen/src/Core/Reverse.h +133 -148
- package/eigen/Eigen/src/Core/Select.h +68 -140
- package/eigen/Eigen/src/Core/SelfAdjointView.h +254 -290
- package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
- package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
- package/eigen/Eigen/src/Core/Solve.h +88 -102
- package/eigen/Eigen/src/Core/SolveTriangular.h +126 -124
- package/eigen/Eigen/src/Core/SolverBase.h +132 -133
- package/eigen/Eigen/src/Core/StableNorm.h +113 -147
- package/eigen/Eigen/src/Core/StlIterators.h +404 -248
- package/eigen/Eigen/src/Core/Stride.h +90 -92
- package/eigen/Eigen/src/Core/Swap.h +70 -39
- package/eigen/Eigen/src/Core/Transpose.h +258 -295
- package/eigen/Eigen/src/Core/Transpositions.h +270 -333
- package/eigen/Eigen/src/Core/TriangularMatrix.h +642 -743
- package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
- package/eigen/Eigen/src/Core/VectorwiseOp.h +653 -704
- package/eigen/Eigen/src/Core/Visitor.h +464 -308
- package/eigen/Eigen/src/Core/arch/AVX/Complex.h +380 -187
- package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +65 -163
- package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2145 -638
- package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
- package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +253 -60
- package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +278 -228
- package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +48 -269
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1597 -754
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
- package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +229 -41
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +420 -184
- package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +40 -49
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2962 -2213
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +196 -212
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +713 -441
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2380 -1362
- package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
- package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +390 -224
- package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +78 -67
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1784 -799
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +167 -50
- package/eigen/Eigen/src/Core/arch/Default/Half.h +528 -379
- package/eigen/Eigen/src/Core/arch/Default/Settings.h +10 -12
- package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
- package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +41 -40
- package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +550 -523
- package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
- package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +27 -30
- package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +8 -8
- package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
- package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
- package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
- package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
- package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
- package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
- package/eigen/Eigen/src/Core/arch/MSA/Complex.h +54 -82
- package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +84 -92
- package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +51 -47
- package/eigen/Eigen/src/Core/arch/NEON/Complex.h +454 -306
- package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +175 -115
- package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +23 -30
- package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4366 -2857
- package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +616 -393
- package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
- package/eigen/Eigen/src/Core/arch/SSE/Complex.h +350 -198
- package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +38 -149
- package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +1791 -912
- package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
- package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +128 -40
- package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +10 -6
- package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +156 -234
- package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +6 -3
- package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +27 -32
- package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +119 -117
- package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +325 -419
- package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +15 -17
- package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +325 -181
- package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +94 -83
- package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +811 -458
- package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +121 -124
- package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +576 -370
- package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +194 -109
- package/eigen/Eigen/src/Core/functors/StlFunctors.h +95 -112
- package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
- package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1038 -749
- package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1883 -1375
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +312 -370
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +189 -176
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +84 -81
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +292 -337
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
- package/eigen/Eigen/src/Core/products/Parallelizer.h +207 -105
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +327 -388
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +138 -147
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
- package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
- package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -47
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -277
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
- package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +68 -94
- package/eigen/Eigen/src/Core/util/Assert.h +158 -0
- package/eigen/Eigen/src/Core/util/BlasUtil.h +342 -303
- package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +348 -317
- package/eigen/Eigen/src/Core/util/Constants.h +297 -262
- package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -90
- package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
- package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +449 -247
- package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
- package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
- package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +417 -116
- package/eigen/Eigen/src/Core/util/IntegralConstant.h +211 -204
- package/eigen/Eigen/src/Core/util/MKL_support.h +39 -37
- package/eigen/Eigen/src/Core/util/Macros.h +655 -773
- package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
- package/eigen/Eigen/src/Core/util/Memory.h +970 -748
- package/eigen/Eigen/src/Core/util/Meta.h +581 -633
- package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
- package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
- package/eigen/Eigen/src/Core/util/ReshapedHelper.h +17 -17
- package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
- package/eigen/Eigen/src/Core/util/StaticAssert.h +50 -166
- package/eigen/Eigen/src/Core/util/SymbolicIndex.h +377 -225
- package/eigen/Eigen/src/Core/util/XprHelper.h +784 -547
- package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
- package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
- package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
- package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
- package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
- package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +89 -105
- package/eigen/Eigen/src/Eigenvalues/RealQZ.h +537 -607
- package/eigen/Eigen/src/Eigenvalues/RealSchur.h +342 -381
- package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +541 -595
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
- package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +430 -462
- package/eigen/Eigen/src/Geometry/AlignedBox.h +226 -227
- package/eigen/Eigen/src/Geometry/AngleAxis.h +131 -133
- package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
- package/eigen/Eigen/src/Geometry/Homogeneous.h +285 -333
- package/eigen/Eigen/src/Geometry/Hyperplane.h +151 -160
- package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -146
- package/eigen/Eigen/src/Geometry/ParametrizedLine.h +127 -127
- package/eigen/Eigen/src/Geometry/Quaternion.h +566 -506
- package/eigen/Eigen/src/Geometry/Rotation2D.h +107 -105
- package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
- package/eigen/Eigen/src/Geometry/Scaling.h +113 -106
- package/eigen/Eigen/src/Geometry/Transform.h +858 -936
- package/eigen/Eigen/src/Geometry/Translation.h +94 -92
- package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
- package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +90 -104
- package/eigen/Eigen/src/Householder/BlockHouseholder.h +51 -46
- package/eigen/Eigen/src/Householder/Householder.h +102 -124
- package/eigen/Eigen/src/Householder/HouseholderSequence.h +412 -453
- package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +149 -162
- package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +124 -119
- package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +92 -104
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +251 -243
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +224 -228
- package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +178 -227
- package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +79 -84
- package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +54 -60
- package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Jacobi/Jacobi.h +252 -308
- package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/KLUSupport/KLUSupport.h +208 -227
- package/eigen/Eigen/src/LU/Determinant.h +50 -69
- package/eigen/Eigen/src/LU/FullPivLU.h +545 -596
- package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/LU/InverseImpl.h +206 -285
- package/eigen/Eigen/src/LU/PartialPivLU.h +390 -428
- package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
- package/eigen/Eigen/src/LU/arch/InverseSize4.h +72 -70
- package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
- package/eigen/Eigen/src/OrderingMethods/Amd.h +243 -265
- package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +831 -1004
- package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/OrderingMethods/Ordering.h +112 -119
- package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
- package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -430
- package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +479 -479
- package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
- package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +166 -153
- package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +495 -475
- package/eigen/Eigen/src/QR/HouseholderQR.h +394 -285
- package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
- package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +244 -264
- package/eigen/Eigen/src/SVD/BDCSVD.h +817 -713
- package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
- package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SVD/JacobiSVD.h +577 -543
- package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
- package/eigen/Eigen/src/SVD/SVDBase.h +242 -182
- package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +200 -235
- package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +765 -594
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +308 -94
- package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
- package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -252
- package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +134 -178
- package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCore/SparseAssign.h +149 -140
- package/eigen/Eigen/src/SparseCore/SparseBlock.h +403 -440
- package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
- package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +525 -303
- package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +555 -339
- package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
- package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +169 -197
- package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
- package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
- package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
- package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
- package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1603 -1245
- package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -350
- package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
- package/eigen/Eigen/src/SparseCore/SparseProduct.h +94 -97
- package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
- package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
- package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +370 -416
- package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
- package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
- package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
- package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
- package/eigen/Eigen/src/SparseCore/SparseUtil.h +138 -115
- package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
- package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
- package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
- package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseLU/SparseLU.h +756 -710
- package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
- package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
- package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
- package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +245 -301
- package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
- package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
- package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +89 -100
- package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
- package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
- package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +124 -132
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
- package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
- package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
- package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
- package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseQR/SparseQR.h +450 -502
- package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -93
- package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
- package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
- package/eigen/Eigen/src/StlSupport/details.h +48 -50
- package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -730
- package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
- package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
- package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
- package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
- package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
- package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
- package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
- package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
- package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
- package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
- package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
- package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
- package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +428 -464
- package/eigen/Eigen/src/misc/Image.h +41 -43
- package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/misc/Kernel.h +39 -41
- package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
- package/eigen/Eigen/src/misc/blas.h +83 -426
- package/eigen/Eigen/src/misc/lapacke.h +9972 -16179
- package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
- package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
- package/eigen/Eigen/src/plugins/{BlockMethods.h → BlockMethods.inc} +434 -506
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
- package/eigen/Eigen/src/plugins/{CommonCwiseUnaryOps.h → CommonCwiseUnaryOps.inc} +58 -68
- package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
- package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
- package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
- package/package.json +1 -1
- package/eigen/COPYING.APACHE +0 -203
- package/eigen/COPYING.BSD +0 -26
- package/eigen/COPYING.GPL +0 -674
- package/eigen/COPYING.LGPL +0 -502
- package/eigen/COPYING.MINPACK +0 -51
- package/eigen/COPYING.MPL2 +0 -373
- package/eigen/COPYING.README +0 -18
- package/eigen/Eigen/src/Core/BooleanRedux.h +0 -162
- package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -258
- package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +0 -120
- package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +0 -694
- package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
- package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
- package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
- package/eigen/Eigen/src/misc/lapack.h +0 -152
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -358
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -696
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
- package/eigen/Eigen/src/plugins/IndexedViewMethods.h +0 -262
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -95
- package/eigen/Eigen/src/plugins/ReshapedMethods.h +0 -149
- package/eigen/README.md +0 -5
|
@@ -16,73 +16,214 @@
|
|
|
16
16
|
#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
|
|
17
17
|
#define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
|
|
18
18
|
|
|
19
|
+
// IWYU pragma: private
|
|
20
|
+
#include "../../InternalHeaderCheck.h"
|
|
21
|
+
|
|
19
22
|
namespace Eigen {
|
|
20
23
|
namespace internal {
|
|
21
24
|
|
|
22
25
|
// Creates a Scalar integer type with same bit-width.
|
|
23
|
-
template<typename T>
|
|
24
|
-
|
|
25
|
-
template<>
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
template
|
|
30
|
-
|
|
26
|
+
template <typename T>
|
|
27
|
+
struct make_integer;
|
|
28
|
+
template <>
|
|
29
|
+
struct make_integer<float> {
|
|
30
|
+
typedef numext::int32_t type;
|
|
31
|
+
};
|
|
32
|
+
template <>
|
|
33
|
+
struct make_integer<double> {
|
|
34
|
+
typedef numext::int64_t type;
|
|
35
|
+
};
|
|
36
|
+
template <>
|
|
37
|
+
struct make_integer<half> {
|
|
38
|
+
typedef numext::int16_t type;
|
|
39
|
+
};
|
|
40
|
+
template <>
|
|
41
|
+
struct make_integer<bfloat16> {
|
|
42
|
+
typedef numext::int16_t type;
|
|
43
|
+
};
|
|
44
|
+
|
|
45
|
+
/* polevl (modified for Eigen)
|
|
46
|
+
*
|
|
47
|
+
* Evaluate polynomial
|
|
48
|
+
*
|
|
49
|
+
*
|
|
50
|
+
*
|
|
51
|
+
* SYNOPSIS:
|
|
52
|
+
*
|
|
53
|
+
* int N;
|
|
54
|
+
* Scalar x, y, coef[N+1];
|
|
55
|
+
*
|
|
56
|
+
* y = polevl<decltype(x), N>( x, coef);
|
|
57
|
+
*
|
|
58
|
+
*
|
|
59
|
+
*
|
|
60
|
+
* DESCRIPTION:
|
|
61
|
+
*
|
|
62
|
+
* Evaluates polynomial of degree N:
|
|
63
|
+
*
|
|
64
|
+
* 2 N
|
|
65
|
+
* y = C + C x + C x +...+ C x
|
|
66
|
+
* 0 1 2 N
|
|
67
|
+
*
|
|
68
|
+
* Coefficients are stored in reverse order:
|
|
69
|
+
*
|
|
70
|
+
* coef[0] = C , ..., coef[N] = C .
|
|
71
|
+
* N 0
|
|
72
|
+
*
|
|
73
|
+
* The function p1evl() assumes that coef[N] = 1.0 and is
|
|
74
|
+
* omitted from the array. Its calling arguments are
|
|
75
|
+
* otherwise the same as polevl().
|
|
76
|
+
*
|
|
77
|
+
*
|
|
78
|
+
* The Eigen implementation is templatized. For best speed, store
|
|
79
|
+
* coef as a const array (constexpr), e.g.
|
|
80
|
+
*
|
|
81
|
+
* const double coef[] = {1.0, 2.0, 3.0, ...};
|
|
82
|
+
*
|
|
83
|
+
*/
|
|
84
|
+
template <typename Packet, int N>
|
|
85
|
+
struct ppolevl {
|
|
86
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
|
|
87
|
+
const typename unpacket_traits<Packet>::type coeff[]) {
|
|
88
|
+
EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
|
89
|
+
return pmadd(ppolevl<Packet, N - 1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
|
|
90
|
+
}
|
|
91
|
+
};
|
|
92
|
+
|
|
93
|
+
template <typename Packet>
|
|
94
|
+
struct ppolevl<Packet, 0> {
|
|
95
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
|
|
96
|
+
const typename unpacket_traits<Packet>::type coeff[]) {
|
|
97
|
+
EIGEN_UNUSED_VARIABLE(x);
|
|
98
|
+
return pset1<Packet>(coeff[0]);
|
|
99
|
+
}
|
|
100
|
+
};
|
|
101
|
+
|
|
102
|
+
/* chbevl (modified for Eigen)
|
|
103
|
+
*
|
|
104
|
+
* Evaluate Chebyshev series
|
|
105
|
+
*
|
|
106
|
+
*
|
|
107
|
+
*
|
|
108
|
+
* SYNOPSIS:
|
|
109
|
+
*
|
|
110
|
+
* int N;
|
|
111
|
+
* Scalar x, y, coef[N], chebevl();
|
|
112
|
+
*
|
|
113
|
+
* y = chbevl( x, coef, N );
|
|
114
|
+
*
|
|
115
|
+
*
|
|
116
|
+
*
|
|
117
|
+
* DESCRIPTION:
|
|
118
|
+
*
|
|
119
|
+
* Evaluates the series
|
|
120
|
+
*
|
|
121
|
+
* N-1
|
|
122
|
+
* - '
|
|
123
|
+
* y = > coef[i] T (x/2)
|
|
124
|
+
* - i
|
|
125
|
+
* i=0
|
|
126
|
+
*
|
|
127
|
+
* of Chebyshev polynomials Ti at argument x/2.
|
|
128
|
+
*
|
|
129
|
+
* Coefficients are stored in reverse order, i.e. the zero
|
|
130
|
+
* order term is last in the array. Note N is the number of
|
|
131
|
+
* coefficients, not the order.
|
|
132
|
+
*
|
|
133
|
+
* If coefficients are for the interval a to b, x must
|
|
134
|
+
* have been transformed to x -> 2(2x - b - a)/(b-a) before
|
|
135
|
+
* entering the routine. This maps x from (a, b) to (-1, 1),
|
|
136
|
+
* over which the Chebyshev polynomials are defined.
|
|
137
|
+
*
|
|
138
|
+
* If the coefficients are for the inverted interval, in
|
|
139
|
+
* which (a, b) is mapped to (1/b, 1/a), the transformation
|
|
140
|
+
* required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity,
|
|
141
|
+
* this becomes x -> 4a/x - 1.
|
|
142
|
+
*
|
|
143
|
+
*
|
|
144
|
+
*
|
|
145
|
+
* SPEED:
|
|
146
|
+
*
|
|
147
|
+
* Taking advantage of the recurrence properties of the
|
|
148
|
+
* Chebyshev polynomials, the routine requires one more
|
|
149
|
+
* addition per loop than evaluating a nested polynomial of
|
|
150
|
+
* the same degree.
|
|
151
|
+
*
|
|
152
|
+
*/
|
|
153
|
+
|
|
154
|
+
template <typename Packet, int N>
|
|
155
|
+
struct pchebevl {
|
|
156
|
+
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(Packet x,
|
|
157
|
+
const typename unpacket_traits<Packet>::type coef[]) {
|
|
158
|
+
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
159
|
+
Packet b0 = pset1<Packet>(coef[0]);
|
|
160
|
+
Packet b1 = pset1<Packet>(static_cast<Scalar>(0.f));
|
|
161
|
+
Packet b2;
|
|
162
|
+
|
|
163
|
+
for (int i = 1; i < N; i++) {
|
|
164
|
+
b2 = b1;
|
|
165
|
+
b1 = b0;
|
|
166
|
+
b0 = psub(pmadd(x, b1, pset1<Packet>(coef[i])), b2);
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
return pmul(pset1<Packet>(static_cast<Scalar>(0.5f)), psub(b0, b2));
|
|
170
|
+
}
|
|
171
|
+
};
|
|
172
|
+
|
|
173
|
+
template <typename Packet>
|
|
174
|
+
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic_get_biased_exponent(const Packet& a) {
|
|
31
175
|
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
32
176
|
typedef typename unpacket_traits<Packet>::integer_packet PacketI;
|
|
33
|
-
|
|
177
|
+
static constexpr int mantissa_bits = numext::numeric_limits<Scalar>::digits - 1;
|
|
34
178
|
return pcast<PacketI, Packet>(plogical_shift_right<mantissa_bits>(preinterpret<PacketI>(pabs(a))));
|
|
35
179
|
}
|
|
36
180
|
|
|
37
181
|
// Safely applies frexp, correctly handles denormals.
|
|
38
182
|
// Assumes IEEE floating point format.
|
|
39
|
-
template<typename Packet>
|
|
40
|
-
Packet pfrexp_generic(const Packet& a, Packet& exponent) {
|
|
183
|
+
template <typename Packet>
|
|
184
|
+
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet& a, Packet& exponent) {
|
|
41
185
|
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
42
186
|
typedef typename make_unsigned<typename make_integer<Scalar>::type>::type ScalarUI;
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
EIGEN_CONSTEXPR ScalarUI scalar_sign_mantissa_mask =
|
|
50
|
-
~(((ScalarUI(1) << int(ExponentBits)) - ScalarUI(1)) << int(MantissaBits)); // ~0x7f800000
|
|
51
|
-
const Packet sign_mantissa_mask = pset1frombits<Packet>(static_cast<ScalarUI>(scalar_sign_mantissa_mask));
|
|
187
|
+
static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
|
|
188
|
+
ExponentBits = TotalBits - MantissaBits - 1;
|
|
189
|
+
|
|
190
|
+
constexpr ScalarUI scalar_sign_mantissa_mask =
|
|
191
|
+
~(((ScalarUI(1) << ExponentBits) - ScalarUI(1)) << MantissaBits); // ~0x7f800000
|
|
192
|
+
const Packet sign_mantissa_mask = pset1frombits<Packet>(static_cast<ScalarUI>(scalar_sign_mantissa_mask));
|
|
52
193
|
const Packet half = pset1<Packet>(Scalar(0.5));
|
|
53
194
|
const Packet zero = pzero(a);
|
|
54
|
-
const Packet normal_min = pset1<Packet>((numext::numeric_limits<Scalar>::min)());
|
|
55
|
-
|
|
195
|
+
const Packet normal_min = pset1<Packet>((numext::numeric_limits<Scalar>::min)()); // Minimum normal value, 2^-126
|
|
196
|
+
|
|
56
197
|
// To handle denormals, normalize by multiplying by 2^(int(MantissaBits)+1).
|
|
57
198
|
const Packet is_denormal = pcmp_lt(pabs(a), normal_min);
|
|
58
|
-
|
|
199
|
+
constexpr ScalarUI scalar_normalization_offset = ScalarUI(MantissaBits + 1); // 24
|
|
59
200
|
// The following cannot be constexpr because bfloat16(uint16_t) is not constexpr.
|
|
60
|
-
const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset));
|
|
61
|
-
const Packet normalization_factor = pset1<Packet>(scalar_normalization_factor);
|
|
201
|
+
const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset)); // 2^24
|
|
202
|
+
const Packet normalization_factor = pset1<Packet>(scalar_normalization_factor);
|
|
62
203
|
const Packet normalized_a = pselect(is_denormal, pmul(a, normalization_factor), a);
|
|
63
|
-
|
|
204
|
+
|
|
64
205
|
// Determine exponent offset: -126 if normal, -126-24 if denormal
|
|
65
|
-
const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1)<<(
|
|
206
|
+
const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1) << (ExponentBits - 1)) - ScalarUI(2)); // -126
|
|
66
207
|
Packet exponent_offset = pset1<Packet>(scalar_exponent_offset);
|
|
67
|
-
const Packet normalization_offset = pset1<Packet>(-Scalar(scalar_normalization_offset));
|
|
208
|
+
const Packet normalization_offset = pset1<Packet>(-Scalar(scalar_normalization_offset)); // -24
|
|
68
209
|
exponent_offset = pselect(is_denormal, padd(exponent_offset, normalization_offset), exponent_offset);
|
|
69
|
-
|
|
210
|
+
|
|
70
211
|
// Determine exponent and mantissa from normalized_a.
|
|
71
212
|
exponent = pfrexp_generic_get_biased_exponent(normalized_a);
|
|
72
213
|
// Zero, Inf and NaN return 'a' unmodified, exponent is zero
|
|
73
214
|
// (technically the exponent is unspecified for inf/NaN, but GCC/Clang set it to zero)
|
|
74
|
-
const Scalar scalar_non_finite_exponent = Scalar((ScalarUI(1) <<
|
|
215
|
+
const Scalar scalar_non_finite_exponent = Scalar((ScalarUI(1) << ExponentBits) - ScalarUI(1)); // 255
|
|
75
216
|
const Packet non_finite_exponent = pset1<Packet>(scalar_non_finite_exponent);
|
|
76
217
|
const Packet is_zero_or_not_finite = por(pcmp_eq(a, zero), pcmp_eq(exponent, non_finite_exponent));
|
|
77
218
|
const Packet m = pselect(is_zero_or_not_finite, a, por(pand(normalized_a, sign_mantissa_mask), half));
|
|
78
|
-
exponent = pselect(is_zero_or_not_finite, zero, padd(exponent, exponent_offset));
|
|
219
|
+
exponent = pselect(is_zero_or_not_finite, zero, padd(exponent, exponent_offset));
|
|
79
220
|
return m;
|
|
80
221
|
}
|
|
81
222
|
|
|
82
223
|
// Safely applies ldexp, correctly handles overflows, underflows and denormals.
|
|
83
224
|
// Assumes IEEE floating point format.
|
|
84
|
-
template<typename Packet>
|
|
85
|
-
Packet pldexp_generic(const Packet& a, const Packet& exponent) {
|
|
225
|
+
template <typename Packet>
|
|
226
|
+
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet& a, const Packet& exponent) {
|
|
86
227
|
// We want to return a * 2^exponent, allowing for all possible integer
|
|
87
228
|
// exponents without overflowing or underflowing in intermediate
|
|
88
229
|
// computations.
|
|
@@ -91,7 +232,7 @@ Packet pldexp_generic(const Packet& a, const Packet& exponent) {
|
|
|
91
232
|
// to consider for a float is:
|
|
92
233
|
// -255-23 -> 255+23
|
|
93
234
|
// Below -278 any finite float 'a' will become zero, and above +278 any
|
|
94
|
-
// finite float will become inf, including when 'a' is the smallest possible
|
|
235
|
+
// finite float will become inf, including when 'a' is the smallest possible
|
|
95
236
|
// denormal.
|
|
96
237
|
//
|
|
97
238
|
// Unfortunately, 2^(278) cannot be represented using either one or two
|
|
@@ -108,25 +249,22 @@ Packet pldexp_generic(const Packet& a, const Packet& exponent) {
|
|
|
108
249
|
typedef typename unpacket_traits<Packet>::integer_packet PacketI;
|
|
109
250
|
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
110
251
|
typedef typename unpacket_traits<PacketI>::type ScalarI;
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
const Packet max_exponent = pset1<Packet>(Scalar((ScalarI(1)<<int(ExponentBits)) + ScalarI(int(MantissaBits) - 1))); // 278
|
|
118
|
-
const PacketI bias = pset1<PacketI>((ScalarI(1)<<(int(ExponentBits)-1)) - ScalarI(1)); // 127
|
|
252
|
+
static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
|
|
253
|
+
ExponentBits = TotalBits - MantissaBits - 1;
|
|
254
|
+
|
|
255
|
+
const Packet max_exponent = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) + ScalarI(MantissaBits - 1))); // 278
|
|
256
|
+
const PacketI bias = pset1<PacketI>((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1)); // 127
|
|
119
257
|
const PacketI e = pcast<Packet, PacketI>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
|
|
120
|
-
PacketI b = parithmetic_shift_right<2>(e);
|
|
121
|
-
Packet c = preinterpret<Packet>(plogical_shift_left<
|
|
122
|
-
Packet out = pmul(pmul(pmul(a, c), c), c);
|
|
123
|
-
b =
|
|
124
|
-
c = preinterpret<Packet>(plogical_shift_left<
|
|
258
|
+
PacketI b = parithmetic_shift_right<2>(e); // floor(e/4);
|
|
259
|
+
Packet c = preinterpret<Packet>(plogical_shift_left<MantissaBits>(padd(b, bias))); // 2^b
|
|
260
|
+
Packet out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
|
|
261
|
+
b = pnmadd(pset1<PacketI>(3), b, e); // e - 3b
|
|
262
|
+
c = preinterpret<Packet>(plogical_shift_left<MantissaBits>(padd(b, bias))); // 2^(e-3*b)
|
|
125
263
|
out = pmul(out, c);
|
|
126
264
|
return out;
|
|
127
265
|
}
|
|
128
266
|
|
|
129
|
-
// Explicitly multiplies
|
|
267
|
+
// Explicitly multiplies
|
|
130
268
|
// a * (2^e)
|
|
131
269
|
// clamping e to the range
|
|
132
270
|
// [NumTraits<Scalar>::min_exponent()-2, NumTraits<Scalar>::max_exponent()]
|
|
@@ -135,27 +273,157 @@ Packet pldexp_generic(const Packet& a, const Packet& exponent) {
|
|
|
135
273
|
// if 2^e doesn't fit into a normal floating-point Scalar.
|
|
136
274
|
//
|
|
137
275
|
// Assumes IEEE floating point format
|
|
138
|
-
template<typename Packet>
|
|
139
|
-
|
|
276
|
+
template <typename Packet>
|
|
277
|
+
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_fast(const Packet& a, const Packet& exponent) {
|
|
140
278
|
typedef typename unpacket_traits<Packet>::integer_packet PacketI;
|
|
141
279
|
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
142
280
|
typedef typename unpacket_traits<PacketI>::type ScalarI;
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
281
|
+
static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
|
|
282
|
+
ExponentBits = TotalBits - MantissaBits - 1;
|
|
283
|
+
|
|
284
|
+
const Packet bias = pset1<Packet>(Scalar((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1))); // 127
|
|
285
|
+
const Packet limit = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) - ScalarI(1))); // 255
|
|
286
|
+
// restrict biased exponent between 0 and 255 for float.
|
|
287
|
+
const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit)); // exponent + 127
|
|
288
|
+
// return a * (2^e)
|
|
289
|
+
return pmul(a, preinterpret<Packet>(plogical_shift_left<MantissaBits>(e)));
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
// This function implements a single step of Halley's iteration for
|
|
293
|
+
// computing x = y^(1/3):
|
|
294
|
+
// x_{k+1} = x_k - (x_k^3 - y) x_k / (2x_k^3 + y)
|
|
295
|
+
template <typename Packet>
|
|
296
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_halley_iteration_step(const Packet& x_k,
|
|
297
|
+
const Packet& y) {
|
|
298
|
+
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
299
|
+
Packet x_k_cb = pmul(x_k, pmul(x_k, x_k));
|
|
300
|
+
Packet denom = pmadd(pset1<Packet>(Scalar(2)), x_k_cb, y);
|
|
301
|
+
Packet num = psub(x_k_cb, y);
|
|
302
|
+
Packet r = pdiv(num, denom);
|
|
303
|
+
return pnmadd(x_k, r, x_k);
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
// Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
|
|
307
|
+
// interval [0.125,1].
|
|
308
|
+
template <typename Packet>
|
|
309
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_decompose(const Packet& x, Packet& e_div3) {
|
|
310
|
+
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
311
|
+
// Extract the significand s in the range [0.5,1) and exponent e, such that
|
|
312
|
+
// x = 2^e * s.
|
|
313
|
+
Packet e, s;
|
|
314
|
+
s = pfrexp(x, e);
|
|
315
|
+
|
|
316
|
+
// Split the exponent into a part divisible by 3 and the remainder.
|
|
317
|
+
// e = 3*e_div3 + e_mod3.
|
|
318
|
+
constexpr Scalar kOneThird = Scalar(1) / 3;
|
|
319
|
+
e_div3 = pceil(pmul(e, pset1<Packet>(kOneThird)));
|
|
320
|
+
Packet e_mod3 = pnmadd(pset1<Packet>(Scalar(3)), e_div3, e);
|
|
321
|
+
|
|
322
|
+
// Replace s by y = (s * 2^e_mod3).
|
|
323
|
+
return pldexp_fast(s, e_mod3);
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
template <typename Packet>
|
|
327
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_special_cases_and_sign(const Packet& x,
|
|
328
|
+
const Packet& abs_root) {
|
|
329
|
+
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
330
|
+
|
|
331
|
+
// Set sign.
|
|
332
|
+
const Packet sign_mask = pset1<Packet>(Scalar(-0.0));
|
|
333
|
+
const Packet x_sign = pand(sign_mask, x);
|
|
334
|
+
Packet root = por(x_sign, abs_root);
|
|
335
|
+
|
|
336
|
+
// Pass non-finite and zero values of x straight through.
|
|
337
|
+
const Packet is_not_finite = por(pisinf(x), pisnan(x));
|
|
338
|
+
const Packet is_zero = pcmp_eq(pzero(x), x);
|
|
339
|
+
const Packet use_x = por(is_not_finite, is_zero);
|
|
340
|
+
return pselect(use_x, x, root);
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
// Generic implementation of cbrt(x) for float.
|
|
344
|
+
//
|
|
345
|
+
// The algorithm computes the cubic root of the input by first
|
|
346
|
+
// decomposing it into an exponent and significand
|
|
347
|
+
// x = s * 2^e.
|
|
348
|
+
//
|
|
349
|
+
// We can then write the cube root as
|
|
350
|
+
//
|
|
351
|
+
// x^(1/3) = 2^(e/3) * s^(1/3)
|
|
352
|
+
// = 2^((3*e_div3 + e_mod3)/3) * s^(1/3)
|
|
353
|
+
// = 2^(e_div3) * 2^(e_mod3/3) * s^(1/3)
|
|
354
|
+
// = 2^(e_div3) * (s * 2^e_mod3)^(1/3)
|
|
355
|
+
//
|
|
356
|
+
// where e_div3 = ceil(e/3) and e_mod3 = e - 3*e_div3.
|
|
357
|
+
//
|
|
358
|
+
// The cube root of the second term y = (s * 2^e_mod3)^(1/3) is coarsely
|
|
359
|
+
// approximated using a cubic polynomial and subsequently refined using a
|
|
360
|
+
// single step of Halley's iteration, and finally the two terms are combined
|
|
361
|
+
// using pldexp_fast.
|
|
362
|
+
//
|
|
363
|
+
// Note: Many alternatives exist for implementing cbrt. See, for example,
|
|
364
|
+
// the excellent discussion in Kahan's note:
|
|
365
|
+
// https://csclub.uwaterloo.ca/~pbarfuss/qbrt.pdf
|
|
366
|
+
// This particular implementation was found to be very fast and accurate
|
|
367
|
+
// among several alternatives tried, but is probably not "optimal" on all
|
|
368
|
+
// platforms.
|
|
369
|
+
//
|
|
370
|
+
// This is accurate to 2 ULP.
|
|
371
|
+
template <typename Packet>
|
|
372
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_float(const Packet& x) {
|
|
373
|
+
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
374
|
+
static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
|
|
375
|
+
|
|
376
|
+
// Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
|
|
377
|
+
// interval [0.125,1].
|
|
378
|
+
Packet e_div3;
|
|
379
|
+
const Packet y = cbrt_decompose(pabs(x), e_div3);
|
|
380
|
+
|
|
381
|
+
// Compute initial approximation accurate to 5.22e-3.
|
|
382
|
+
// The polynomial was computed using Rminimax.
|
|
383
|
+
constexpr float alpha[] = {5.9220016002655029296875e-01f, -1.3859539031982421875e+00f, 1.4581282138824462890625e+00f,
|
|
384
|
+
3.408401906490325927734375e-01f};
|
|
385
|
+
Packet r = ppolevl<Packet, 3>::run(y, alpha);
|
|
386
|
+
|
|
387
|
+
// Take one step of Halley's iteration.
|
|
388
|
+
r = cbrt_halley_iteration_step(r, y);
|
|
389
|
+
|
|
390
|
+
// Finally multiply by 2^(e_div3)
|
|
391
|
+
r = pldexp_fast(r, e_div3);
|
|
392
|
+
|
|
393
|
+
return cbrt_special_cases_and_sign(x, r);
|
|
394
|
+
}
|
|
395
|
+
|
|
396
|
+
// Generic implementation of cbrt(x) for double.
|
|
397
|
+
//
|
|
398
|
+
// The algorithm is identical to the one for float except that a different initial
|
|
399
|
+
// approximation is used for y^(1/3) and two Halley iteration steps are performed.
|
|
400
|
+
//
|
|
401
|
+
// This is accurate to 1 ULP.
|
|
402
|
+
template <typename Packet>
|
|
403
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_double(const Packet& x) {
|
|
404
|
+
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
405
|
+
static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
|
|
406
|
+
|
|
407
|
+
// Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
|
|
408
|
+
// interval [0.125,1].
|
|
409
|
+
Packet e_div3;
|
|
410
|
+
const Packet y = cbrt_decompose(pabs(x), e_div3);
|
|
411
|
+
|
|
412
|
+
// Compute initial approximation accurate to 0.016.
|
|
413
|
+
// The polynomial was computed using Rminimax.
|
|
414
|
+
constexpr double alpha[] = {-4.69470621553356115551736138513660989701747894287109375e-01,
|
|
415
|
+
1.072314636518546304699839311069808900356292724609375e+00,
|
|
416
|
+
3.81249427609571867048288140722434036433696746826171875e-01};
|
|
417
|
+
Packet r = ppolevl<Packet, 2>::run(y, alpha);
|
|
418
|
+
|
|
419
|
+
// Take two steps of Halley's iteration.
|
|
420
|
+
r = cbrt_halley_iteration_step(r, y);
|
|
421
|
+
r = cbrt_halley_iteration_step(r, y);
|
|
422
|
+
|
|
423
|
+
// Finally multiply by 2^(e_div3).
|
|
424
|
+
r = pldexp_fast(r, e_div3);
|
|
425
|
+
return cbrt_special_cases_and_sign(x, r);
|
|
426
|
+
}
|
|
159
427
|
|
|
160
428
|
// Natural or base 2 logarithm.
|
|
161
429
|
// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
|
|
@@ -164,37 +432,15 @@ struct pldexp_fast_impl {
|
|
|
164
432
|
// TODO(gonnet): Further reduce the interval allowing for lower-degree
|
|
165
433
|
// polynomial interpolants -> ... -> profit!
|
|
166
434
|
template <typename Packet, bool base2>
|
|
167
|
-
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
|
168
|
-
|
|
169
|
-
Packet
|
|
170
|
-
|
|
171
|
-
Packet x = _x;
|
|
172
|
-
|
|
173
|
-
const Packet cst_1 = pset1<Packet>(1.0f);
|
|
174
|
-
const Packet cst_neg_half = pset1<Packet>(-0.5f);
|
|
175
|
-
// The smallest non denormalized float number.
|
|
176
|
-
const Packet cst_min_norm_pos = pset1frombits<Packet>( 0x00800000u);
|
|
177
|
-
const Packet cst_minus_inf = pset1frombits<Packet>( 0xff800000u);
|
|
178
|
-
const Packet cst_pos_inf = pset1frombits<Packet>( 0x7f800000u);
|
|
435
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_float(const Packet _x) {
|
|
436
|
+
const Packet cst_1 = pset1<Packet>(1.0f);
|
|
437
|
+
const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0xff800000u));
|
|
438
|
+
const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x7f800000u));
|
|
179
439
|
|
|
180
|
-
// Polynomial coefficients.
|
|
181
440
|
const Packet cst_cephes_SQRTHF = pset1<Packet>(0.707106781186547524f);
|
|
182
|
-
|
|
183
|
-
const Packet cst_cephes_log_p1 = pset1<Packet>(-1.1514610310E-1f);
|
|
184
|
-
const Packet cst_cephes_log_p2 = pset1<Packet>(1.1676998740E-1f);
|
|
185
|
-
const Packet cst_cephes_log_p3 = pset1<Packet>(-1.2420140846E-1f);
|
|
186
|
-
const Packet cst_cephes_log_p4 = pset1<Packet>(+1.4249322787E-1f);
|
|
187
|
-
const Packet cst_cephes_log_p5 = pset1<Packet>(-1.6668057665E-1f);
|
|
188
|
-
const Packet cst_cephes_log_p6 = pset1<Packet>(+2.0000714765E-1f);
|
|
189
|
-
const Packet cst_cephes_log_p7 = pset1<Packet>(-2.4999993993E-1f);
|
|
190
|
-
const Packet cst_cephes_log_p8 = pset1<Packet>(+3.3333331174E-1f);
|
|
191
|
-
|
|
192
|
-
// Truncate input values to the minimum positive normal.
|
|
193
|
-
x = pmax(x, cst_min_norm_pos);
|
|
194
|
-
|
|
195
|
-
Packet e;
|
|
441
|
+
Packet e, x;
|
|
196
442
|
// extract significand in the range [0.5,1) and exponent
|
|
197
|
-
x = pfrexp(
|
|
443
|
+
x = pfrexp(_x, e);
|
|
198
444
|
|
|
199
445
|
// part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
|
|
200
446
|
// and shift by -1. The values are then centered around 0, which improves
|
|
@@ -209,24 +455,15 @@ Packet plog_impl_float(const Packet _x)
|
|
|
209
455
|
e = psub(e, pand(cst_1, mask));
|
|
210
456
|
x = padd(x, tmp);
|
|
211
457
|
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
// to improve instruction-level parallelism.
|
|
217
|
-
Packet y, y1, y2;
|
|
218
|
-
y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
|
|
219
|
-
y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
|
|
220
|
-
y2 = pmadd(cst_cephes_log_p6, x, cst_cephes_log_p7);
|
|
221
|
-
y = pmadd(y, x, cst_cephes_log_p2);
|
|
222
|
-
y1 = pmadd(y1, x, cst_cephes_log_p5);
|
|
223
|
-
y2 = pmadd(y2, x, cst_cephes_log_p8);
|
|
224
|
-
y = pmadd(y, x3, y1);
|
|
225
|
-
y = pmadd(y, x3, y2);
|
|
226
|
-
y = pmul(y, x3);
|
|
458
|
+
// Polynomial coefficients for rational r(x) = p(x)/q(x)
|
|
459
|
+
// approximating log(1+x) on [sqrt(0.5)-1;sqrt(2)-1].
|
|
460
|
+
constexpr float alpha[] = {0.18256296349849254f, 1.0000000190281063f, 1.0000000190281136f};
|
|
461
|
+
constexpr float beta[] = {0.049616247954120038f, 0.59923249590823520f, 1.4999999999999927f, 1.0f};
|
|
227
462
|
|
|
228
|
-
|
|
229
|
-
|
|
463
|
+
Packet p = ppolevl<Packet, 2>::run(x, alpha);
|
|
464
|
+
p = pmul(x, p);
|
|
465
|
+
Packet q = ppolevl<Packet, 3>::run(x, beta);
|
|
466
|
+
x = pdiv(p, q);
|
|
230
467
|
|
|
231
468
|
// Add the logarithm of the exponent back to the result of the interpolation.
|
|
232
469
|
if (base2) {
|
|
@@ -238,29 +475,22 @@ Packet plog_impl_float(const Packet _x)
|
|
|
238
475
|
}
|
|
239
476
|
|
|
240
477
|
Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
|
|
241
|
-
Packet iszero_mask
|
|
242
|
-
Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
|
|
478
|
+
Packet iszero_mask = pcmp_eq(_x, pzero(_x));
|
|
479
|
+
Packet pos_inf_mask = pcmp_eq(_x, cst_pos_inf);
|
|
243
480
|
// Filter out invalid inputs, i.e.:
|
|
244
481
|
// - negative arg will be NAN
|
|
245
482
|
// - 0 will be -INF
|
|
246
483
|
// - +INF will be +INF
|
|
247
|
-
return pselect(iszero_mask, cst_minus_inf,
|
|
248
|
-
por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
|
|
484
|
+
return pselect(iszero_mask, cst_minus_inf, por(pselect(pos_inf_mask, cst_pos_inf, x), invalid_mask));
|
|
249
485
|
}
|
|
250
486
|
|
|
251
487
|
template <typename Packet>
|
|
252
|
-
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
|
253
|
-
EIGEN_UNUSED
|
|
254
|
-
Packet plog_float(const Packet _x)
|
|
255
|
-
{
|
|
488
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_float(const Packet _x) {
|
|
256
489
|
return plog_impl_float<Packet, /* base2 */ false>(_x);
|
|
257
490
|
}
|
|
258
491
|
|
|
259
492
|
template <typename Packet>
|
|
260
|
-
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
|
261
|
-
EIGEN_UNUSED
|
|
262
|
-
Packet plog2_float(const Packet _x)
|
|
263
|
-
{
|
|
493
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Packet _x) {
|
|
264
494
|
return plog_impl_float<Packet, /* base2 */ true>(_x);
|
|
265
495
|
}
|
|
266
496
|
|
|
@@ -274,22 +504,16 @@ Packet plog2_float(const Packet _x)
|
|
|
274
504
|
* for more detail see: http://www.netlib.org/cephes/
|
|
275
505
|
*/
|
|
276
506
|
template <typename Packet, bool base2>
|
|
277
|
-
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
|
278
|
-
EIGEN_UNUSED
|
|
279
|
-
Packet plog_impl_double(const Packet _x)
|
|
280
|
-
{
|
|
507
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_double(const Packet _x) {
|
|
281
508
|
Packet x = _x;
|
|
282
509
|
|
|
283
|
-
const Packet cst_1
|
|
284
|
-
const Packet cst_neg_half
|
|
285
|
-
|
|
286
|
-
const Packet
|
|
287
|
-
const Packet cst_minus_inf = pset1frombits<Packet>( static_cast<uint64_t>(0xfff0000000000000ull));
|
|
288
|
-
const Packet cst_pos_inf = pset1frombits<Packet>( static_cast<uint64_t>(0x7ff0000000000000ull));
|
|
289
|
-
|
|
510
|
+
const Packet cst_1 = pset1<Packet>(1.0);
|
|
511
|
+
const Packet cst_neg_half = pset1<Packet>(-0.5);
|
|
512
|
+
const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
|
|
513
|
+
const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));
|
|
290
514
|
|
|
291
|
-
|
|
292
|
-
|
|
515
|
+
// Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x)
|
|
516
|
+
// 1/sqrt(2) <= x < sqrt(2)
|
|
293
517
|
const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
|
|
294
518
|
const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
|
|
295
519
|
const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
|
|
@@ -305,13 +529,10 @@ Packet plog_impl_double(const Packet _x)
|
|
|
305
529
|
const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
|
|
306
530
|
const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
|
|
307
531
|
|
|
308
|
-
// Truncate input values to the minimum positive normal.
|
|
309
|
-
x = pmax(x, cst_min_norm_pos);
|
|
310
|
-
|
|
311
532
|
Packet e;
|
|
312
533
|
// extract significand in the range [0.5,1) and exponent
|
|
313
|
-
x = pfrexp(x,e);
|
|
314
|
-
|
|
534
|
+
x = pfrexp(x, e);
|
|
535
|
+
|
|
315
536
|
// Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
|
|
316
537
|
// and shift by -1. The values are then centered around 0, which improves
|
|
317
538
|
// the stability of the polynomial evaluation.
|
|
@@ -331,20 +552,20 @@ Packet plog_impl_double(const Packet _x)
|
|
|
331
552
|
// Evaluate the polynomial approximant in two interleaved halves, probably to improve instruction-level parallelism.
|
|
332
553
|
// y = x - 0.5*x^2 + x^3 * polevl( x, P, 5 ) / p1evl( x, Q, 5 ) );
|
|
333
554
|
Packet y, y1, y_;
|
|
334
|
-
y
|
|
555
|
+
y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
|
|
335
556
|
y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
|
|
336
|
-
y
|
|
557
|
+
y = pmadd(y, x, cst_cephes_log_p2);
|
|
337
558
|
y1 = pmadd(y1, x, cst_cephes_log_p5);
|
|
338
559
|
y_ = pmadd(y, x3, y1);
|
|
339
560
|
|
|
340
|
-
y
|
|
561
|
+
y = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1);
|
|
341
562
|
y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
|
|
342
|
-
y
|
|
563
|
+
y = pmadd(y, x, cst_cephes_log_q2);
|
|
343
564
|
y1 = pmadd(y1, x, cst_cephes_log_q5);
|
|
344
|
-
y
|
|
565
|
+
y = pmadd(y, x3, y1);
|
|
345
566
|
|
|
346
567
|
y_ = pmul(y_, x3);
|
|
347
|
-
y
|
|
568
|
+
y = pdiv(y_, y);
|
|
348
569
|
|
|
349
570
|
y = pmadd(cst_neg_half, x2, y);
|
|
350
571
|
x = padd(x, y);
|
|
@@ -359,38 +580,30 @@ Packet plog_impl_double(const Packet _x)
|
|
|
359
580
|
}
|
|
360
581
|
|
|
361
582
|
Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
|
|
362
|
-
Packet iszero_mask
|
|
363
|
-
Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
|
|
583
|
+
Packet iszero_mask = pcmp_eq(_x, pzero(_x));
|
|
584
|
+
Packet pos_inf_mask = pcmp_eq(_x, cst_pos_inf);
|
|
364
585
|
// Filter out invalid inputs, i.e.:
|
|
365
586
|
// - negative arg will be NAN
|
|
366
587
|
// - 0 will be -INF
|
|
367
588
|
// - +INF will be +INF
|
|
368
|
-
return pselect(iszero_mask, cst_minus_inf,
|
|
369
|
-
por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
|
|
589
|
+
return pselect(iszero_mask, cst_minus_inf, por(pselect(pos_inf_mask, cst_pos_inf, x), invalid_mask));
|
|
370
590
|
}
|
|
371
591
|
|
|
372
592
|
template <typename Packet>
|
|
373
|
-
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
|
374
|
-
EIGEN_UNUSED
|
|
375
|
-
Packet plog_double(const Packet _x)
|
|
376
|
-
{
|
|
593
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_double(const Packet _x) {
|
|
377
594
|
return plog_impl_double<Packet, /* base2 */ false>(_x);
|
|
378
595
|
}
|
|
379
596
|
|
|
380
597
|
template <typename Packet>
|
|
381
|
-
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
|
382
|
-
EIGEN_UNUSED
|
|
383
|
-
Packet plog2_double(const Packet _x)
|
|
384
|
-
{
|
|
598
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_double(const Packet _x) {
|
|
385
599
|
return plog_impl_double<Packet, /* base2 */ true>(_x);
|
|
386
600
|
}
|
|
387
601
|
|
|
388
602
|
/** \internal \returns log(1 + x) computed using W. Kahan's formula.
|
|
389
603
|
See: http://www.plunk.org/~hatch/rightway.php
|
|
390
604
|
*/
|
|
391
|
-
template<typename Packet>
|
|
392
|
-
Packet
|
|
393
|
-
{
|
|
605
|
+
template <typename Packet>
|
|
606
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p(const Packet& x) {
|
|
394
607
|
typedef typename unpacket_traits<Packet>::type ScalarType;
|
|
395
608
|
const Packet one = pset1<Packet>(ScalarType(1));
|
|
396
609
|
Packet xp1 = padd(x, one);
|
|
@@ -404,9 +617,8 @@ Packet generic_plog1p(const Packet& x)
|
|
|
404
617
|
/** \internal \returns exp(x)-1 computed using W. Kahan's formula.
|
|
405
618
|
See: http://www.plunk.org/~hatch/rightway.php
|
|
406
619
|
*/
|
|
407
|
-
template<typename Packet>
|
|
408
|
-
Packet generic_expm1(const Packet& x)
|
|
409
|
-
{
|
|
620
|
+
template <typename Packet>
|
|
621
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_expm1(const Packet& x) {
|
|
410
622
|
typedef typename unpacket_traits<Packet>::type ScalarType;
|
|
411
623
|
const Packet one = pset1<Packet>(ScalarType(1));
|
|
412
624
|
const Packet neg_one = pset1<Packet>(ScalarType(-1));
|
|
@@ -422,37 +634,32 @@ Packet generic_expm1(const Packet& x)
|
|
|
422
634
|
Packet pos_inf_mask = pcmp_eq(logu, u);
|
|
423
635
|
Packet expm1 = pmul(u_minus_one, pdiv(x, logu));
|
|
424
636
|
expm1 = pselect(pos_inf_mask, u, expm1);
|
|
425
|
-
return pselect(one_mask,
|
|
426
|
-
x,
|
|
427
|
-
pselect(neg_one_mask,
|
|
428
|
-
neg_one,
|
|
429
|
-
expm1));
|
|
637
|
+
return pselect(one_mask, x, pselect(neg_one_mask, neg_one, expm1));
|
|
430
638
|
}
|
|
431
639
|
|
|
432
|
-
|
|
433
640
|
// Exponential function. Works by writing "x = m*log(2) + r" where
|
|
434
641
|
// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
|
|
435
642
|
// "exp(x) = 2^m*exp(r)" where r is in the range [-ln(2)/2, ln(2)/2].
|
|
643
|
+
// exp(r) is computed using a 6th order minimax polynomial approximation.
|
|
436
644
|
template <typename Packet>
|
|
437
|
-
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
|
438
|
-
|
|
439
|
-
Packet
|
|
440
|
-
|
|
441
|
-
const Packet
|
|
442
|
-
const Packet
|
|
443
|
-
const Packet
|
|
444
|
-
const Packet cst_exp_lo = pset1<Packet>(-88.723f);
|
|
645
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Packet _x) {
|
|
646
|
+
const Packet cst_zero = pset1<Packet>(0.0f);
|
|
647
|
+
const Packet cst_one = pset1<Packet>(1.0f);
|
|
648
|
+
const Packet cst_half = pset1<Packet>(0.5f);
|
|
649
|
+
const Packet cst_exp_hi = pset1<Packet>(88.723f);
|
|
650
|
+
const Packet cst_exp_lo = pset1<Packet>(-104.f);
|
|
651
|
+
const Packet cst_pldexp_threshold = pset1<Packet>(87.0);
|
|
445
652
|
|
|
446
653
|
const Packet cst_cephes_LOG2EF = pset1<Packet>(1.44269504088896341f);
|
|
447
|
-
const Packet
|
|
448
|
-
const Packet
|
|
449
|
-
const Packet
|
|
450
|
-
const Packet
|
|
451
|
-
const Packet
|
|
452
|
-
const Packet cst_cephes_exp_p5 = pset1<Packet>(5.0000001201E-1f);
|
|
654
|
+
const Packet cst_p2 = pset1<Packet>(0.49999988079071044921875f);
|
|
655
|
+
const Packet cst_p3 = pset1<Packet>(0.16666518151760101318359375f);
|
|
656
|
+
const Packet cst_p4 = pset1<Packet>(4.166965186595916748046875e-2f);
|
|
657
|
+
const Packet cst_p5 = pset1<Packet>(8.36894474923610687255859375e-3f);
|
|
658
|
+
const Packet cst_p6 = pset1<Packet>(1.37449637986719608306884765625e-3f);
|
|
453
659
|
|
|
454
660
|
// Clamp x.
|
|
455
|
-
Packet
|
|
661
|
+
Packet zero_mask = pcmp_lt(_x, cst_exp_lo);
|
|
662
|
+
Packet x = pmin(_x, cst_exp_hi);
|
|
456
663
|
|
|
457
664
|
// Express exp(x) as exp(m*ln(2) + r), start by extracting
|
|
458
665
|
// m = floor(x/ln(2) + 0.5).
|
|
@@ -466,38 +673,37 @@ Packet pexp_float(const Packet _x)
|
|
|
466
673
|
Packet r = pmadd(m, cst_cephes_exp_C1, x);
|
|
467
674
|
r = pmadd(m, cst_cephes_exp_C2, r);
|
|
468
675
|
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
Packet
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
y
|
|
478
|
-
y1 = pmadd(y1, r, cst_cephes_exp_p5);
|
|
479
|
-
y = pmadd(y, r3, y1);
|
|
480
|
-
y = pmadd(y, r2, y2);
|
|
676
|
+
// Evaluate the 6th order polynomial approximation to exp(r)
|
|
677
|
+
// with r in the interval [-ln(2)/2;ln(2)/2].
|
|
678
|
+
const Packet r2 = pmul(r, r);
|
|
679
|
+
Packet p_even = pmadd(r2, cst_p6, cst_p4);
|
|
680
|
+
const Packet p_odd = pmadd(r2, cst_p5, cst_p3);
|
|
681
|
+
p_even = pmadd(r2, p_even, cst_p2);
|
|
682
|
+
const Packet p_low = padd(r, cst_one);
|
|
683
|
+
Packet y = pmadd(r, p_odd, p_even);
|
|
684
|
+
y = pmadd(r2, y, p_low);
|
|
481
685
|
|
|
482
686
|
// Return 2^m * exp(r).
|
|
483
|
-
|
|
484
|
-
|
|
687
|
+
const Packet fast_pldexp_unsafe = pcmp_lt(cst_pldexp_threshold, pabs(x));
|
|
688
|
+
if (!predux_any(fast_pldexp_unsafe)) {
|
|
689
|
+
// For |x| <= 87, we know the result is not zero or inf, and we can safely use
|
|
690
|
+
// the fast version of pldexp.
|
|
691
|
+
return pmax(pldexp_fast(y, m), _x);
|
|
692
|
+
}
|
|
693
|
+
return pselect(zero_mask, cst_zero, pmax(pldexp(y, m), _x));
|
|
485
694
|
}
|
|
486
695
|
|
|
487
696
|
template <typename Packet>
|
|
488
|
-
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
|
489
|
-
EIGEN_UNUSED
|
|
490
|
-
Packet pexp_double(const Packet _x)
|
|
491
|
-
{
|
|
697
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Packet _x) {
|
|
492
698
|
Packet x = _x;
|
|
493
|
-
|
|
699
|
+
const Packet cst_zero = pset1<Packet>(0.0);
|
|
494
700
|
const Packet cst_1 = pset1<Packet>(1.0);
|
|
495
701
|
const Packet cst_2 = pset1<Packet>(2.0);
|
|
496
702
|
const Packet cst_half = pset1<Packet>(0.5);
|
|
497
703
|
|
|
498
704
|
const Packet cst_exp_hi = pset1<Packet>(709.784);
|
|
499
|
-
const Packet cst_exp_lo = pset1<Packet>(-
|
|
500
|
-
|
|
705
|
+
const Packet cst_exp_lo = pset1<Packet>(-745.519);
|
|
706
|
+
const Packet cst_pldexp_threshold = pset1<Packet>(708.0);
|
|
501
707
|
const Packet cst_cephes_LOG2EF = pset1<Packet>(1.4426950408889634073599);
|
|
502
708
|
const Packet cst_cephes_exp_p0 = pset1<Packet>(1.26177193074810590878e-4);
|
|
503
709
|
const Packet cst_cephes_exp_p1 = pset1<Packet>(3.02994407707441961300e-2);
|
|
@@ -512,7 +718,8 @@ Packet pexp_double(const Packet _x)
|
|
|
512
718
|
Packet tmp, fx;
|
|
513
719
|
|
|
514
720
|
// clamp x
|
|
515
|
-
|
|
721
|
+
Packet zero_mask = pcmp_lt(_x, cst_exp_lo);
|
|
722
|
+
x = pmin(x, cst_exp_hi);
|
|
516
723
|
// Express exp(x) as exp(g + n*log(2)).
|
|
517
724
|
fx = pmadd(cst_cephes_LOG2EF, x, cst_half);
|
|
518
725
|
|
|
@@ -549,8 +756,13 @@ Packet pexp_double(const Packet _x)
|
|
|
549
756
|
|
|
550
757
|
// Construct the result 2^n * exp(g) = e * x. The max is used to catch
|
|
551
758
|
// non-finite values in the input.
|
|
552
|
-
|
|
553
|
-
|
|
759
|
+
const Packet fast_pldexp_unsafe = pcmp_lt(cst_pldexp_threshold, pabs(_x));
|
|
760
|
+
if (!predux_any(fast_pldexp_unsafe)) {
|
|
761
|
+
// For |x| <= 708, we know the result is not zero or inf, and we can safely use
|
|
762
|
+
// the fast version of pldexp.
|
|
763
|
+
return pmax(pldexp_fast(x, fx), _x);
|
|
764
|
+
}
|
|
765
|
+
return pselect(zero_mask, cst_zero, pmax(pldexp(x, fx), _x));
|
|
554
766
|
}
|
|
555
767
|
|
|
556
768
|
// The following code is inspired by the following stack-overflow answer:
|
|
@@ -562,29 +774,22 @@ Packet pexp_double(const Packet _x)
|
|
|
562
774
|
// aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi.
|
|
563
775
|
// - Avoid a branch in rounding and extraction of the remaining fractional part.
|
|
564
776
|
// Overall, I measured a speed up higher than x2 on x86-64.
|
|
565
|
-
inline float trig_reduce_huge
|
|
566
|
-
{
|
|
777
|
+
inline float trig_reduce_huge(float xf, Eigen::numext::int32_t* quadrant) {
|
|
567
778
|
using Eigen::numext::int32_t;
|
|
568
|
-
using Eigen::numext::uint32_t;
|
|
569
779
|
using Eigen::numext::int64_t;
|
|
780
|
+
using Eigen::numext::uint32_t;
|
|
570
781
|
using Eigen::numext::uint64_t;
|
|
571
782
|
|
|
572
|
-
const double pio2_62 = 3.4061215800865545e-19;
|
|
573
|
-
const uint64_t zero_dot_five = uint64_t(1) << 61;
|
|
783
|
+
const double pio2_62 = 3.4061215800865545e-19; // pi/2 * 2^-62
|
|
784
|
+
const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point format
|
|
574
785
|
|
|
575
786
|
// 192 bits of 2/pi for Payne-Hanek reduction
|
|
576
787
|
// Bits are introduced by packet of 8 to enable aligned reads.
|
|
577
|
-
static const uint32_t two_over_pi
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770,
|
|
583
|
-
0x4d377036, 0x377036d8, 0x7036d8a5, 0x36d8a566,
|
|
584
|
-
0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410,
|
|
585
|
-
0x10e41000, 0xe4100000
|
|
586
|
-
};
|
|
587
|
-
|
|
788
|
+
static const uint32_t two_over_pi[] = {
|
|
789
|
+
0x00000028, 0x000028be, 0x0028be60, 0x28be60db, 0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a, 0x91054a7f,
|
|
790
|
+
0x054a7f09, 0x4a7f09d5, 0x7f09d5f4, 0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770, 0x4d377036, 0x377036d8,
|
|
791
|
+
0x7036d8a5, 0x36d8a566, 0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410, 0x10e41000, 0xe4100000};
|
|
792
|
+
|
|
588
793
|
uint32_t xi = numext::bit_cast<uint32_t>(xf);
|
|
589
794
|
// Below, -118 = -126 + 8.
|
|
590
795
|
// -126 is to get the exponent,
|
|
@@ -592,12 +797,12 @@ inline float trig_reduce_huge (float xf, int *quadrant)
|
|
|
592
797
|
// This is possible because the fractional part of x has only 24 meaningful bits.
|
|
593
798
|
uint32_t e = (xi >> 23) - 118;
|
|
594
799
|
// Extract the mantissa and shift it to align it wrt the exponent
|
|
595
|
-
xi = ((xi & 0x007fffffu)| 0x00800000u) << (e & 0x7);
|
|
800
|
+
xi = ((xi & 0x007fffffu) | 0x00800000u) << (e & 0x7);
|
|
596
801
|
|
|
597
802
|
uint32_t i = e >> 3;
|
|
598
|
-
uint32_t twoopi_1
|
|
599
|
-
uint32_t twoopi_2
|
|
600
|
-
uint32_t twoopi_3
|
|
803
|
+
uint32_t twoopi_1 = two_over_pi[i - 1];
|
|
804
|
+
uint32_t twoopi_2 = two_over_pi[i + 3];
|
|
805
|
+
uint32_t twoopi_3 = two_over_pi[i + 7];
|
|
601
806
|
|
|
602
807
|
// Compute x * 2/pi in 2.62-bit fixed-point format.
|
|
603
808
|
uint64_t p;
|
|
@@ -612,46 +817,45 @@ inline float trig_reduce_huge (float xf, int *quadrant)
|
|
|
612
817
|
// since we have p=x/(pi/2) with high accuracy, we can more efficiently compute r as:
|
|
613
818
|
// r = (p-q)*pi/2,
|
|
614
819
|
// where the product can be carried out with sufficient accuracy using double precision.
|
|
615
|
-
p -= q<<62;
|
|
820
|
+
p -= q << 62;
|
|
616
821
|
return float(double(int64_t(p)) * pio2_62);
|
|
617
822
|
}
|
|
618
823
|
|
|
619
|
-
template<bool ComputeSine,typename Packet>
|
|
824
|
+
template <bool ComputeSine, typename Packet, bool ComputeBoth = false>
|
|
620
825
|
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
__attribute__((optimize("-fno-unsafe-math-optimizations")))
|
|
826
|
+
#if EIGEN_COMP_GNUC_STRICT
|
|
827
|
+
__attribute__((optimize("-fno-unsafe-math-optimizations")))
|
|
624
828
|
#endif
|
|
625
|
-
Packet
|
|
626
|
-
{
|
|
829
|
+
Packet
|
|
830
|
+
psincos_float(const Packet& _x) {
|
|
627
831
|
typedef typename unpacket_traits<Packet>::integer_packet PacketI;
|
|
628
832
|
|
|
629
|
-
const Packet
|
|
630
|
-
const Packet
|
|
631
|
-
const PacketI csti_1
|
|
632
|
-
const Packet
|
|
833
|
+
const Packet cst_2oPI = pset1<Packet>(0.636619746685028076171875f); // 2/PI
|
|
834
|
+
const Packet cst_rounding_magic = pset1<Packet>(12582912); // 1.5 * 2^23 (= 2^23 + 2^22) for rounding
|
|
835
|
+
const PacketI csti_1 = pset1<PacketI>(1);
|
|
836
|
+
const Packet cst_sign_mask = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x80000000u));
|
|
633
837
|
|
|
634
838
|
Packet x = pabs(_x);
|
|
635
839
|
|
|
636
840
|
// Scale x by 2/Pi to find x's octant.
|
|
637
841
|
Packet y = pmul(x, cst_2oPI);
|
|
638
842
|
|
|
639
|
-
// Rounding trick:
|
|
843
|
+
// Rounding trick to find nearest integer:
|
|
640
844
|
Packet y_round = padd(y, cst_rounding_magic);
|
|
641
845
|
EIGEN_OPTIMIZATION_BARRIER(y_round)
|
|
642
|
-
PacketI y_int = preinterpret<PacketI>(y_round);
|
|
643
|
-
y = psub(y_round, cst_rounding_magic);
|
|
846
|
+
PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
|
|
847
|
+
y = psub(y_round, cst_rounding_magic); // nearest integer to x * (2/pi)
|
|
644
848
|
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
// This version requires true FMA for high accuracy
|
|
849
|
+
// Subtract y * Pi/2 to reduce x to the interval -Pi/4 <= x <= +Pi/4
|
|
850
|
+
// using "Extended precision modular arithmetic"
|
|
851
|
+
#if defined(EIGEN_VECTORIZE_FMA)
|
|
852
|
+
// This version requires true FMA for high accuracy.
|
|
649
853
|
// It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
|
|
650
854
|
const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
|
|
651
855
|
x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
|
|
652
856
|
x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
|
|
653
857
|
x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
|
|
654
|
-
|
|
858
|
+
#else
|
|
655
859
|
// Without true FMA, the previous set of coefficients maintain 1ULP accuracy
|
|
656
860
|
// up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7.
|
|
657
861
|
// We thus use one more iteration to maintain 2ULPs up to reasonably large inputs.
|
|
@@ -659,41 +863,38 @@ Packet psincos_float(const Packet& _x)
|
|
|
659
863
|
// The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively.
|
|
660
864
|
// and 2 ULP up to:
|
|
661
865
|
const float huge_th = ComputeSine ? 25966.f : 18838.f;
|
|
662
|
-
x = pmadd(y, pset1<Packet>(-1.5703125), x);
|
|
866
|
+
x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
|
|
663
867
|
EIGEN_OPTIMIZATION_BARRIER(x)
|
|
664
|
-
x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x);
|
|
868
|
+
x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
|
|
665
869
|
EIGEN_OPTIMIZATION_BARRIER(x)
|
|
666
|
-
x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x);
|
|
667
|
-
x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x);
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
if(predux_any(pcmp_le(pset1<Packet>(huge_th),pabs(_x))))
|
|
684
|
-
{
|
|
870
|
+
x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
|
|
871
|
+
x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee
|
|
872
|
+
|
|
873
|
+
// For the record, the following set of coefficients maintain 2ULP up
|
|
874
|
+
// to a slightly larger range:
|
|
875
|
+
// const float huge_th = ComputeSine ? 51981.f : 39086.125f;
|
|
876
|
+
// but it slightly fails to maintain 1ULP for two values of sin below pi.
|
|
877
|
+
// x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
|
|
878
|
+
// x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
|
|
879
|
+
// x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
|
|
880
|
+
// x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
|
|
881
|
+
|
|
882
|
+
// For the record, with only 3 iterations it is possible to maintain
|
|
883
|
+
// 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
|
|
884
|
+
// The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
|
|
885
|
+
#endif
|
|
886
|
+
|
|
887
|
+
if (predux_any(pcmp_le(pset1<Packet>(huge_th), pabs(_x)))) {
|
|
685
888
|
const int PacketSize = unpacket_traits<Packet>::size;
|
|
686
889
|
EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize];
|
|
687
890
|
EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize];
|
|
688
|
-
EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet))
|
|
891
|
+
EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Eigen::numext::int32_t y_int2[PacketSize];
|
|
689
892
|
pstoreu(vals, pabs(_x));
|
|
690
893
|
pstoreu(x_cpy, x);
|
|
691
894
|
pstoreu(y_int2, y_int);
|
|
692
|
-
for(int k=0; k<PacketSize
|
|
693
|
-
{
|
|
895
|
+
for (int k = 0; k < PacketSize; ++k) {
|
|
694
896
|
float val = vals[k];
|
|
695
|
-
if(val>=huge_th && (numext::isfinite)(val))
|
|
696
|
-
x_cpy[k] = trig_reduce_huge(val,&y_int2[k]);
|
|
897
|
+
if (val >= huge_th && (numext::isfinite)(val)) x_cpy[k] = trig_reduce_huge(val, &y_int2[k]);
|
|
697
898
|
}
|
|
698
899
|
x = ploadu<Packet>(x_cpy);
|
|
699
900
|
y_int = ploadu<PacketI>(y_int2);
|
|
@@ -703,19 +904,19 @@ Packet psincos_float(const Packet& _x)
|
|
|
703
904
|
// sin: sign = second_bit(y_int) xor signbit(_x)
|
|
704
905
|
// cos: sign = second_bit(y_int+1)
|
|
705
906
|
Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)))
|
|
706
|
-
: preinterpret<Packet>(plogical_shift_left<30>(padd(y_int,csti_1)));
|
|
707
|
-
sign_bit = pand(sign_bit, cst_sign_mask);
|
|
907
|
+
: preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
|
|
908
|
+
sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
|
|
708
909
|
|
|
709
910
|
// Get the polynomial selection mask from the second bit of y_int
|
|
710
911
|
// We'll calculate both (sin and cos) polynomials and then select from the two.
|
|
711
912
|
Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
|
|
712
913
|
|
|
713
|
-
Packet x2 = pmul(x,x);
|
|
914
|
+
Packet x2 = pmul(x, x);
|
|
714
915
|
|
|
715
916
|
// Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4)
|
|
716
|
-
Packet y1 =
|
|
717
|
-
y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f
|
|
718
|
-
y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f
|
|
917
|
+
Packet y1 = pset1<Packet>(2.4372266125283204019069671630859375e-05f);
|
|
918
|
+
y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f));
|
|
919
|
+
y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f));
|
|
719
920
|
y1 = pmadd(y1, x2, pset1<Packet>(-0.5f));
|
|
720
921
|
y1 = pmadd(y1, x2, pset1<Packet>(1.f));
|
|
721
922
|
|
|
@@ -727,66 +928,646 @@ Packet psincos_float(const Packet& _x)
|
|
|
727
928
|
// c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1
|
|
728
929
|
// printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1))
|
|
729
930
|
//
|
|
730
|
-
Packet y2 =
|
|
731
|
-
y2 = pmadd(y2, x2, pset1<Packet>(
|
|
931
|
+
Packet y2 = pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
|
|
932
|
+
y2 = pmadd(y2, x2, pset1<Packet>(0.0083326873655616851693794799871284340042620897293090820312500000f));
|
|
732
933
|
y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f));
|
|
733
934
|
y2 = pmul(y2, x2);
|
|
734
935
|
y2 = pmadd(y2, x, x);
|
|
735
936
|
|
|
736
937
|
// Select the correct result from the two polynomials.
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
938
|
+
if (ComputeBoth) {
|
|
939
|
+
Packet peven = peven_mask(x);
|
|
940
|
+
Packet ysin = pselect(poly_mask, y2, y1);
|
|
941
|
+
Packet ycos = pselect(poly_mask, y1, y2);
|
|
942
|
+
Packet sign_bit_sin = pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)));
|
|
943
|
+
Packet sign_bit_cos = preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
|
|
944
|
+
sign_bit_sin = pand(sign_bit_sin, cst_sign_mask); // clear all but left most bit
|
|
945
|
+
sign_bit_cos = pand(sign_bit_cos, cst_sign_mask); // clear all but left most bit
|
|
946
|
+
y = pselect(peven, pxor(ysin, sign_bit_sin), pxor(ycos, sign_bit_cos));
|
|
947
|
+
} else {
|
|
948
|
+
y = ComputeSine ? pselect(poly_mask, y2, y1) : pselect(poly_mask, y1, y2);
|
|
949
|
+
y = pxor(y, sign_bit);
|
|
950
|
+
}
|
|
740
951
|
// Update the sign and filter huge inputs
|
|
741
|
-
return
|
|
952
|
+
return y;
|
|
742
953
|
}
|
|
743
954
|
|
|
744
|
-
template<typename Packet>
|
|
745
|
-
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
|
746
|
-
EIGEN_UNUSED
|
|
747
|
-
Packet psin_float(const Packet& x)
|
|
748
|
-
{
|
|
955
|
+
template <typename Packet>
|
|
956
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_float(const Packet& x) {
|
|
749
957
|
return psincos_float<true>(x);
|
|
750
958
|
}
|
|
751
959
|
|
|
752
|
-
template<typename Packet>
|
|
753
|
-
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
|
754
|
-
EIGEN_UNUSED
|
|
755
|
-
Packet pcos_float(const Packet& x)
|
|
756
|
-
{
|
|
960
|
+
template <typename Packet>
|
|
961
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x) {
|
|
757
962
|
return psincos_float<false>(x);
|
|
758
963
|
}
|
|
759
964
|
|
|
965
|
+
// Trigonometric argument reduction for double for inputs smaller than 15.
|
|
966
|
+
// Reduces trigonometric arguments for double inputs where x < 15. Given an argument x and its corresponding quadrant
|
|
967
|
+
// count n, the function computes and returns the reduced argument t such that x = n * pi/2 + t.
|
|
968
|
+
template <typename Packet>
|
|
969
|
+
Packet trig_reduce_small_double(const Packet& x, const Packet& q) {
|
|
970
|
+
// Pi/2 split into 2 values
|
|
971
|
+
const Packet cst_pio2_a = pset1<Packet>(-1.570796325802803);
|
|
972
|
+
const Packet cst_pio2_b = pset1<Packet>(-9.920935184482005e-10);
|
|
973
|
+
|
|
974
|
+
Packet t;
|
|
975
|
+
t = pmadd(cst_pio2_a, q, x);
|
|
976
|
+
t = pmadd(cst_pio2_b, q, t);
|
|
977
|
+
return t;
|
|
978
|
+
}
|
|
979
|
+
|
|
980
|
+
// Trigonometric argument reduction for double for inputs smaller than 1e14.
|
|
981
|
+
// Reduces trigonometric arguments for double inputs where x < 1e14. Given an argument x and its corresponding quadrant
|
|
982
|
+
// count n, the function computes and returns the reduced argument t such that x = n * pi/2 + t.
|
|
983
|
+
template <typename Packet>
|
|
984
|
+
Packet trig_reduce_medium_double(const Packet& x, const Packet& q_high, const Packet& q_low) {
|
|
985
|
+
// Pi/2 split into 4 values
|
|
986
|
+
const Packet cst_pio2_a = pset1<Packet>(-1.570796325802803);
|
|
987
|
+
const Packet cst_pio2_b = pset1<Packet>(-9.920935184482005e-10);
|
|
988
|
+
const Packet cst_pio2_c = pset1<Packet>(-6.123234014771656e-17);
|
|
989
|
+
const Packet cst_pio2_d = pset1<Packet>(1.903488962019325e-25);
|
|
990
|
+
|
|
991
|
+
Packet t;
|
|
992
|
+
t = pmadd(cst_pio2_a, q_high, x);
|
|
993
|
+
t = pmadd(cst_pio2_a, q_low, t);
|
|
994
|
+
t = pmadd(cst_pio2_b, q_high, t);
|
|
995
|
+
t = pmadd(cst_pio2_b, q_low, t);
|
|
996
|
+
t = pmadd(cst_pio2_c, q_high, t);
|
|
997
|
+
t = pmadd(cst_pio2_c, q_low, t);
|
|
998
|
+
t = pmadd(cst_pio2_d, padd(q_low, q_high), t);
|
|
999
|
+
return t;
|
|
1000
|
+
}
|
|
760
1001
|
|
|
761
|
-
template<typename Packet>
|
|
1002
|
+
template <bool ComputeSine, typename Packet, bool ComputeBoth = false>
|
|
762
1003
|
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
1004
|
+
#if EIGEN_COMP_GNUC_STRICT
|
|
1005
|
+
__attribute__((optimize("-fno-unsafe-math-optimizations")))
|
|
1006
|
+
#endif
|
|
1007
|
+
Packet
|
|
1008
|
+
psincos_double(const Packet& x) {
|
|
1009
|
+
typedef typename unpacket_traits<Packet>::integer_packet PacketI;
|
|
1010
|
+
typedef typename unpacket_traits<PacketI>::type ScalarI;
|
|
768
1011
|
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
//
|
|
772
|
-
|
|
773
|
-
//
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
//
|
|
777
|
-
//
|
|
778
|
-
|
|
779
|
-
//
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
//
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
//
|
|
789
|
-
|
|
1012
|
+
const Packet cst_sign_mask = pset1frombits<Packet>(static_cast<Eigen::numext::uint64_t>(0x8000000000000000u));
|
|
1013
|
+
|
|
1014
|
+
// If the argument is smaller than this value, use a simpler argument reduction
|
|
1015
|
+
const double small_th = 15;
|
|
1016
|
+
// If the argument is bigger than this value, use the non-vectorized std version
|
|
1017
|
+
const double huge_th = 1e14;
|
|
1018
|
+
|
|
1019
|
+
const Packet cst_2oPI = pset1<Packet>(0.63661977236758134307553505349006); // 2/PI
|
|
1020
|
+
// Integer Packet constants
|
|
1021
|
+
const PacketI cst_one = pset1<PacketI>(ScalarI(1));
|
|
1022
|
+
// Constant for splitting
|
|
1023
|
+
const Packet cst_split = pset1<Packet>(1 << 24);
|
|
1024
|
+
|
|
1025
|
+
Packet x_abs = pabs(x);
|
|
1026
|
+
|
|
1027
|
+
// Scale x by 2/Pi
|
|
1028
|
+
PacketI q_int;
|
|
1029
|
+
Packet s;
|
|
1030
|
+
|
|
1031
|
+
// TODO Implement huge angle argument reduction
|
|
1032
|
+
if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(small_th), x_abs)))) {
|
|
1033
|
+
Packet q_high = pmul(pfloor(pmul(x_abs, pdiv(cst_2oPI, cst_split))), cst_split);
|
|
1034
|
+
Packet q_low_noround = psub(pmul(x_abs, cst_2oPI), q_high);
|
|
1035
|
+
q_int = pcast<Packet, PacketI>(padd(q_low_noround, pset1<Packet>(0.5)));
|
|
1036
|
+
Packet q_low = pcast<PacketI, Packet>(q_int);
|
|
1037
|
+
s = trig_reduce_medium_double(x_abs, q_high, q_low);
|
|
1038
|
+
} else {
|
|
1039
|
+
Packet qval_noround = pmul(x_abs, cst_2oPI);
|
|
1040
|
+
q_int = pcast<Packet, PacketI>(padd(qval_noround, pset1<Packet>(0.5)));
|
|
1041
|
+
Packet q = pcast<PacketI, Packet>(q_int);
|
|
1042
|
+
s = trig_reduce_small_double(x_abs, q);
|
|
1043
|
+
}
|
|
1044
|
+
|
|
1045
|
+
// All the upcoming approximating polynomials have even exponents
|
|
1046
|
+
Packet ss = pmul(s, s);
|
|
1047
|
+
|
|
1048
|
+
// Padé approximant of cos(x)
|
|
1049
|
+
// Assuring < 1 ULP error on the interval [-pi/4, pi/4]
|
|
1050
|
+
// cos(x) ~= (80737373*x^8 - 13853547000*x^6 + 727718024880*x^4 - 11275015752000*x^2 + 23594700729600)/(147173*x^8 +
|
|
1051
|
+
// 39328920*x^6 + 5772800880*x^4 + 522334612800*x^2 + 23594700729600)
|
|
1052
|
+
// MATLAB code to compute those coefficients:
|
|
1053
|
+
// syms x;
|
|
1054
|
+
// cosf = @(x) cos(x);
|
|
1055
|
+
// pade_cosf = pade(cosf(x), x, 0, 'Order', 8)
|
|
1056
|
+
Packet sc1_num = pmadd(ss, pset1<Packet>(80737373), pset1<Packet>(-13853547000));
|
|
1057
|
+
Packet sc2_num = pmadd(sc1_num, ss, pset1<Packet>(727718024880));
|
|
1058
|
+
Packet sc3_num = pmadd(sc2_num, ss, pset1<Packet>(-11275015752000));
|
|
1059
|
+
Packet sc4_num = pmadd(sc3_num, ss, pset1<Packet>(23594700729600));
|
|
1060
|
+
Packet sc1_denum = pmadd(ss, pset1<Packet>(147173), pset1<Packet>(39328920));
|
|
1061
|
+
Packet sc2_denum = pmadd(sc1_denum, ss, pset1<Packet>(5772800880));
|
|
1062
|
+
Packet sc3_denum = pmadd(sc2_denum, ss, pset1<Packet>(522334612800));
|
|
1063
|
+
Packet sc4_denum = pmadd(sc3_denum, ss, pset1<Packet>(23594700729600));
|
|
1064
|
+
Packet scos = pdiv(sc4_num, sc4_denum);
|
|
1065
|
+
|
|
1066
|
+
// Padé approximant of sin(x)
|
|
1067
|
+
// Assuring < 1 ULP error on the interval [-pi/4, pi/4]
|
|
1068
|
+
// sin(x) ~= (x*(4585922449*x^8 - 1066023933480*x^6 + 83284044283440*x^4 - 2303682236856000*x^2 +
|
|
1069
|
+
// 15605159573203200))/(45*(1029037*x^8 + 345207016*x^6 + 61570292784*x^4 + 6603948711360*x^2 + 346781323848960))
|
|
1070
|
+
// MATLAB code to compute those coefficients:
|
|
1071
|
+
// syms x;
|
|
1072
|
+
// sinf = @(x) sin(x);
|
|
1073
|
+
// pade_sinf = pade(sinf(x), x, 0, 'Order', 8, 'OrderMode', 'relative')
|
|
1074
|
+
Packet ss1_num = pmadd(ss, pset1<Packet>(4585922449), pset1<Packet>(-1066023933480));
|
|
1075
|
+
Packet ss2_num = pmadd(ss1_num, ss, pset1<Packet>(83284044283440));
|
|
1076
|
+
Packet ss3_num = pmadd(ss2_num, ss, pset1<Packet>(-2303682236856000));
|
|
1077
|
+
Packet ss4_num = pmadd(ss3_num, ss, pset1<Packet>(15605159573203200));
|
|
1078
|
+
Packet ss1_denum = pmadd(ss, pset1<Packet>(1029037), pset1<Packet>(345207016));
|
|
1079
|
+
Packet ss2_denum = pmadd(ss1_denum, ss, pset1<Packet>(61570292784));
|
|
1080
|
+
Packet ss3_denum = pmadd(ss2_denum, ss, pset1<Packet>(6603948711360));
|
|
1081
|
+
Packet ss4_denum = pmadd(ss3_denum, ss, pset1<Packet>(346781323848960));
|
|
1082
|
+
Packet ssin = pdiv(pmul(s, ss4_num), pmul(pset1<Packet>(45), ss4_denum));
|
|
1083
|
+
|
|
1084
|
+
Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(q_int, cst_one), pzero(q_int)));
|
|
1085
|
+
|
|
1086
|
+
Packet sign_sin = pxor(x, preinterpret<Packet>(plogical_shift_left<62>(q_int)));
|
|
1087
|
+
Packet sign_cos = preinterpret<Packet>(plogical_shift_left<62>(padd(q_int, cst_one)));
|
|
1088
|
+
Packet sign_bit, sFinalRes;
|
|
1089
|
+
if (ComputeBoth) {
|
|
1090
|
+
Packet peven = peven_mask(x);
|
|
1091
|
+
sign_bit = pselect((s), sign_sin, sign_cos);
|
|
1092
|
+
sFinalRes = pselect(pxor(peven, poly_mask), ssin, scos);
|
|
1093
|
+
} else {
|
|
1094
|
+
sign_bit = ComputeSine ? sign_sin : sign_cos;
|
|
1095
|
+
sFinalRes = ComputeSine ? pselect(poly_mask, ssin, scos) : pselect(poly_mask, scos, ssin);
|
|
1096
|
+
}
|
|
1097
|
+
sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
|
|
1098
|
+
sFinalRes = pxor(sFinalRes, sign_bit);
|
|
1099
|
+
|
|
1100
|
+
// If the input values are larger than what the argument reduction can currently address, compute them
|
|
1101
|
+
// using std::sin and std::cos
|
|
1102
|
+
// TODO Remove it when huge angle argument reduction is implemented
|
|
1103
|
+
if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(huge_th), x_abs)))) {
|
|
1104
|
+
const int PacketSize = unpacket_traits<Packet>::size;
|
|
1105
|
+
EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) double sincos_vals[PacketSize];
|
|
1106
|
+
EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) double x_cpy[PacketSize];
|
|
1107
|
+
pstoreu(x_cpy, x);
|
|
1108
|
+
pstoreu(sincos_vals, sFinalRes);
|
|
1109
|
+
for (int k = 0; k < PacketSize; ++k) {
|
|
1110
|
+
double val = x_cpy[k];
|
|
1111
|
+
if (std::abs(val) > huge_th && (numext::isfinite)(val)) {
|
|
1112
|
+
if (ComputeBoth)
|
|
1113
|
+
sincos_vals[k] = k % 2 == 0 ? std::sin(val) : std::cos(val);
|
|
1114
|
+
else
|
|
1115
|
+
sincos_vals[k] = ComputeSine ? std::sin(val) : std::cos(val);
|
|
1116
|
+
}
|
|
1117
|
+
}
|
|
1118
|
+
sFinalRes = ploadu<Packet>(sincos_vals);
|
|
1119
|
+
}
|
|
1120
|
+
return sFinalRes;
|
|
1121
|
+
}
|
|
1122
|
+
|
|
1123
|
+
template <typename Packet>
|
|
1124
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Packet& x) {
|
|
1125
|
+
return psincos_double<true>(x);
|
|
1126
|
+
}
|
|
1127
|
+
|
|
1128
|
+
template <typename Packet>
|
|
1129
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_double(const Packet& x) {
|
|
1130
|
+
return psincos_double<false>(x);
|
|
1131
|
+
}
|
|
1132
|
+
|
|
1133
|
+
// Generic implementation of acos(x).
|
|
1134
|
+
template <typename Packet>
|
|
1135
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos_float(const Packet& x_in) {
|
|
1136
|
+
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
1137
|
+
static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
|
|
1138
|
+
|
|
1139
|
+
const Packet cst_one = pset1<Packet>(Scalar(1));
|
|
1140
|
+
const Packet cst_pi = pset1<Packet>(Scalar(EIGEN_PI));
|
|
1141
|
+
const Packet p6 = pset1<Packet>(Scalar(2.36423197202384471893310546875e-3));
|
|
1142
|
+
const Packet p5 = pset1<Packet>(Scalar(-1.1368644423782825469970703125e-2));
|
|
1143
|
+
const Packet p4 = pset1<Packet>(Scalar(2.717843465507030487060546875e-2));
|
|
1144
|
+
const Packet p3 = pset1<Packet>(Scalar(-4.8969544470310211181640625e-2));
|
|
1145
|
+
const Packet p2 = pset1<Packet>(Scalar(8.8804088532924652099609375e-2));
|
|
1146
|
+
const Packet p1 = pset1<Packet>(Scalar(-0.214591205120086669921875));
|
|
1147
|
+
const Packet p0 = pset1<Packet>(Scalar(1.57079637050628662109375));
|
|
1148
|
+
|
|
1149
|
+
// For x in [0:1], we approximate acos(x)/sqrt(1-x), which is a smooth
|
|
1150
|
+
// function, by a 6'th order polynomial.
|
|
1151
|
+
// For x in [-1:0) we use that acos(-x) = pi - acos(x).
|
|
1152
|
+
const Packet neg_mask = psignbit(x_in);
|
|
1153
|
+
const Packet abs_x = pabs(x_in);
|
|
1154
|
+
|
|
1155
|
+
// Evaluate the polynomial using Horner's rule:
|
|
1156
|
+
// P(x) = p0 + x * (p1 + x * (p2 + ... (p5 + x * p6)) ... ) .
|
|
1157
|
+
// We evaluate even and odd terms independently to increase
|
|
1158
|
+
// instruction level parallelism.
|
|
1159
|
+
Packet x2 = pmul(x_in, x_in);
|
|
1160
|
+
Packet p_even = pmadd(p6, x2, p4);
|
|
1161
|
+
Packet p_odd = pmadd(p5, x2, p3);
|
|
1162
|
+
p_even = pmadd(p_even, x2, p2);
|
|
1163
|
+
p_odd = pmadd(p_odd, x2, p1);
|
|
1164
|
+
p_even = pmadd(p_even, x2, p0);
|
|
1165
|
+
Packet p = pmadd(p_odd, abs_x, p_even);
|
|
1166
|
+
|
|
1167
|
+
// The polynomial approximates acos(x)/sqrt(1-x), so
|
|
1168
|
+
// multiply by sqrt(1-x) to get acos(x).
|
|
1169
|
+
// Conveniently returns NaN for arguments outside [-1:1].
|
|
1170
|
+
Packet denom = psqrt(psub(cst_one, abs_x));
|
|
1171
|
+
Packet result = pmul(denom, p);
|
|
1172
|
+
// Undo mapping for negative arguments.
|
|
1173
|
+
return pselect(neg_mask, psub(cst_pi, result), result);
|
|
1174
|
+
}
|
|
1175
|
+
|
|
1176
|
+
// Generic implementation of asin(x).
|
|
1177
|
+
template <typename Packet>
|
|
1178
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Packet& x_in) {
|
|
1179
|
+
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
1180
|
+
static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
|
|
1181
|
+
|
|
1182
|
+
constexpr float kPiOverTwo = static_cast<float>(EIGEN_PI / 2);
|
|
1183
|
+
|
|
1184
|
+
const Packet cst_half = pset1<Packet>(0.5f);
|
|
1185
|
+
const Packet cst_one = pset1<Packet>(1.0f);
|
|
1186
|
+
const Packet cst_two = pset1<Packet>(2.0f);
|
|
1187
|
+
const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
|
|
1188
|
+
|
|
1189
|
+
const Packet abs_x = pabs(x_in);
|
|
1190
|
+
const Packet sign_mask = pandnot(x_in, abs_x);
|
|
1191
|
+
const Packet invalid_mask = pcmp_lt(cst_one, abs_x);
|
|
1192
|
+
|
|
1193
|
+
// For arguments |x| > 0.5, we map x back to [0:0.5] using
|
|
1194
|
+
// the transformation x_large = sqrt(0.5*(1-x)), and use the
|
|
1195
|
+
// identity
|
|
1196
|
+
// asin(x) = pi/2 - 2 * asin( sqrt( 0.5 * (1 - x)))
|
|
1197
|
+
|
|
1198
|
+
const Packet x_large = psqrt(pnmadd(cst_half, abs_x, cst_half));
|
|
1199
|
+
const Packet large_mask = pcmp_lt(cst_half, abs_x);
|
|
1200
|
+
const Packet x = pselect(large_mask, x_large, abs_x);
|
|
1201
|
+
const Packet x2 = pmul(x, x);
|
|
1202
|
+
|
|
1203
|
+
// For |x| < 0.5 approximate asin(x)/x by an 8th order polynomial with
|
|
1204
|
+
// even terms only.
|
|
1205
|
+
constexpr float alpha[] = {5.08838854730129241943359375e-2f, 3.95139865577220916748046875e-2f,
|
|
1206
|
+
7.550220191478729248046875e-2f, 0.16664917767047882080078125f, 1.00000011920928955078125f};
|
|
1207
|
+
Packet p = ppolevl<Packet, 4>::run(x2, alpha);
|
|
1208
|
+
p = pmul(p, x);
|
|
1209
|
+
|
|
1210
|
+
const Packet p_large = pnmadd(cst_two, p, cst_pi_over_two);
|
|
1211
|
+
p = pselect(large_mask, p_large, p);
|
|
1212
|
+
// Flip the sign for negative arguments.
|
|
1213
|
+
p = pxor(p, sign_mask);
|
|
1214
|
+
// Return NaN for arguments outside [-1:1].
|
|
1215
|
+
return por(invalid_mask, p);
|
|
1216
|
+
}
|
|
1217
|
+
|
|
1218
|
+
template <typename Scalar>
|
|
1219
|
+
struct patan_reduced {
|
|
1220
|
+
template <typename Packet>
|
|
1221
|
+
static EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet run(const Packet& x);
|
|
1222
|
+
};
|
|
1223
|
+
|
|
1224
|
+
template <>
|
|
1225
|
+
template <typename Packet>
|
|
1226
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced<double>::run(const Packet& x) {
|
|
1227
|
+
constexpr double alpha[] = {2.6667153866462208e-05, 3.0917513112462781e-03, 5.2574296781008604e-02,
|
|
1228
|
+
3.0409318473444424e-01, 7.5365702534987022e-01, 8.2704055405494614e-01,
|
|
1229
|
+
3.3004361289279920e-01};
|
|
1230
|
+
|
|
1231
|
+
constexpr double beta[] = {
|
|
1232
|
+
2.7311202462436667e-04, 1.0899150928962708e-02, 1.1548932646420353e-01, 4.9716458728465573e-01, 1.0,
|
|
1233
|
+
9.3705509168587852e-01, 3.3004361289279920e-01};
|
|
1234
|
+
|
|
1235
|
+
Packet x2 = pmul(x, x);
|
|
1236
|
+
Packet p = ppolevl<Packet, 6>::run(x2, alpha);
|
|
1237
|
+
Packet q = ppolevl<Packet, 6>::run(x2, beta);
|
|
1238
|
+
return pmul(x, pdiv(p, q));
|
|
1239
|
+
}
|
|
1240
|
+
|
|
1241
|
+
// Computes elementwise atan(x) for x in [-1:1] with 2 ulp accuracy.
|
|
1242
|
+
template <>
|
|
1243
|
+
template <typename Packet>
|
|
1244
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced<float>::run(const Packet& x) {
|
|
1245
|
+
constexpr float alpha[] = {1.12026982009410858154296875e-01f, 7.296695709228515625e-01f, 8.109951019287109375e-01f};
|
|
1246
|
+
|
|
1247
|
+
constexpr float beta[] = {1.00917108356952667236328125e-02f, 2.8318560123443603515625e-01f, 1.0f,
|
|
1248
|
+
8.109951019287109375e-01f};
|
|
1249
|
+
|
|
1250
|
+
Packet x2 = pmul(x, x);
|
|
1251
|
+
Packet p = ppolevl<Packet, 2>::run(x2, alpha);
|
|
1252
|
+
Packet q = ppolevl<Packet, 3>::run(x2, beta);
|
|
1253
|
+
return pmul(x, pdiv(p, q));
|
|
1254
|
+
}
|
|
1255
|
+
|
|
1256
|
+
template <typename Packet>
|
|
1257
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_atan(const Packet& x_in) {
|
|
1258
|
+
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
1259
|
+
|
|
1260
|
+
constexpr Scalar kPiOverTwo = static_cast<Scalar>(EIGEN_PI / 2);
|
|
1261
|
+
|
|
1262
|
+
const Packet cst_signmask = pset1<Packet>(Scalar(-0.0));
|
|
1263
|
+
const Packet cst_one = pset1<Packet>(Scalar(1));
|
|
1264
|
+
const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
|
|
1265
|
+
|
|
1266
|
+
// "Large": For |x| > 1, use atan(1/x) = sign(x)*pi/2 - atan(x).
|
|
1267
|
+
// "Small": For |x| <= 1, approximate atan(x) directly by a polynomial
|
|
1268
|
+
// calculated using Rminimax.
|
|
1269
|
+
|
|
1270
|
+
const Packet abs_x = pabs(x_in);
|
|
1271
|
+
const Packet x_signmask = pand(x_in, cst_signmask);
|
|
1272
|
+
const Packet large_mask = pcmp_lt(cst_one, abs_x);
|
|
1273
|
+
const Packet x = pselect(large_mask, preciprocal(abs_x), abs_x);
|
|
1274
|
+
const Packet p = patan_reduced<Scalar>::run(x);
|
|
1275
|
+
// Apply transformations according to the range reduction masks.
|
|
1276
|
+
Packet result = pselect(large_mask, psub(cst_pi_over_two, p), p);
|
|
1277
|
+
// Return correct sign
|
|
1278
|
+
return pxor(result, x_signmask);
|
|
1279
|
+
}
|
|
1280
|
+
|
|
1281
|
+
/** \internal \returns the hyperbolic tan of \a a (coeff-wise)
|
|
1282
|
+
Doesn't do anything fancy, just a 9/8-degree rational interpolant which
|
|
1283
|
+
is accurate up to a couple of ulps in the (approximate) range [-8, 8],
|
|
1284
|
+
outside of which tanh(x) = +/-1 in single precision. The input is clamped
|
|
1285
|
+
to the range [-c, c]. The value c is chosen as the smallest value where
|
|
1286
|
+
the approximation evaluates to exactly 1.
|
|
1287
|
+
|
|
1288
|
+
This implementation works on both scalars and packets.
|
|
1289
|
+
*/
|
|
1290
|
+
template <typename T>
|
|
1291
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_float(const T& a_x) {
|
|
1292
|
+
// Clamp the inputs to the range [-c, c] and set everything
|
|
1293
|
+
// outside that range to +/-1.0. The value c is chosen as the smallest
|
|
1294
|
+
// floating point argument such that the approximation is exactly 1.
|
|
1295
|
+
// This saves clamping the value at the end.
|
|
1296
|
+
#ifdef EIGEN_VECTORIZE_FMA
|
|
1297
|
+
const T plus_clamp = pset1<T>(8.01773357391357422f);
|
|
1298
|
+
const T minus_clamp = pset1<T>(-8.01773357391357422f);
|
|
1299
|
+
#else
|
|
1300
|
+
const T plus_clamp = pset1<T>(7.90738964080810547f);
|
|
1301
|
+
const T minus_clamp = pset1<T>(-7.90738964080810547f);
|
|
1302
|
+
#endif
|
|
1303
|
+
const T x = pmax(pmin(a_x, plus_clamp), minus_clamp);
|
|
1304
|
+
|
|
1305
|
+
// The following rational approximation was generated by rminimax
|
|
1306
|
+
// (https://gitlab.inria.fr/sfilip/rminimax) using the following
|
|
1307
|
+
// command:
|
|
1308
|
+
// $ ratapprox --function="tanh(x)" --dom='[-8.67,8.67]' --num="odd"
|
|
1309
|
+
// --den="even" --type="[9,8]" --numF="[SG]" --denF="[SG]" --log
|
|
1310
|
+
// --output=tanhf.sollya --dispCoeff="dec"
|
|
1311
|
+
|
|
1312
|
+
// The monomial coefficients of the numerator polynomial (odd).
|
|
1313
|
+
constexpr float alpha[] = {1.394553628e-8f, 2.102733560e-5f, 3.520756727e-3f, 1.340216100e-1f};
|
|
1314
|
+
|
|
1315
|
+
// The monomial coefficients of the denominator polynomial (even).
|
|
1316
|
+
constexpr float beta[] = {8.015776984e-7f, 3.326951409e-4f, 2.597254514e-2f, 4.673548340e-1f, 1.0f};
|
|
1317
|
+
|
|
1318
|
+
// Since the polynomials are odd/even, we need x^2.
|
|
1319
|
+
const T x2 = pmul(x, x);
|
|
1320
|
+
const T x3 = pmul(x2, x);
|
|
1321
|
+
|
|
1322
|
+
T p = ppolevl<T, 3>::run(x2, alpha);
|
|
1323
|
+
T q = ppolevl<T, 4>::run(x2, beta);
|
|
1324
|
+
// Take advantage of the fact that the constant term in p is 1 to compute
|
|
1325
|
+
// x*(x^2*p + 1) = x^3 * p + x.
|
|
1326
|
+
p = pmadd(x3, p, x);
|
|
1327
|
+
|
|
1328
|
+
// Divide the numerator by the denominator.
|
|
1329
|
+
return pdiv(p, q);
|
|
1330
|
+
}
|
|
1331
|
+
|
|
1332
|
+
/** \internal \returns the hyperbolic tan of \a a (coeff-wise)
|
|
1333
|
+
This uses a 19/18-degree rational interpolant which
|
|
1334
|
+
is accurate up to a couple of ulps in the (approximate) range [-18.7, 18.7],
|
|
1335
|
+
outside of which tanh(x) = +/-1 in double precision. The input is clamped
|
|
1336
|
+
to the range [-c, c]. The value c is chosen as the smallest value where
|
|
1337
|
+
the approximation evaluates to exactly 1.
|
|
1338
|
+
|
|
1339
|
+
This implementation works on both scalars and packets.
|
|
1340
|
+
*/
|
|
1341
|
+
template <typename T>
|
|
1342
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_double(const T& a_x) {
|
|
1343
|
+
// Clamp the inputs to the range [-c, c] and set everything
|
|
1344
|
+
// outside that range to +/-1.0. The value c is chosen as the smallest
|
|
1345
|
+
// floating point argument such that the approximation is exactly 1.
|
|
1346
|
+
// This saves clamping the value at the end.
|
|
1347
|
+
#ifdef EIGEN_VECTORIZE_FMA
|
|
1348
|
+
const T plus_clamp = pset1<T>(17.6610191624600077);
|
|
1349
|
+
const T minus_clamp = pset1<T>(-17.6610191624600077);
|
|
1350
|
+
#else
|
|
1351
|
+
const T plus_clamp = pset1<T>(17.714196154005176);
|
|
1352
|
+
const T minus_clamp = pset1<T>(-17.714196154005176);
|
|
1353
|
+
#endif
|
|
1354
|
+
const T x = pmax(pmin(a_x, plus_clamp), minus_clamp);
|
|
1355
|
+
|
|
1356
|
+
// The following rational approximation was generated by rminimax
|
|
1357
|
+
// (https://gitlab.inria.fr/sfilip/rminimax) using the following
|
|
1358
|
+
// command:
|
|
1359
|
+
// $ ./ratapprox --function="tanh(x)" --dom='[-18.72,18.72]'
|
|
1360
|
+
// --num="odd" --den="even" --type="[19,18]" --numF="[D]"
|
|
1361
|
+
// --denF="[D]" --log --output=tanh.sollya --dispCoeff="dec"
|
|
1362
|
+
|
|
1363
|
+
// The monomial coefficients of the numerator polynomial (odd).
|
|
1364
|
+
constexpr double alpha[] = {2.6158007860482230e-23, 7.6534862268749319e-19, 3.1309488231386680e-15,
|
|
1365
|
+
4.2303918148209176e-12, 2.4618379131293676e-09, 6.8644367682497074e-07,
|
|
1366
|
+
9.3839087674268880e-05, 5.9809711724441161e-03, 1.5184719640284322e-01};
|
|
1367
|
+
|
|
1368
|
+
// The monomial coefficients of the denominator polynomial (even).
|
|
1369
|
+
constexpr double beta[] = {6.463747022670968018e-21, 5.782506856739003571e-17,
|
|
1370
|
+
1.293019623712687916e-13, 1.123643448069621992e-10,
|
|
1371
|
+
4.492975677839633985e-08, 8.785185266237658698e-06,
|
|
1372
|
+
8.295161192716231542e-04, 3.437448108450402717e-02,
|
|
1373
|
+
4.851805297361760360e-01, 1.0};
|
|
1374
|
+
|
|
1375
|
+
// Since the polynomials are odd/even, we need x^2.
|
|
1376
|
+
const T x2 = pmul(x, x);
|
|
1377
|
+
const T x3 = pmul(x2, x);
|
|
1378
|
+
|
|
1379
|
+
// Interleave the evaluation of the numerator polynomial p and
|
|
1380
|
+
// denominator polynomial q.
|
|
1381
|
+
T p = ppolevl<T, 8>::run(x2, alpha);
|
|
1382
|
+
T q = ppolevl<T, 9>::run(x2, beta);
|
|
1383
|
+
// Take advantage of the fact that the constant term in p is 1 to compute
|
|
1384
|
+
// x*(x^2*p + 1) = x^3 * p + x.
|
|
1385
|
+
p = pmadd(x3, p, x);
|
|
1386
|
+
|
|
1387
|
+
// Divide the numerator by the denominator.
|
|
1388
|
+
return pdiv(p, q);
|
|
1389
|
+
}
|
|
1390
|
+
|
|
1391
|
+
template <typename Packet>
|
|
1392
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_float(const Packet& x) {
|
|
1393
|
+
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
1394
|
+
static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
|
|
1395
|
+
|
|
1396
|
+
// For |x| in [0:0.5] we use a polynomial approximation of the form
|
|
1397
|
+
// P(x) = x + x^3*(alpha[4] + x^2 * (alpha[3] + x^2 * (... x^2 * alpha[0]) ... )).
|
|
1398
|
+
constexpr float alpha[] = {0.1819281280040740966796875f, 8.2311116158962249755859375e-2f,
|
|
1399
|
+
0.14672131836414337158203125f, 0.1997792422771453857421875f, 0.3333373963832855224609375f};
|
|
1400
|
+
const Packet x2 = pmul(x, x);
|
|
1401
|
+
const Packet x3 = pmul(x, x2);
|
|
1402
|
+
Packet p = ppolevl<Packet, 4>::run(x2, alpha);
|
|
1403
|
+
p = pmadd(x3, p, x);
|
|
1404
|
+
|
|
1405
|
+
// For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x));
|
|
1406
|
+
const Packet half = pset1<Packet>(0.5f);
|
|
1407
|
+
const Packet one = pset1<Packet>(1.0f);
|
|
1408
|
+
Packet r = pdiv(padd(one, x), psub(one, x));
|
|
1409
|
+
r = pmul(half, plog(r));
|
|
1410
|
+
|
|
1411
|
+
const Packet x_gt_half = pcmp_le(half, pabs(x));
|
|
1412
|
+
const Packet x_eq_one = pcmp_eq(one, pabs(x));
|
|
1413
|
+
const Packet x_gt_one = pcmp_lt(one, pabs(x));
|
|
1414
|
+
const Packet sign_mask = pset1<Packet>(-0.0f);
|
|
1415
|
+
const Packet x_sign = pand(sign_mask, x);
|
|
1416
|
+
const Packet inf = pset1<Packet>(std::numeric_limits<float>::infinity());
|
|
1417
|
+
return por(x_gt_one, pselect(x_eq_one, por(x_sign, inf), pselect(x_gt_half, r, p)));
|
|
1418
|
+
}
|
|
1419
|
+
|
|
1420
|
+
template <typename Packet>
|
|
1421
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_double(const Packet& x) {
|
|
1422
|
+
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
1423
|
+
static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
|
|
1424
|
+
// For x in [-0.5:0.5] we use a rational approximation of the form
|
|
1425
|
+
// R(x) = x + x^3*P(x^2)/Q(x^2), where P is of order 4 and Q is of order 5.
|
|
1426
|
+
constexpr double alpha[] = {3.3071338469301391e-03, -4.7129526768798737e-02, 1.8185306179826699e-01,
|
|
1427
|
+
-2.5949536095445679e-01, 1.2306328729812676e-01};
|
|
1428
|
+
|
|
1429
|
+
constexpr double beta[] = {-3.8679974580640881e-03, 7.6391885763341910e-02, -4.2828141436397615e-01,
|
|
1430
|
+
9.8733495886883648e-01, -1.0000000000000000e+00, 3.6918986189438030e-01};
|
|
1431
|
+
|
|
1432
|
+
const Packet x2 = pmul(x, x);
|
|
1433
|
+
const Packet x3 = pmul(x, x2);
|
|
1434
|
+
Packet p = ppolevl<Packet, 4>::run(x2, alpha);
|
|
1435
|
+
Packet q = ppolevl<Packet, 5>::run(x2, beta);
|
|
1436
|
+
Packet y_small = pmadd(x3, pdiv(p, q), x);
|
|
1437
|
+
|
|
1438
|
+
// For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x));
|
|
1439
|
+
const Packet half = pset1<Packet>(0.5);
|
|
1440
|
+
const Packet one = pset1<Packet>(1.0);
|
|
1441
|
+
Packet y_large = pdiv(padd(one, x), psub(one, x));
|
|
1442
|
+
y_large = pmul(half, plog(y_large));
|
|
1443
|
+
|
|
1444
|
+
const Packet x_gt_half = pcmp_le(half, pabs(x));
|
|
1445
|
+
const Packet x_eq_one = pcmp_eq(one, pabs(x));
|
|
1446
|
+
const Packet x_gt_one = pcmp_lt(one, pabs(x));
|
|
1447
|
+
const Packet sign_mask = pset1<Packet>(-0.0);
|
|
1448
|
+
const Packet x_sign = pand(sign_mask, x);
|
|
1449
|
+
const Packet inf = pset1<Packet>(std::numeric_limits<double>::infinity());
|
|
1450
|
+
return por(x_gt_one, pselect(x_eq_one, por(x_sign, inf), pselect(x_gt_half, y_large, y_small)));
|
|
1451
|
+
}
|
|
1452
|
+
|
|
1453
|
+
template <typename Packet>
|
|
1454
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pdiv_complex(const Packet& x, const Packet& y) {
|
|
1455
|
+
typedef typename unpacket_traits<Packet>::as_real RealPacket;
|
|
1456
|
+
// In the following we annotate the code for the case where the inputs
|
|
1457
|
+
// are a pair of length-2 SIMD vectors representing a single pair of complex
|
|
1458
|
+
// numbers x = a + i*b, y = c + i*d.
|
|
1459
|
+
const RealPacket y_abs = pabs(y.v); // |c|, |d|
|
|
1460
|
+
const RealPacket y_abs_flip = pcplxflip(Packet(y_abs)).v; // |d|, |c|
|
|
1461
|
+
const RealPacket y_max = pmax(y_abs, y_abs_flip); // max(|c|, |d|), max(|c|, |d|)
|
|
1462
|
+
const RealPacket y_scaled = pdiv(y.v, y_max); // c / max(|c|, |d|), d / max(|c|, |d|)
|
|
1463
|
+
// Compute scaled denominator.
|
|
1464
|
+
const RealPacket y_scaled_sq = pmul(y_scaled, y_scaled); // c'**2, d'**2
|
|
1465
|
+
const RealPacket denom = padd(y_scaled_sq, pcplxflip(Packet(y_scaled_sq)).v);
|
|
1466
|
+
Packet result_scaled = pmul(x, pconj(Packet(y_scaled))); // a * c' + b * d', -a * d + b * c
|
|
1467
|
+
// Divide elementwise by denom.
|
|
1468
|
+
result_scaled = Packet(pdiv(result_scaled.v, denom));
|
|
1469
|
+
// Rescale result
|
|
1470
|
+
return Packet(pdiv(result_scaled.v, y_max));
|
|
1471
|
+
}
|
|
1472
|
+
|
|
1473
|
+
template <typename Packet>
|
|
1474
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_complex(const Packet& x) {
|
|
1475
|
+
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
1476
|
+
typedef typename Scalar::value_type RealScalar;
|
|
1477
|
+
typedef typename unpacket_traits<Packet>::as_real RealPacket;
|
|
1478
|
+
|
|
1479
|
+
RealPacket real_mask_rp = peven_mask(x.v);
|
|
1480
|
+
Packet real_mask(real_mask_rp);
|
|
1481
|
+
|
|
1482
|
+
// Real part
|
|
1483
|
+
RealPacket x_flip = pcplxflip(x).v; // b, a
|
|
1484
|
+
Packet x_norm = phypot_complex(x); // sqrt(a^2 + b^2), sqrt(a^2 + b^2)
|
|
1485
|
+
RealPacket xlogr = plog(x_norm.v); // log(sqrt(a^2 + b^2)), log(sqrt(a^2 + b^2))
|
|
1486
|
+
|
|
1487
|
+
// Imag part
|
|
1488
|
+
RealPacket ximg = patan2(x.v, x_flip); // atan2(a, b), atan2(b, a)
|
|
1489
|
+
|
|
1490
|
+
const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
|
|
1491
|
+
RealPacket x_abs = pabs(x.v);
|
|
1492
|
+
RealPacket is_x_pos_inf = pcmp_eq(x_abs, cst_pos_inf);
|
|
1493
|
+
RealPacket is_y_pos_inf = pcplxflip(Packet(is_x_pos_inf)).v;
|
|
1494
|
+
RealPacket is_any_inf = por(is_x_pos_inf, is_y_pos_inf);
|
|
1495
|
+
RealPacket xreal = pselect(is_any_inf, cst_pos_inf, xlogr);
|
|
1496
|
+
|
|
1497
|
+
Packet xres = pselect(real_mask, Packet(xreal), Packet(ximg)); // log(sqrt(a^2 + b^2)), atan2(b, a)
|
|
1498
|
+
return xres;
|
|
1499
|
+
}
|
|
1500
|
+
|
|
1501
|
+
template <typename Packet>
|
|
1502
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_complex(const Packet& a) {
|
|
1503
|
+
typedef typename unpacket_traits<Packet>::as_real RealPacket;
|
|
1504
|
+
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
1505
|
+
typedef typename Scalar::value_type RealScalar;
|
|
1506
|
+
const RealPacket even_mask = peven_mask(a.v);
|
|
1507
|
+
const RealPacket odd_mask = pcplxflip(Packet(even_mask)).v;
|
|
1508
|
+
|
|
1509
|
+
// Let a = x + iy.
|
|
1510
|
+
// exp(a) = exp(x) * cis(y), plus some special edge-case handling.
|
|
1511
|
+
|
|
1512
|
+
// exp(x):
|
|
1513
|
+
RealPacket x = pand(a.v, even_mask);
|
|
1514
|
+
x = por(x, pcplxflip(Packet(x)).v);
|
|
1515
|
+
RealPacket expx = pexp(x); // exp(x);
|
|
1516
|
+
|
|
1517
|
+
// cis(y):
|
|
1518
|
+
RealPacket y = pand(odd_mask, a.v);
|
|
1519
|
+
y = por(y, pcplxflip(Packet(y)).v);
|
|
1520
|
+
RealPacket cisy = psincos_float<false, RealPacket, true>(y);
|
|
1521
|
+
cisy = pcplxflip(Packet(cisy)).v; // cos(y) + i * sin(y)
|
|
1522
|
+
|
|
1523
|
+
const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
|
|
1524
|
+
const RealPacket cst_neg_inf = pset1<RealPacket>(-NumTraits<RealScalar>::infinity());
|
|
1525
|
+
|
|
1526
|
+
// If x is -inf, we know that cossin(y) is bounded,
|
|
1527
|
+
// so the result is (0, +/-0), where the sign of the imaginary part comes
|
|
1528
|
+
// from the sign of cossin(y).
|
|
1529
|
+
RealPacket cisy_sign = por(pandnot(cisy, pabs(cisy)), pset1<RealPacket>(RealScalar(1)));
|
|
1530
|
+
cisy = pselect(pcmp_eq(x, cst_neg_inf), cisy_sign, cisy);
|
|
1531
|
+
|
|
1532
|
+
// If x is inf, and cos(y) has unknown sign (y is inf or NaN), the result
|
|
1533
|
+
// is (+/-inf, NaN), where the signs are undetermined (take the sign of y).
|
|
1534
|
+
RealPacket y_sign = por(pandnot(y, pabs(y)), pset1<RealPacket>(RealScalar(1)));
|
|
1535
|
+
cisy = pselect(pand(pcmp_eq(x, cst_pos_inf), pisnan(cisy)), pand(y_sign, even_mask), cisy);
|
|
1536
|
+
Packet result = Packet(pmul(expx, cisy));
|
|
1537
|
+
|
|
1538
|
+
// If y is +/- 0, the input is real, so take the real result for consistency.
|
|
1539
|
+
result = pselect(Packet(pcmp_eq(y, pzero(y))), Packet(por(pand(expx, even_mask), pand(y, odd_mask))), result);
|
|
1540
|
+
|
|
1541
|
+
return result;
|
|
1542
|
+
}
|
|
1543
|
+
|
|
1544
|
+
template <typename Packet>
|
|
1545
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psqrt_complex(const Packet& a) {
|
|
1546
|
+
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
1547
|
+
typedef typename Scalar::value_type RealScalar;
|
|
1548
|
+
typedef typename unpacket_traits<Packet>::as_real RealPacket;
|
|
1549
|
+
|
|
1550
|
+
// Computes the principal sqrt of the complex numbers in the input.
|
|
1551
|
+
//
|
|
1552
|
+
// For example, for packets containing 2 complex numbers stored in interleaved format
|
|
1553
|
+
// a = [a0, a1] = [x0, y0, x1, y1],
|
|
1554
|
+
// where x0 = real(a0), y0 = imag(a0) etc., this function returns
|
|
1555
|
+
// b = [b0, b1] = [u0, v0, u1, v1],
|
|
1556
|
+
// such that b0^2 = a0, b1^2 = a1.
|
|
1557
|
+
//
|
|
1558
|
+
// To derive the formula for the complex square roots, let's consider the equation for
|
|
1559
|
+
// a single complex square root of the number x + i*y. We want to find real numbers
|
|
1560
|
+
// u and v such that
|
|
1561
|
+
// (u + i*v)^2 = x + i*y <=>
|
|
1562
|
+
// u^2 - v^2 + i*2*u*v = x + i*y.
|
|
1563
|
+
// By equating the real and imaginary parts we get:
|
|
1564
|
+
// u^2 - v^2 = x
|
|
1565
|
+
// 2*u*v = y.
|
|
1566
|
+
//
|
|
1567
|
+
// For x >= 0, this has the numerically stable solution
|
|
1568
|
+
// u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
|
|
1569
|
+
// v = 0.5 * (y / u)
|
|
1570
|
+
// and for x < 0,
|
|
790
1571
|
// v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
|
|
791
1572
|
// u = 0.5 * (y / v)
|
|
792
1573
|
//
|
|
@@ -802,14 +1583,14 @@ Packet psqrt_complex(const Packet& a) {
|
|
|
802
1583
|
// l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)),
|
|
803
1584
|
// where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1.
|
|
804
1585
|
|
|
805
|
-
RealPacket a_abs = pabs(a.v);
|
|
806
|
-
RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v;
|
|
1586
|
+
RealPacket a_abs = pabs(a.v); // [|x0|, |y0|, |x1|, |y1|]
|
|
1587
|
+
RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v; // [|y0|, |x0|, |y1|, |x1|]
|
|
807
1588
|
RealPacket a_max = pmax(a_abs, a_abs_flip);
|
|
808
1589
|
RealPacket a_min = pmin(a_abs, a_abs_flip);
|
|
809
1590
|
RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min));
|
|
810
1591
|
RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max));
|
|
811
1592
|
RealPacket r = pdiv(a_min, a_max);
|
|
812
|
-
const RealPacket cst_one
|
|
1593
|
+
const RealPacket cst_one = pset1<RealPacket>(RealScalar(1));
|
|
813
1594
|
RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r)))); // [l0, l0, l1, l1]
|
|
814
1595
|
// Set l to a_max if a_min is zero.
|
|
815
1596
|
l = pselect(a_min_zero_mask, a_max, l);
|
|
@@ -832,8 +1613,7 @@ Packet psqrt_complex(const Packet& a) {
|
|
|
832
1613
|
|
|
833
1614
|
// Step 4. Compute solution for inputs with negative real part:
|
|
834
1615
|
// [|eta0|, sign(y0)*rho0, |eta1|, sign(y1)*rho1]
|
|
835
|
-
const
|
|
836
|
-
const RealPacket cst_imag_sign_mask = pset1<Packet>(Scalar(RealScalar(0.0), neg_zero)).v;
|
|
1616
|
+
const RealPacket cst_imag_sign_mask = pset1<Packet>(Scalar(RealScalar(0.0), RealScalar(-0.0))).v;
|
|
837
1617
|
RealPacket imag_signs = pand(a.v, cst_imag_sign_mask);
|
|
838
1618
|
Packet negative_real_result;
|
|
839
1619
|
// Notice that rho is positive, so taking its absolute value is a noop.
|
|
@@ -866,11 +1646,135 @@ Packet psqrt_complex(const Packet& a) {
|
|
|
866
1646
|
is_imag_inf = por(is_imag_inf, pcplxflip(is_imag_inf));
|
|
867
1647
|
Packet imag_inf_result;
|
|
868
1648
|
imag_inf_result.v = por(pand(cst_pos_inf, real_mask), pandnot(a.v, real_mask));
|
|
1649
|
+
// unless otherwise specified, if either the real or imaginary component is nan, the entire result is nan
|
|
1650
|
+
Packet result_is_nan = pisnan(result);
|
|
1651
|
+
result = por(result_is_nan, result);
|
|
869
1652
|
|
|
870
|
-
return
|
|
871
|
-
pselect(is_real_inf, real_inf_result,result));
|
|
1653
|
+
return pselect(is_imag_inf, imag_inf_result, pselect(is_real_inf, real_inf_result, result));
|
|
872
1654
|
}
|
|
873
1655
|
|
|
1656
|
+
// \internal \returns the norm of a complex number z = x + i*y, defined as sqrt(x^2 + y^2).
|
|
1657
|
+
// Implemented using the hypot(a,b) algorithm from https://doi.org/10.48550/arXiv.1904.09481
|
|
1658
|
+
template <typename Packet>
|
|
1659
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet phypot_complex(const Packet& a) {
|
|
1660
|
+
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
1661
|
+
typedef typename Scalar::value_type RealScalar;
|
|
1662
|
+
typedef typename unpacket_traits<Packet>::as_real RealPacket;
|
|
1663
|
+
|
|
1664
|
+
const RealPacket cst_zero_rp = pset1<RealPacket>(static_cast<RealScalar>(0.0));
|
|
1665
|
+
const RealPacket cst_minus_one_rp = pset1<RealPacket>(static_cast<RealScalar>(-1.0));
|
|
1666
|
+
const RealPacket cst_two_rp = pset1<RealPacket>(static_cast<RealScalar>(2.0));
|
|
1667
|
+
const RealPacket evenmask = peven_mask(a.v);
|
|
1668
|
+
|
|
1669
|
+
RealPacket a_abs = pabs(a.v);
|
|
1670
|
+
RealPacket a_flip = pcplxflip(Packet(a_abs)).v; // |b|, |a|
|
|
1671
|
+
RealPacket a_all = pselect(evenmask, a_abs, a_flip); // |a|, |a|
|
|
1672
|
+
RealPacket b_all = pselect(evenmask, a_flip, a_abs); // |b|, |b|
|
|
1673
|
+
|
|
1674
|
+
RealPacket a2 = pmul(a.v, a.v); // |a^2, b^2|
|
|
1675
|
+
RealPacket a2_flip = pcplxflip(Packet(a2)).v; // |b^2, a^2|
|
|
1676
|
+
RealPacket h = psqrt(padd(a2, a2_flip)); // |sqrt(a^2 + b^2), sqrt(a^2 + b^2)|
|
|
1677
|
+
RealPacket h_sq = pmul(h, h); // |a^2 + b^2, a^2 + b^2|
|
|
1678
|
+
RealPacket a_sq = pselect(evenmask, a2, a2_flip); // |a^2, a^2|
|
|
1679
|
+
RealPacket m_h_sq = pmul(h_sq, cst_minus_one_rp);
|
|
1680
|
+
RealPacket m_a_sq = pmul(a_sq, cst_minus_one_rp);
|
|
1681
|
+
RealPacket x = psub(psub(pmadd(h, h, m_h_sq), pmadd(b_all, b_all, psub(a_sq, h_sq))), pmadd(a_all, a_all, m_a_sq));
|
|
1682
|
+
h = psub(h, pdiv(x, pmul(cst_two_rp, h))); // |h - x/(2*h), h - x/(2*h)|
|
|
1683
|
+
|
|
1684
|
+
// handle zero-case
|
|
1685
|
+
RealPacket iszero = pcmp_eq(por(a_abs, a_flip), cst_zero_rp);
|
|
1686
|
+
|
|
1687
|
+
h = pandnot(h, iszero); // |sqrt(a^2+b^2), sqrt(a^2+b^2)|
|
|
1688
|
+
return Packet(h); // |sqrt(a^2+b^2), sqrt(a^2+b^2)|
|
|
1689
|
+
}
|
|
1690
|
+
|
|
1691
|
+
template <typename Packet>
|
|
1692
|
+
struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
|
|
1693
|
+
!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
|
|
1694
|
+
!NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
|
|
1695
|
+
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
|
|
1696
|
+
using Scalar = typename unpacket_traits<Packet>::type;
|
|
1697
|
+
const Packet cst_one = pset1<Packet>(Scalar(1));
|
|
1698
|
+
const Packet cst_zero = pzero(a);
|
|
1699
|
+
|
|
1700
|
+
const Packet abs_a = pabs(a);
|
|
1701
|
+
const Packet sign_mask = pandnot(a, abs_a);
|
|
1702
|
+
const Packet nonzero_mask = pcmp_lt(cst_zero, abs_a);
|
|
1703
|
+
|
|
1704
|
+
return pselect(nonzero_mask, por(sign_mask, cst_one), abs_a);
|
|
1705
|
+
}
|
|
1706
|
+
};
|
|
1707
|
+
|
|
1708
|
+
template <typename Packet>
|
|
1709
|
+
struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
|
|
1710
|
+
!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
|
|
1711
|
+
NumTraits<typename unpacket_traits<Packet>::type>::IsSigned &&
|
|
1712
|
+
NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
|
|
1713
|
+
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
|
|
1714
|
+
using Scalar = typename unpacket_traits<Packet>::type;
|
|
1715
|
+
const Packet cst_one = pset1<Packet>(Scalar(1));
|
|
1716
|
+
const Packet cst_minus_one = pset1<Packet>(Scalar(-1));
|
|
1717
|
+
const Packet cst_zero = pzero(a);
|
|
1718
|
+
|
|
1719
|
+
const Packet positive_mask = pcmp_lt(cst_zero, a);
|
|
1720
|
+
const Packet positive = pand(positive_mask, cst_one);
|
|
1721
|
+
const Packet negative_mask = pcmp_lt(a, cst_zero);
|
|
1722
|
+
const Packet negative = pand(negative_mask, cst_minus_one);
|
|
1723
|
+
|
|
1724
|
+
return por(positive, negative);
|
|
1725
|
+
}
|
|
1726
|
+
};
|
|
1727
|
+
|
|
1728
|
+
template <typename Packet>
|
|
1729
|
+
struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
|
|
1730
|
+
!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
|
|
1731
|
+
!NumTraits<typename unpacket_traits<Packet>::type>::IsSigned &&
|
|
1732
|
+
NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
|
|
1733
|
+
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
|
|
1734
|
+
using Scalar = typename unpacket_traits<Packet>::type;
|
|
1735
|
+
const Packet cst_one = pset1<Packet>(Scalar(1));
|
|
1736
|
+
const Packet cst_zero = pzero(a);
|
|
1737
|
+
|
|
1738
|
+
const Packet zero_mask = pcmp_eq(cst_zero, a);
|
|
1739
|
+
return pandnot(cst_one, zero_mask);
|
|
1740
|
+
}
|
|
1741
|
+
};
|
|
1742
|
+
|
|
1743
|
+
// \internal \returns the sign of a complex number z, defined as z / abs(z).
|
|
1744
|
+
template <typename Packet>
|
|
1745
|
+
struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
|
|
1746
|
+
NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
|
|
1747
|
+
unpacket_traits<Packet>::vectorizable>> {
|
|
1748
|
+
static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
|
|
1749
|
+
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
1750
|
+
typedef typename Scalar::value_type RealScalar;
|
|
1751
|
+
typedef typename unpacket_traits<Packet>::as_real RealPacket;
|
|
1752
|
+
|
|
1753
|
+
// Step 1. Compute (for each element z = x + i*y in a)
|
|
1754
|
+
// l = abs(z) = sqrt(x^2 + y^2).
|
|
1755
|
+
// To avoid over- and underflow, we use the stable formula for each hypotenuse
|
|
1756
|
+
// l = (zmin == 0 ? zmax : zmax * sqrt(1 + (zmin/zmax)**2)),
|
|
1757
|
+
// where zmax = max(|x|, |y|), zmin = min(|x|, |y|),
|
|
1758
|
+
RealPacket a_abs = pabs(a.v);
|
|
1759
|
+
RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v;
|
|
1760
|
+
RealPacket a_max = pmax(a_abs, a_abs_flip);
|
|
1761
|
+
RealPacket a_min = pmin(a_abs, a_abs_flip);
|
|
1762
|
+
RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min));
|
|
1763
|
+
RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max));
|
|
1764
|
+
RealPacket r = pdiv(a_min, a_max);
|
|
1765
|
+
const RealPacket cst_one = pset1<RealPacket>(RealScalar(1));
|
|
1766
|
+
RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r)))); // [l0, l0, l1, l1]
|
|
1767
|
+
// Set l to a_max if a_min is zero, since the roundtrip sqrt(a_max^2) may be
|
|
1768
|
+
// lossy.
|
|
1769
|
+
l = pselect(a_min_zero_mask, a_max, l);
|
|
1770
|
+
// Step 2 compute a / abs(a).
|
|
1771
|
+
RealPacket sign_as_real = pandnot(pdiv(a.v, l), a_max_zero_mask);
|
|
1772
|
+
Packet sign;
|
|
1773
|
+
sign.v = sign_as_real;
|
|
1774
|
+
return sign;
|
|
1775
|
+
}
|
|
1776
|
+
};
|
|
1777
|
+
|
|
874
1778
|
// TODO(rmlarsen): The following set of utilities for double word arithmetic
|
|
875
1779
|
// should perhaps be refactored as a separate file, since it would be generally
|
|
876
1780
|
// useful for special function implementation etc. Writing the algorithms in
|
|
@@ -878,34 +1782,37 @@ Packet psqrt_complex(const Packet& a) {
|
|
|
878
1782
|
|
|
879
1783
|
// This function splits x into the nearest integer n and fractional part r,
|
|
880
1784
|
// such that x = n + r holds exactly.
|
|
881
|
-
template<typename Packet>
|
|
882
|
-
EIGEN_STRONG_INLINE
|
|
883
|
-
void absolute_split(const Packet& x, Packet& n, Packet& r) {
|
|
1785
|
+
template <typename Packet>
|
|
1786
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void absolute_split(const Packet& x, Packet& n, Packet& r) {
|
|
884
1787
|
n = pround(x);
|
|
885
1788
|
r = psub(x, n);
|
|
886
1789
|
}
|
|
887
1790
|
|
|
888
1791
|
// This function computes the sum {s_hi, s_lo}, such that x + y = s_hi + s_lo
|
|
889
1792
|
// holds exactly, and s_hi = fl(x+y), if |x| >= |y|.
|
|
890
|
-
template<typename Packet>
|
|
891
|
-
EIGEN_STRONG_INLINE
|
|
892
|
-
void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) {
|
|
1793
|
+
template <typename Packet>
|
|
1794
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) {
|
|
893
1795
|
s_hi = padd(x, y);
|
|
894
1796
|
const Packet t = psub(s_hi, x);
|
|
895
1797
|
s_lo = psub(y, t);
|
|
896
1798
|
}
|
|
897
1799
|
|
|
898
|
-
#ifdef
|
|
1800
|
+
#ifdef EIGEN_VECTORIZE_FMA
|
|
899
1801
|
// This function implements the extended precision product of
|
|
900
1802
|
// a pair of floating point numbers. Given {x, y}, it computes the pair
|
|
901
1803
|
// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
|
|
902
1804
|
// p_hi = fl(x * y).
|
|
903
|
-
template<typename Packet>
|
|
904
|
-
EIGEN_STRONG_INLINE
|
|
905
|
-
void twoprod(const Packet& x, const Packet& y,
|
|
906
|
-
Packet& p_hi, Packet& p_lo) {
|
|
1805
|
+
template <typename Packet>
|
|
1806
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) {
|
|
907
1807
|
p_hi = pmul(x, y);
|
|
908
|
-
p_lo =
|
|
1808
|
+
p_lo = pmsub(x, y, p_hi);
|
|
1809
|
+
}
|
|
1810
|
+
|
|
1811
|
+
// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that
|
|
1812
|
+
// x * y = xy + p_lo holds exactly.
|
|
1813
|
+
template <typename Packet>
|
|
1814
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) {
|
|
1815
|
+
return pmsub(x, y, xy);
|
|
909
1816
|
}
|
|
910
1817
|
|
|
911
1818
|
#else
|
|
@@ -915,11 +1822,10 @@ void twoprod(const Packet& x, const Packet& y,
|
|
|
915
1822
|
// exactly and that half of the significand of x fits in x_hi.
|
|
916
1823
|
// This is Algorithm 3 from Jean-Michel Muller, "Elementary Functions",
|
|
917
1824
|
// 3rd edition, Birkh\"auser, 2016.
|
|
918
|
-
template<typename Packet>
|
|
919
|
-
EIGEN_STRONG_INLINE
|
|
920
|
-
void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
|
|
1825
|
+
template <typename Packet>
|
|
1826
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
|
|
921
1827
|
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
922
|
-
|
|
1828
|
+
constexpr int shift = (NumTraits<Scalar>::digits() + 1) / 2;
|
|
923
1829
|
const Scalar shift_scale = Scalar(uint64_t(1) << shift); // Scalar constructor not necessarily constexpr.
|
|
924
1830
|
const Packet gamma = pmul(pset1<Packet>(shift_scale + Scalar(1)), x);
|
|
925
1831
|
Packet rho = psub(x, gamma);
|
|
@@ -931,10 +1837,8 @@ void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
|
|
|
931
1837
|
// Given floating point numbers {x, y} computes the pair
|
|
932
1838
|
// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
|
|
933
1839
|
// p_hi = fl(x * y).
|
|
934
|
-
template<typename Packet>
|
|
935
|
-
EIGEN_STRONG_INLINE
|
|
936
|
-
void twoprod(const Packet& x, const Packet& y,
|
|
937
|
-
Packet& p_hi, Packet& p_lo) {
|
|
1840
|
+
template <typename Packet>
|
|
1841
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) {
|
|
938
1842
|
Packet x_hi, x_lo, y_hi, y_lo;
|
|
939
1843
|
veltkamp_splitting(x, x_hi, x_lo);
|
|
940
1844
|
veltkamp_splitting(y, y_hi, y_lo);
|
|
@@ -946,8 +1850,22 @@ void twoprod(const Packet& x, const Packet& y,
|
|
|
946
1850
|
p_lo = pmadd(x_lo, y_lo, p_lo);
|
|
947
1851
|
}
|
|
948
1852
|
|
|
949
|
-
|
|
1853
|
+
// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that
|
|
1854
|
+
// x * y = xy + p_lo holds exactly.
|
|
1855
|
+
template <typename Packet>
|
|
1856
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) {
|
|
1857
|
+
Packet x_hi, x_lo, y_hi, y_lo;
|
|
1858
|
+
veltkamp_splitting(x, x_hi, x_lo);
|
|
1859
|
+
veltkamp_splitting(y, y_hi, y_lo);
|
|
950
1860
|
|
|
1861
|
+
Packet p_lo = pmadd(x_hi, y_hi, pnegate(xy));
|
|
1862
|
+
p_lo = pmadd(x_hi, y_lo, p_lo);
|
|
1863
|
+
p_lo = pmadd(x_lo, y_hi, p_lo);
|
|
1864
|
+
p_lo = pmadd(x_lo, y_lo, p_lo);
|
|
1865
|
+
return p_lo;
|
|
1866
|
+
}
|
|
1867
|
+
|
|
1868
|
+
#endif // EIGEN_VECTORIZE_FMA
|
|
951
1869
|
|
|
952
1870
|
// This function implements Dekker's algorithm for the addition
|
|
953
1871
|
// of two double word numbers represented by {x_hi, x_lo} and {y_hi, y_lo}.
|
|
@@ -955,16 +1873,14 @@ void twoprod(const Packet& x, const Packet& y,
|
|
|
955
1873
|
// x_hi + x_lo + y_hi + y_lo = s_hi + s_lo holds exactly.
|
|
956
1874
|
// This is Algorithm 5 from Jean-Michel Muller, "Elementary Functions",
|
|
957
1875
|
// 3rd edition, Birkh\"auser, 2016.
|
|
958
|
-
template<typename Packet>
|
|
959
|
-
EIGEN_STRONG_INLINE
|
|
960
|
-
|
|
961
|
-
const Packet& y_hi, const Packet& y_lo,
|
|
962
|
-
Packet& s_hi, Packet& s_lo) {
|
|
1876
|
+
template <typename Packet>
|
|
1877
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi,
|
|
1878
|
+
const Packet& y_lo, Packet& s_hi, Packet& s_lo) {
|
|
963
1879
|
const Packet x_greater_mask = pcmp_lt(pabs(y_hi), pabs(x_hi));
|
|
964
1880
|
Packet r_hi_1, r_lo_1;
|
|
965
|
-
fast_twosum(x_hi, y_hi,r_hi_1, r_lo_1);
|
|
1881
|
+
fast_twosum(x_hi, y_hi, r_hi_1, r_lo_1);
|
|
966
1882
|
Packet r_hi_2, r_lo_2;
|
|
967
|
-
fast_twosum(y_hi, x_hi,r_hi_2, r_lo_2);
|
|
1883
|
+
fast_twosum(y_hi, x_hi, r_hi_2, r_lo_2);
|
|
968
1884
|
const Packet r_hi = pselect(x_greater_mask, r_hi_1, r_hi_2);
|
|
969
1885
|
|
|
970
1886
|
const Packet s1 = padd(padd(y_lo, r_lo_1), x_lo);
|
|
@@ -976,11 +1892,9 @@ EIGEN_STRONG_INLINE
|
|
|
976
1892
|
|
|
977
1893
|
// This is a version of twosum for double word numbers,
|
|
978
1894
|
// which assumes that |x_hi| >= |y_hi|.
|
|
979
|
-
template<typename Packet>
|
|
980
|
-
EIGEN_STRONG_INLINE
|
|
981
|
-
|
|
982
|
-
const Packet& y_hi, const Packet& y_lo,
|
|
983
|
-
Packet& s_hi, Packet& s_lo) {
|
|
1895
|
+
template <typename Packet>
|
|
1896
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi,
|
|
1897
|
+
const Packet& y_lo, Packet& s_hi, Packet& s_lo) {
|
|
984
1898
|
Packet r_hi, r_lo;
|
|
985
1899
|
fast_twosum(x_hi, y_hi, r_hi, r_lo);
|
|
986
1900
|
const Packet s = padd(padd(y_lo, r_lo), x_lo);
|
|
@@ -990,11 +1904,9 @@ EIGEN_STRONG_INLINE
|
|
|
990
1904
|
// This is a version of twosum for adding a floating point number x to
|
|
991
1905
|
// double word number {y_hi, y_lo} number, with the assumption
|
|
992
1906
|
// that |x| >= |y_hi|.
|
|
993
|
-
template<typename Packet>
|
|
994
|
-
EIGEN_STRONG_INLINE
|
|
995
|
-
|
|
996
|
-
const Packet& y_hi, const Packet& y_lo,
|
|
997
|
-
Packet& s_hi, Packet& s_lo) {
|
|
1907
|
+
template <typename Packet>
|
|
1908
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y_hi, const Packet& y_lo,
|
|
1909
|
+
Packet& s_hi, Packet& s_lo) {
|
|
998
1910
|
Packet r_hi, r_lo;
|
|
999
1911
|
fast_twosum(x, y_hi, r_hi, r_lo);
|
|
1000
1912
|
const Packet s = padd(y_lo, r_lo);
|
|
@@ -1009,10 +1921,9 @@ void fast_twosum(const Packet& x,
|
|
|
1009
1921
|
// in the floating point type.
|
|
1010
1922
|
// This is Algorithm 7 from Jean-Michel Muller, "Elementary Functions",
|
|
1011
1923
|
// 3rd edition, Birkh\"auser, 2016.
|
|
1012
|
-
template<typename Packet>
|
|
1013
|
-
EIGEN_STRONG_INLINE
|
|
1014
|
-
|
|
1015
|
-
Packet& p_hi, Packet& p_lo) {
|
|
1924
|
+
template <typename Packet>
|
|
1925
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y,
|
|
1926
|
+
Packet& p_hi, Packet& p_lo) {
|
|
1016
1927
|
Packet c_hi, c_lo1;
|
|
1017
1928
|
twoprod(x_hi, y, c_hi, c_lo1);
|
|
1018
1929
|
const Packet c_lo2 = pmul(x_lo, y);
|
|
@@ -1028,11 +1939,9 @@ void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y,
|
|
|
1028
1939
|
// (x_hi + x_lo) * (y_hi + y_lo) = p_hi + p_lo holds with a relative error
|
|
1029
1940
|
// of less than 2*2^{-2p}, where p is the number of significand bit
|
|
1030
1941
|
// in the floating point type.
|
|
1031
|
-
template<typename Packet>
|
|
1032
|
-
EIGEN_STRONG_INLINE
|
|
1033
|
-
|
|
1034
|
-
const Packet& y_hi, const Packet& y_lo,
|
|
1035
|
-
Packet& p_hi, Packet& p_lo) {
|
|
1942
|
+
template <typename Packet>
|
|
1943
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi,
|
|
1944
|
+
const Packet& y_lo, Packet& p_hi, Packet& p_lo) {
|
|
1036
1945
|
Packet p_hi_hi, p_hi_lo;
|
|
1037
1946
|
twoprod(x_hi, x_lo, y_hi, p_hi_hi, p_hi_lo);
|
|
1038
1947
|
Packet p_lo_hi, p_lo_lo;
|
|
@@ -1040,120 +1949,81 @@ void twoprod(const Packet& x_hi, const Packet& x_lo,
|
|
|
1040
1949
|
fast_twosum(p_hi_hi, p_hi_lo, p_lo_hi, p_lo_lo, p_hi, p_lo);
|
|
1041
1950
|
}
|
|
1042
1951
|
|
|
1043
|
-
// This function
|
|
1044
|
-
//
|
|
1952
|
+
// This function implements the division of double word {x_hi, x_lo}
|
|
1953
|
+
// by float y. This is Algorithm 15 from "Tight and rigorous error bounds
|
|
1954
|
+
// for basic building blocks of double-word arithmetic", Joldes, Muller, & Popescu,
|
|
1955
|
+
// 2017. https://hal.archives-ouvertes.fr/hal-01351529
|
|
1045
1956
|
template <typename Packet>
|
|
1046
|
-
void
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
Packet
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
// -a*x_i
|
|
1057
|
-
Packet t1_hi, t1_lo;
|
|
1058
|
-
twoprod(pnegate(x), approx_recip, t1_hi, t1_lo);
|
|
1059
|
-
// 2 - a*x_i
|
|
1060
|
-
Packet t2_hi, t2_lo;
|
|
1061
|
-
fast_twosum(pset1<Packet>(Scalar(2)), t1_hi, t2_hi, t2_lo);
|
|
1062
|
-
Packet t3_hi, t3_lo;
|
|
1063
|
-
fast_twosum(t2_hi, padd(t2_lo, t1_lo), t3_hi, t3_lo);
|
|
1064
|
-
// x_i * (2 - a * x_i)
|
|
1065
|
-
twoprod(t3_hi, t3_lo, approx_recip, recip_hi, recip_lo);
|
|
1957
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void doubleword_div_fp(const Packet& x_hi, const Packet& x_lo, const Packet& y,
|
|
1958
|
+
Packet& z_hi, Packet& z_lo) {
|
|
1959
|
+
const Packet t_hi = pdiv(x_hi, y);
|
|
1960
|
+
Packet pi_hi, pi_lo;
|
|
1961
|
+
twoprod(t_hi, y, pi_hi, pi_lo);
|
|
1962
|
+
const Packet delta_hi = psub(x_hi, pi_hi);
|
|
1963
|
+
const Packet delta_t = psub(delta_hi, pi_lo);
|
|
1964
|
+
const Packet delta = padd(delta_t, x_lo);
|
|
1965
|
+
const Packet t_lo = pdiv(delta, y);
|
|
1966
|
+
fast_twosum(t_hi, t_lo, z_hi, z_lo);
|
|
1066
1967
|
}
|
|
1067
1968
|
|
|
1068
|
-
|
|
1069
1969
|
// This function computes log2(x) and returns the result as a double word.
|
|
1070
1970
|
template <typename Scalar>
|
|
1071
1971
|
struct accurate_log2 {
|
|
1072
1972
|
template <typename Packet>
|
|
1073
|
-
EIGEN_STRONG_INLINE
|
|
1074
|
-
void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
|
|
1973
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
|
|
1075
1974
|
log2_x_hi = plog2(x);
|
|
1076
1975
|
log2_x_lo = pzero(x);
|
|
1077
1976
|
}
|
|
1078
1977
|
};
|
|
1079
1978
|
|
|
1080
1979
|
// This specialization uses a more accurate algorithm to compute log2(x) for
|
|
1081
|
-
// floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~6.
|
|
1980
|
+
// floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~6.56508e-10.
|
|
1082
1981
|
// This additional accuracy is needed to counter the error-magnification
|
|
1083
1982
|
// inherent in multiplying by a potentially large exponent in pow(x,y).
|
|
1084
|
-
// The minimax polynomial used was calculated using the
|
|
1085
|
-
//
|
|
1983
|
+
// The minimax polynomial used was calculated using the Rminimax tool,
|
|
1984
|
+
// see https://gitlab.inria.fr/sfilip/rminimax.
|
|
1985
|
+
// Command line:
|
|
1986
|
+
// $ ratapprox --function="log2(1+x)/x" --dom='[-0.2929,0.41422]'
|
|
1987
|
+
// --type=[10,0]
|
|
1988
|
+
// --numF="[D,D,SG]" --denF="[SG]" --log --dispCoeff="dec"
|
|
1989
|
+
//
|
|
1990
|
+
// The resulting implementation of pow(x,y) is accurate to 3 ulps.
|
|
1086
1991
|
template <>
|
|
1087
1992
|
struct accurate_log2<float> {
|
|
1088
1993
|
template <typename Packet>
|
|
1089
|
-
EIGEN_STRONG_INLINE
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
const Packet p2 = pset1<Packet>( 0.2050433009862f);
|
|
1111
|
-
const Packet p1 = pset1<Packet>(-0.2404672354459f);
|
|
1112
|
-
const Packet p0 = pset1<Packet>( 0.2885761857032f);
|
|
1113
|
-
|
|
1114
|
-
const Packet C3_hi = pset1<Packet>(-0.360674142838f);
|
|
1115
|
-
const Packet C3_lo = pset1<Packet>(-6.13283912543e-09f);
|
|
1116
|
-
const Packet C2_hi = pset1<Packet>(0.480897903442f);
|
|
1117
|
-
const Packet C2_lo = pset1<Packet>(-1.44861207474e-08f);
|
|
1118
|
-
const Packet C1_hi = pset1<Packet>(-0.721347510815f);
|
|
1119
|
-
const Packet C1_lo = pset1<Packet>(-4.84483164698e-09f);
|
|
1120
|
-
const Packet C0_hi = pset1<Packet>(1.44269502163f);
|
|
1121
|
-
const Packet C0_lo = pset1<Packet>(2.01711713999e-08f);
|
|
1994
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(const Packet& z, Packet& log2_x_hi, Packet& log2_x_lo) {
|
|
1995
|
+
// Split the two lowest order constant coefficient into double-word representation.
|
|
1996
|
+
constexpr double kC0 = 1.442695041742110273474963832995854318141937255859375e+00;
|
|
1997
|
+
constexpr float kC0_hi = static_cast<float>(kC0);
|
|
1998
|
+
constexpr float kC0_lo = static_cast<float>(kC0 - static_cast<double>(kC0_hi));
|
|
1999
|
+
const Packet c0_hi = pset1<Packet>(kC0_hi);
|
|
2000
|
+
const Packet c0_lo = pset1<Packet>(kC0_lo);
|
|
2001
|
+
|
|
2002
|
+
constexpr double kC1 = -7.2134751588268664068692714863573201000690460205078125e-01;
|
|
2003
|
+
constexpr float kC1_hi = static_cast<float>(kC1);
|
|
2004
|
+
constexpr float kC1_lo = static_cast<float>(kC1 - static_cast<double>(kC1_hi));
|
|
2005
|
+
const Packet c1_hi = pset1<Packet>(kC1_hi);
|
|
2006
|
+
const Packet c1_lo = pset1<Packet>(kC1_lo);
|
|
2007
|
+
|
|
2008
|
+
constexpr float c[] = {
|
|
2009
|
+
9.7010828554630279541015625e-02, -1.6896486282348632812500000e-01, 1.7200836539268493652343750e-01,
|
|
2010
|
+
-1.7892272770404815673828125e-01, 2.0505344867706298828125000e-01, -2.4046677350997924804687500e-01,
|
|
2011
|
+
2.8857553005218505859375000e-01, -3.6067414283752441406250000e-01, 4.8089790344238281250000000e-01};
|
|
2012
|
+
|
|
2013
|
+
// Evaluate the higher order terms in the polynomial using
|
|
2014
|
+
// standard arithmetic.
|
|
1122
2015
|
const Packet one = pset1<Packet>(1.0f);
|
|
1123
|
-
|
|
1124
2016
|
const Packet x = psub(z, one);
|
|
1125
|
-
|
|
1126
|
-
//
|
|
1127
|
-
//
|
|
1128
|
-
Packet
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
// Now evaluate the low-order tems of Q(x) in double word precision.
|
|
1137
|
-
// In the following, due to the alternating signs and the fact that
|
|
1138
|
-
// |x| < sqrt(2)-1, we can assume that |C*_hi| >= q_i, and use
|
|
1139
|
-
// fast_twosum instead of the slower twosum.
|
|
1140
|
-
Packet q_hi, q_lo;
|
|
1141
|
-
Packet t_hi, t_lo;
|
|
1142
|
-
// C3 + x * p(x)
|
|
1143
|
-
twoprod(p, x, t_hi, t_lo);
|
|
1144
|
-
fast_twosum(C3_hi, C3_lo, t_hi, t_lo, q_hi, q_lo);
|
|
1145
|
-
// C2 + x * p(x)
|
|
1146
|
-
twoprod(q_hi, q_lo, x, t_hi, t_lo);
|
|
1147
|
-
fast_twosum(C2_hi, C2_lo, t_hi, t_lo, q_hi, q_lo);
|
|
1148
|
-
// C1 + x * p(x)
|
|
1149
|
-
twoprod(q_hi, q_lo, x, t_hi, t_lo);
|
|
1150
|
-
fast_twosum(C1_hi, C1_lo, t_hi, t_lo, q_hi, q_lo);
|
|
1151
|
-
// C0 + x * p(x)
|
|
1152
|
-
twoprod(q_hi, q_lo, x, t_hi, t_lo);
|
|
1153
|
-
fast_twosum(C0_hi, C0_lo, t_hi, t_lo, q_hi, q_lo);
|
|
1154
|
-
|
|
1155
|
-
// log(z) ~= x * Q(x)
|
|
1156
|
-
twoprod(q_hi, q_lo, x, log2_x_hi, log2_x_lo);
|
|
2017
|
+
Packet p = ppolevl<Packet, 8>::run(x, c);
|
|
2018
|
+
// Evaluate the final two step in Horner's rule using double-word
|
|
2019
|
+
// arithmetic.
|
|
2020
|
+
Packet p_hi, p_lo;
|
|
2021
|
+
twoprod(x, p, p_hi, p_lo);
|
|
2022
|
+
fast_twosum(c1_hi, c1_lo, p_hi, p_lo, p_hi, p_lo);
|
|
2023
|
+
twoprod(p_hi, p_lo, x, p_hi, p_lo);
|
|
2024
|
+
fast_twosum(c0_hi, c0_lo, p_hi, p_lo, p_hi, p_lo);
|
|
2025
|
+
// Multiply by x to recover log2(z).
|
|
2026
|
+
twoprod(p_hi, p_lo, x, log2_x_hi, log2_x_lo);
|
|
1157
2027
|
}
|
|
1158
2028
|
};
|
|
1159
2029
|
|
|
@@ -1167,8 +2037,7 @@ struct accurate_log2<float> {
|
|
|
1167
2037
|
template <>
|
|
1168
2038
|
struct accurate_log2<double> {
|
|
1169
2039
|
template <typename Packet>
|
|
1170
|
-
EIGEN_STRONG_INLINE
|
|
1171
|
-
void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
|
|
2040
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
|
|
1172
2041
|
// We use a transformation of variables:
|
|
1173
2042
|
// r = c * (x-1) / (x+1),
|
|
1174
2043
|
// such that
|
|
@@ -1204,16 +2073,13 @@ struct accurate_log2<double> {
|
|
|
1204
2073
|
const Packet cst_2_log2e_hi = pset1<Packet>(2.88539008177792677);
|
|
1205
2074
|
const Packet cst_2_log2e_lo = pset1<Packet>(4.07660016854549667e-17);
|
|
1206
2075
|
// c * (x - 1)
|
|
1207
|
-
Packet
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
//
|
|
1211
|
-
// 1 / (x + 1)
|
|
1212
|
-
Packet denom_hi, denom_lo;
|
|
1213
|
-
doubleword_reciprocal(padd(x, one), denom_hi, denom_lo);
|
|
1214
|
-
// r = c * (x-1) / (x+1),
|
|
2076
|
+
Packet t_hi, t_lo;
|
|
2077
|
+
// t = c * (x-1)
|
|
2078
|
+
twoprod(cst_2_log2e_hi, cst_2_log2e_lo, psub(x, one), t_hi, t_lo);
|
|
2079
|
+
// r = c * (x-1) / (x+1),
|
|
1215
2080
|
Packet r_hi, r_lo;
|
|
1216
|
-
|
|
2081
|
+
doubleword_div_fp(t_hi, t_lo, padd(x, one), r_hi, r_lo);
|
|
2082
|
+
|
|
1217
2083
|
// r2 = r * r
|
|
1218
2084
|
Packet r2_hi, r2_lo;
|
|
1219
2085
|
twoprod(r_hi, r_lo, r_hi, r_lo, r2_hi, r2_lo);
|
|
@@ -1252,157 +2118,20 @@ struct accurate_log2<double> {
|
|
|
1252
2118
|
}
|
|
1253
2119
|
};
|
|
1254
2120
|
|
|
1255
|
-
// This function computes exp2(x) (i.e. 2**x).
|
|
1256
|
-
template <typename Scalar>
|
|
1257
|
-
struct fast_accurate_exp2 {
|
|
1258
|
-
template <typename Packet>
|
|
1259
|
-
EIGEN_STRONG_INLINE
|
|
1260
|
-
Packet operator()(const Packet& x) {
|
|
1261
|
-
// TODO(rmlarsen): Add a pexp2 packetop.
|
|
1262
|
-
return pexp(pmul(pset1<Packet>(Scalar(EIGEN_LN2)), x));
|
|
1263
|
-
}
|
|
1264
|
-
};
|
|
1265
|
-
|
|
1266
|
-
// This specialization uses a faster algorithm to compute exp2(x) for floats
|
|
1267
|
-
// in [-0.5;0.5] with a relative accuracy of 1 ulp.
|
|
1268
|
-
// The minimax polynomial used was calculated using the Sollya tool.
|
|
1269
|
-
// See sollya.org.
|
|
1270
|
-
template <>
|
|
1271
|
-
struct fast_accurate_exp2<float> {
|
|
1272
|
-
template <typename Packet>
|
|
1273
|
-
EIGEN_STRONG_INLINE
|
|
1274
|
-
Packet operator()(const Packet& x) {
|
|
1275
|
-
// This function approximates exp2(x) by a degree 6 polynomial of the form
|
|
1276
|
-
// Q(x) = 1 + x * (C + x * P(x)), where the degree 4 polynomial P(x) is evaluated in
|
|
1277
|
-
// single precision, and the remaining steps are evaluated with extra precision using
|
|
1278
|
-
// double word arithmetic. C is an extra precise constant stored as a double word.
|
|
1279
|
-
//
|
|
1280
|
-
// The polynomial coefficients were calculated using Sollya commands:
|
|
1281
|
-
// > n = 6;
|
|
1282
|
-
// > f = 2^x;
|
|
1283
|
-
// > interval = [-0.5;0.5];
|
|
1284
|
-
// > p = fpminimax(f,n,[|1,double,single...|],interval,relative,floating);
|
|
1285
|
-
|
|
1286
|
-
const Packet p4 = pset1<Packet>(1.539513905e-4f);
|
|
1287
|
-
const Packet p3 = pset1<Packet>(1.340007293e-3f);
|
|
1288
|
-
const Packet p2 = pset1<Packet>(9.618283249e-3f);
|
|
1289
|
-
const Packet p1 = pset1<Packet>(5.550328270e-2f);
|
|
1290
|
-
const Packet p0 = pset1<Packet>(0.2402264923f);
|
|
1291
|
-
|
|
1292
|
-
const Packet C_hi = pset1<Packet>(0.6931471825f);
|
|
1293
|
-
const Packet C_lo = pset1<Packet>(2.36836577e-08f);
|
|
1294
|
-
const Packet one = pset1<Packet>(1.0f);
|
|
1295
|
-
|
|
1296
|
-
// Evaluate P(x) in working precision.
|
|
1297
|
-
// We evaluate even and odd parts of the polynomial separately
|
|
1298
|
-
// to gain some instruction level parallelism.
|
|
1299
|
-
Packet x2 = pmul(x,x);
|
|
1300
|
-
Packet p_even = pmadd(p4, x2, p2);
|
|
1301
|
-
Packet p_odd = pmadd(p3, x2, p1);
|
|
1302
|
-
p_even = pmadd(p_even, x2, p0);
|
|
1303
|
-
Packet p = pmadd(p_odd, x, p_even);
|
|
1304
|
-
|
|
1305
|
-
// Evaluate the remaining terms of Q(x) with extra precision using
|
|
1306
|
-
// double word arithmetic.
|
|
1307
|
-
Packet p_hi, p_lo;
|
|
1308
|
-
// x * p(x)
|
|
1309
|
-
twoprod(p, x, p_hi, p_lo);
|
|
1310
|
-
// C + x * p(x)
|
|
1311
|
-
Packet q1_hi, q1_lo;
|
|
1312
|
-
twosum(p_hi, p_lo, C_hi, C_lo, q1_hi, q1_lo);
|
|
1313
|
-
// x * (C + x * p(x))
|
|
1314
|
-
Packet q2_hi, q2_lo;
|
|
1315
|
-
twoprod(q1_hi, q1_lo, x, q2_hi, q2_lo);
|
|
1316
|
-
// 1 + x * (C + x * p(x))
|
|
1317
|
-
Packet q3_hi, q3_lo;
|
|
1318
|
-
// Since |q2_hi| <= sqrt(2)-1 < 1, we can use fast_twosum
|
|
1319
|
-
// for adding it to unity here.
|
|
1320
|
-
fast_twosum(one, q2_hi, q3_hi, q3_lo);
|
|
1321
|
-
return padd(q3_hi, padd(q2_lo, q3_lo));
|
|
1322
|
-
}
|
|
1323
|
-
};
|
|
1324
|
-
|
|
1325
|
-
// in [-0.5;0.5] with a relative accuracy of 1 ulp.
|
|
1326
|
-
// The minimax polynomial used was calculated using the Sollya tool.
|
|
1327
|
-
// See sollya.org.
|
|
1328
|
-
template <>
|
|
1329
|
-
struct fast_accurate_exp2<double> {
|
|
1330
|
-
template <typename Packet>
|
|
1331
|
-
EIGEN_STRONG_INLINE
|
|
1332
|
-
Packet operator()(const Packet& x) {
|
|
1333
|
-
// This function approximates exp2(x) by a degree 10 polynomial of the form
|
|
1334
|
-
// Q(x) = 1 + x * (C + x * P(x)), where the degree 8 polynomial P(x) is evaluated in
|
|
1335
|
-
// single precision, and the remaining steps are evaluated with extra precision using
|
|
1336
|
-
// double word arithmetic. C is an extra precise constant stored as a double word.
|
|
1337
|
-
//
|
|
1338
|
-
// The polynomial coefficients were calculated using Sollya commands:
|
|
1339
|
-
// > n = 11;
|
|
1340
|
-
// > f = 2^x;
|
|
1341
|
-
// > interval = [-0.5;0.5];
|
|
1342
|
-
// > p = fpminimax(f,n,[|1,DD,double...|],interval,relative,floating);
|
|
1343
|
-
|
|
1344
|
-
const Packet p9 = pset1<Packet>(4.431642109085495276e-10);
|
|
1345
|
-
const Packet p8 = pset1<Packet>(7.073829923303358410e-9);
|
|
1346
|
-
const Packet p7 = pset1<Packet>(1.017822306737031311e-7);
|
|
1347
|
-
const Packet p6 = pset1<Packet>(1.321543498017646657e-6);
|
|
1348
|
-
const Packet p5 = pset1<Packet>(1.525273342728892877e-5);
|
|
1349
|
-
const Packet p4 = pset1<Packet>(1.540353045780084423e-4);
|
|
1350
|
-
const Packet p3 = pset1<Packet>(1.333355814685869807e-3);
|
|
1351
|
-
const Packet p2 = pset1<Packet>(9.618129107593478832e-3);
|
|
1352
|
-
const Packet p1 = pset1<Packet>(5.550410866481961247e-2);
|
|
1353
|
-
const Packet p0 = pset1<Packet>(0.240226506959101332);
|
|
1354
|
-
const Packet C_hi = pset1<Packet>(0.693147180559945286);
|
|
1355
|
-
const Packet C_lo = pset1<Packet>(4.81927865669806721e-17);
|
|
1356
|
-
const Packet one = pset1<Packet>(1.0);
|
|
1357
|
-
|
|
1358
|
-
// Evaluate P(x) in working precision.
|
|
1359
|
-
// We evaluate even and odd parts of the polynomial separately
|
|
1360
|
-
// to gain some instruction level parallelism.
|
|
1361
|
-
Packet x2 = pmul(x,x);
|
|
1362
|
-
Packet p_even = pmadd(p8, x2, p6);
|
|
1363
|
-
Packet p_odd = pmadd(p9, x2, p7);
|
|
1364
|
-
p_even = pmadd(p_even, x2, p4);
|
|
1365
|
-
p_odd = pmadd(p_odd, x2, p5);
|
|
1366
|
-
p_even = pmadd(p_even, x2, p2);
|
|
1367
|
-
p_odd = pmadd(p_odd, x2, p3);
|
|
1368
|
-
p_even = pmadd(p_even, x2, p0);
|
|
1369
|
-
p_odd = pmadd(p_odd, x2, p1);
|
|
1370
|
-
Packet p = pmadd(p_odd, x, p_even);
|
|
1371
|
-
|
|
1372
|
-
// Evaluate the remaining terms of Q(x) with extra precision using
|
|
1373
|
-
// double word arithmetic.
|
|
1374
|
-
Packet p_hi, p_lo;
|
|
1375
|
-
// x * p(x)
|
|
1376
|
-
twoprod(p, x, p_hi, p_lo);
|
|
1377
|
-
// C + x * p(x)
|
|
1378
|
-
Packet q1_hi, q1_lo;
|
|
1379
|
-
twosum(p_hi, p_lo, C_hi, C_lo, q1_hi, q1_lo);
|
|
1380
|
-
// x * (C + x * p(x))
|
|
1381
|
-
Packet q2_hi, q2_lo;
|
|
1382
|
-
twoprod(q1_hi, q1_lo, x, q2_hi, q2_lo);
|
|
1383
|
-
// 1 + x * (C + x * p(x))
|
|
1384
|
-
Packet q3_hi, q3_lo;
|
|
1385
|
-
// Since |q2_hi| <= sqrt(2)-1 < 1, we can use fast_twosum
|
|
1386
|
-
// for adding it to unity here.
|
|
1387
|
-
fast_twosum(one, q2_hi, q3_hi, q3_lo);
|
|
1388
|
-
return padd(q3_hi, padd(q2_lo, q3_lo));
|
|
1389
|
-
}
|
|
1390
|
-
};
|
|
1391
|
-
|
|
1392
2121
|
// This function implements the non-trivial case of pow(x,y) where x is
|
|
1393
2122
|
// positive and y is (possibly) non-integer.
|
|
1394
2123
|
// Formally, pow(x,y) = exp2(y * log2(x)), where exp2(x) is shorthand for 2^x.
|
|
1395
2124
|
// TODO(rmlarsen): We should probably add this as a packet up 'ppow', to make it
|
|
1396
2125
|
// easier to specialize or turn off for specific types and/or backends.x
|
|
1397
2126
|
template <typename Packet>
|
|
1398
|
-
EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) {
|
|
2127
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) {
|
|
1399
2128
|
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
1400
2129
|
// Split x into exponent e_x and mantissa m_x.
|
|
1401
2130
|
Packet e_x;
|
|
1402
2131
|
Packet m_x = pfrexp(x, e_x);
|
|
1403
2132
|
|
|
1404
2133
|
// Adjust m_x to lie in [1/sqrt(2):sqrt(2)] to minimize absolute error in log2(m_x).
|
|
1405
|
-
|
|
2134
|
+
constexpr Scalar sqrt_half = Scalar(0.70710678118654752440);
|
|
1406
2135
|
const Packet m_x_scale_mask = pcmp_lt(m_x, pset1<Packet>(sqrt_half));
|
|
1407
2136
|
m_x = pselect(m_x_scale_mask, pmul(pset1<Packet>(Scalar(2)), m_x), m_x);
|
|
1408
2137
|
e_x = pselect(m_x_scale_mask, psub(e_x, pset1<Packet>(Scalar(1))), e_x);
|
|
@@ -1435,215 +2164,471 @@ EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) {
|
|
|
1435
2164
|
|
|
1436
2165
|
// We now have an accurate split of f = n_z + r_z and can compute
|
|
1437
2166
|
// x^y = 2**{n_z + r_z) = exp2(r_z) * 2**{n_z}.
|
|
1438
|
-
//
|
|
1439
|
-
//
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
2167
|
+
// Multiplication by the second factor can be done exactly using pldexp(), since
|
|
2168
|
+
// it is an integer power of 2.
|
|
2169
|
+
const Packet e_r = generic_exp2(r_z);
|
|
2170
|
+
|
|
2171
|
+
// Since we know that e_r is in [1/sqrt(2); sqrt(2)], we can use the fast version
|
|
2172
|
+
// of pldexp to multiply by 2**{n_z} when |n_z| is sufficiently small.
|
|
2173
|
+
constexpr Scalar kPldExpThresh = std::numeric_limits<Scalar>::max_exponent - 2;
|
|
2174
|
+
const Packet pldexp_fast_unsafe = pcmp_lt(pset1<Packet>(kPldExpThresh), pabs(n_z));
|
|
2175
|
+
if (predux_any(pldexp_fast_unsafe)) {
|
|
2176
|
+
return pldexp(e_r, n_z);
|
|
2177
|
+
}
|
|
2178
|
+
return pldexp_fast(e_r, n_z);
|
|
1443
2179
|
}
|
|
1444
2180
|
|
|
1445
2181
|
// Generic implementation of pow(x,y).
|
|
1446
|
-
template<typename Packet>
|
|
1447
|
-
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
|
1448
|
-
|
|
1449
|
-
Packet generic_pow(const Packet& x, const Packet& y) {
|
|
2182
|
+
template <typename Packet>
|
|
2183
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS std::enable_if_t<!is_scalar<Packet>::value, Packet> generic_pow(
|
|
2184
|
+
const Packet& x, const Packet& y) {
|
|
1450
2185
|
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
1451
2186
|
|
|
1452
|
-
const Packet
|
|
2187
|
+
const Packet cst_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
|
|
1453
2188
|
const Packet cst_zero = pset1<Packet>(Scalar(0));
|
|
1454
2189
|
const Packet cst_one = pset1<Packet>(Scalar(1));
|
|
1455
2190
|
const Packet cst_nan = pset1<Packet>(NumTraits<Scalar>::quiet_NaN());
|
|
1456
2191
|
|
|
1457
|
-
const Packet
|
|
2192
|
+
const Packet x_abs = pabs(x);
|
|
2193
|
+
Packet pow = generic_pow_impl(x_abs, y);
|
|
2194
|
+
|
|
2195
|
+
// In the following we enforce the special case handling prescribed in
|
|
2196
|
+
// https://en.cppreference.com/w/cpp/numeric/math/pow.
|
|
2197
|
+
|
|
1458
2198
|
// Predicates for sign and magnitude of x.
|
|
2199
|
+
const Packet x_is_negative = pcmp_lt(x, cst_zero);
|
|
1459
2200
|
const Packet x_is_zero = pcmp_eq(x, cst_zero);
|
|
1460
|
-
const Packet
|
|
1461
|
-
const Packet
|
|
1462
|
-
const Packet
|
|
1463
|
-
const Packet
|
|
1464
|
-
const Packet abs_x_is_lt_one = pcmp_lt(abs_x, cst_one);
|
|
1465
|
-
const Packet x_is_one = pandnot(abs_x_is_one, x_is_neg);
|
|
1466
|
-
const Packet x_is_neg_one = pand(abs_x_is_one, x_is_neg);
|
|
1467
|
-
const Packet x_is_nan = pandnot(ptrue(x), pcmp_eq(x, x));
|
|
2201
|
+
const Packet x_is_one = pcmp_eq(x, cst_one);
|
|
2202
|
+
const Packet x_has_signbit = psignbit(x);
|
|
2203
|
+
const Packet x_abs_gt_one = pcmp_lt(cst_one, x_abs);
|
|
2204
|
+
const Packet x_abs_is_inf = pcmp_eq(x_abs, cst_inf);
|
|
1468
2205
|
|
|
1469
2206
|
// Predicates for sign and magnitude of y.
|
|
1470
|
-
const Packet
|
|
2207
|
+
const Packet y_abs = pabs(y);
|
|
2208
|
+
const Packet y_abs_is_inf = pcmp_eq(y_abs, cst_inf);
|
|
2209
|
+
const Packet y_is_negative = pcmp_lt(y, cst_zero);
|
|
1471
2210
|
const Packet y_is_zero = pcmp_eq(y, cst_zero);
|
|
1472
|
-
const Packet
|
|
1473
|
-
|
|
1474
|
-
const Packet
|
|
1475
|
-
const Packet abs_y_is_inf = pcmp_eq(pabs(y), cst_pos_inf);
|
|
1476
|
-
EIGEN_CONSTEXPR Scalar huge_exponent =
|
|
1477
|
-
(NumTraits<Scalar>::max_exponent() * Scalar(EIGEN_LN2)) /
|
|
1478
|
-
NumTraits<Scalar>::epsilon();
|
|
1479
|
-
const Packet abs_y_is_huge = pcmp_le(pset1<Packet>(huge_exponent), pabs(y));
|
|
1480
|
-
|
|
1481
|
-
// Predicates for whether y is integer and/or even.
|
|
1482
|
-
const Packet y_is_int = pcmp_eq(pfloor(y), y);
|
|
2211
|
+
const Packet y_is_one = pcmp_eq(y, cst_one);
|
|
2212
|
+
// Predicates for whether y is integer and odd/even.
|
|
2213
|
+
const Packet y_is_int = pandnot(pcmp_eq(pfloor(y), y), y_abs_is_inf);
|
|
1483
2214
|
const Packet y_div_2 = pmul(y, pset1<Packet>(Scalar(0.5)));
|
|
1484
2215
|
const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2);
|
|
2216
|
+
const Packet y_is_odd_int = pandnot(y_is_int, y_is_even);
|
|
2217
|
+
// Smallest exponent for which (1 + epsilon) overflows to infinity.
|
|
2218
|
+
constexpr Scalar huge_exponent =
|
|
2219
|
+
(NumTraits<Scalar>::max_exponent() * Scalar(EIGEN_LN2)) / NumTraits<Scalar>::epsilon();
|
|
2220
|
+
const Packet y_abs_is_huge = pcmp_le(pset1<Packet>(huge_exponent), y_abs);
|
|
2221
|
+
|
|
2222
|
+
// * pow(base, exp) returns NaN if base is finite and negative
|
|
2223
|
+
// and exp is finite and non-integer.
|
|
2224
|
+
pow = pselect(pandnot(x_is_negative, y_is_int), cst_nan, pow);
|
|
2225
|
+
|
|
2226
|
+
// * pow(±0, exp), where exp is negative, finite, and is an even integer or
|
|
2227
|
+
// a non-integer, returns +∞
|
|
2228
|
+
// * pow(±0, exp), where exp is positive non-integer or a positive even
|
|
2229
|
+
// integer, returns +0
|
|
2230
|
+
// * pow(+0, exp), where exp is a negative odd integer, returns +∞
|
|
2231
|
+
// * pow(-0, exp), where exp is a negative odd integer, returns -∞
|
|
2232
|
+
// * pow(+0, exp), where exp is a positive odd integer, returns +0
|
|
2233
|
+
// * pow(-0, exp), where exp is a positive odd integer, returns -0
|
|
2234
|
+
// Sign is flipped by the rule below.
|
|
2235
|
+
pow = pselect(x_is_zero, pselect(y_is_negative, cst_inf, cst_zero), pow);
|
|
2236
|
+
|
|
2237
|
+
// pow(base, exp) returns -pow(abs(base), exp) if base has the sign bit set,
|
|
2238
|
+
// and exp is an odd integer exponent.
|
|
2239
|
+
pow = pselect(pand(x_has_signbit, y_is_odd_int), pnegate(pow), pow);
|
|
2240
|
+
|
|
2241
|
+
// * pow(base, -∞) returns +∞ for any |base|<1
|
|
2242
|
+
// * pow(base, -∞) returns +0 for any |base|>1
|
|
2243
|
+
// * pow(base, +∞) returns +0 for any |base|<1
|
|
2244
|
+
// * pow(base, +∞) returns +∞ for any |base|>1
|
|
2245
|
+
// * pow(±0, -∞) returns +∞
|
|
2246
|
+
// * pow(-1, +-∞) = 1
|
|
2247
|
+
Packet inf_y_val = pselect(por(pand(y_is_negative, x_is_zero), pxor(y_is_negative, x_abs_gt_one)), cst_inf, cst_zero);
|
|
2248
|
+
inf_y_val = pselect(pcmp_eq(x, pset1<Packet>(Scalar(-1.0))), cst_one, inf_y_val);
|
|
2249
|
+
pow = pselect(y_abs_is_huge, inf_y_val, pow);
|
|
2250
|
+
|
|
2251
|
+
// * pow(+∞, exp) returns +0 for any negative exp
|
|
2252
|
+
// * pow(+∞, exp) returns +∞ for any positive exp
|
|
2253
|
+
// * pow(-∞, exp) returns -0 if exp is a negative odd integer.
|
|
2254
|
+
// * pow(-∞, exp) returns +0 if exp is a negative non-integer or negative
|
|
2255
|
+
// even integer.
|
|
2256
|
+
// * pow(-∞, exp) returns -∞ if exp is a positive odd integer.
|
|
2257
|
+
// * pow(-∞, exp) returns +∞ if exp is a positive non-integer or positive
|
|
2258
|
+
// even integer.
|
|
2259
|
+
auto x_pos_inf_value = pselect(y_is_negative, cst_zero, cst_inf);
|
|
2260
|
+
auto x_neg_inf_value = pselect(y_is_odd_int, pnegate(x_pos_inf_value), x_pos_inf_value);
|
|
2261
|
+
pow = pselect(x_abs_is_inf, pselect(x_is_negative, x_neg_inf_value, x_pos_inf_value), pow);
|
|
2262
|
+
|
|
2263
|
+
// All cases of NaN inputs return NaN, except the two below.
|
|
2264
|
+
pow = pselect(por(pisnan(x), pisnan(y)), cst_nan, pow);
|
|
2265
|
+
|
|
2266
|
+
// * pow(base, 1) returns base.
|
|
2267
|
+
// * pow(base, +/-0) returns 1, regardless of base, even NaN.
|
|
2268
|
+
// * pow(+1, exp) returns 1, regardless of exponent, even NaN.
|
|
2269
|
+
pow = pselect(y_is_one, x, pselect(por(x_is_one, y_is_zero), cst_one, pow));
|
|
2270
|
+
|
|
2271
|
+
return pow;
|
|
2272
|
+
}
|
|
1485
2273
|
|
|
1486
|
-
|
|
1487
|
-
|
|
1488
|
-
|
|
1489
|
-
|
|
1490
|
-
const Packet pow_is_one = por(por(x_is_one, y_is_zero),
|
|
1491
|
-
pand(x_is_neg_one,
|
|
1492
|
-
por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x))));
|
|
1493
|
-
const Packet pow_is_nan = por(invalid_negative_x, por(x_is_nan, y_is_nan));
|
|
1494
|
-
const Packet pow_is_zero = por(por(por(pand(x_is_zero, y_is_pos),
|
|
1495
|
-
pand(abs_x_is_inf, y_is_neg)),
|
|
1496
|
-
pand(pand(abs_x_is_lt_one, abs_y_is_huge),
|
|
1497
|
-
y_is_pos)),
|
|
1498
|
-
pand(pand(abs_x_is_gt_one, abs_y_is_huge),
|
|
1499
|
-
y_is_neg));
|
|
1500
|
-
const Packet pow_is_inf = por(por(por(pand(x_is_zero, y_is_neg),
|
|
1501
|
-
pand(abs_x_is_inf, y_is_pos)),
|
|
1502
|
-
pand(pand(abs_x_is_lt_one, abs_y_is_huge),
|
|
1503
|
-
y_is_neg)),
|
|
1504
|
-
pand(pand(abs_x_is_gt_one, abs_y_is_huge),
|
|
1505
|
-
y_is_pos));
|
|
1506
|
-
|
|
1507
|
-
// General computation of pow(x,y) for positive x or negative x and integer y.
|
|
1508
|
-
const Packet negate_pow_abs = pandnot(x_is_neg, y_is_even);
|
|
1509
|
-
const Packet pow_abs = generic_pow_impl(abs_x, y);
|
|
1510
|
-
return pselect(y_is_one, x,
|
|
1511
|
-
pselect(pow_is_one, cst_one,
|
|
1512
|
-
pselect(pow_is_nan, cst_nan,
|
|
1513
|
-
pselect(pow_is_inf, cst_pos_inf,
|
|
1514
|
-
pselect(pow_is_zero, cst_zero,
|
|
1515
|
-
pselect(negate_pow_abs, pnegate(pow_abs), pow_abs))))));
|
|
2274
|
+
template <typename Scalar>
|
|
2275
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS std::enable_if_t<is_scalar<Scalar>::value, Scalar> generic_pow(
|
|
2276
|
+
const Scalar& x, const Scalar& y) {
|
|
2277
|
+
return numext::pow(x, y);
|
|
1516
2278
|
}
|
|
1517
2279
|
|
|
2280
|
+
namespace unary_pow {
|
|
1518
2281
|
|
|
2282
|
+
template <typename ScalarExponent, bool IsInteger = NumTraits<ScalarExponent>::IsInteger>
|
|
2283
|
+
struct exponent_helper {
|
|
2284
|
+
using safe_abs_type = ScalarExponent;
|
|
2285
|
+
static constexpr ScalarExponent one_half = ScalarExponent(0.5);
|
|
2286
|
+
// these routines assume that exp is an integer stored as a floating point type
|
|
2287
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarExponent safe_abs(const ScalarExponent& exp) {
|
|
2288
|
+
return numext::abs(exp);
|
|
2289
|
+
}
|
|
2290
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool is_odd(const ScalarExponent& exp) {
|
|
2291
|
+
eigen_assert(((numext::isfinite)(exp) && exp == numext::floor(exp)) && "exp must be an integer");
|
|
2292
|
+
ScalarExponent exp_div_2 = exp * one_half;
|
|
2293
|
+
ScalarExponent floor_exp_div_2 = numext::floor(exp_div_2);
|
|
2294
|
+
return exp_div_2 != floor_exp_div_2;
|
|
2295
|
+
}
|
|
2296
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarExponent floor_div_two(const ScalarExponent& exp) {
|
|
2297
|
+
ScalarExponent exp_div_2 = exp * one_half;
|
|
2298
|
+
return numext::floor(exp_div_2);
|
|
2299
|
+
}
|
|
2300
|
+
};
|
|
1519
2301
|
|
|
1520
|
-
|
|
1521
|
-
|
|
1522
|
-
|
|
1523
|
-
|
|
1524
|
-
|
|
1525
|
-
|
|
1526
|
-
|
|
1527
|
-
|
|
1528
|
-
|
|
1529
|
-
|
|
1530
|
-
|
|
1531
|
-
|
|
1532
|
-
|
|
1533
|
-
|
|
1534
|
-
|
|
1535
|
-
* DESCRIPTION:
|
|
1536
|
-
*
|
|
1537
|
-
* Evaluates polynomial of degree N:
|
|
1538
|
-
*
|
|
1539
|
-
*
|
|
1540
|
-
* y = C_0 + C_1 x + C_2 x^2 + \cdots + C_N x^N
|
|
1541
|
-
*
|
|
1542
|
-
*
|
|
1543
|
-
* Coefficients are stored in reverse order:
|
|
1544
|
-
*
|
|
1545
|
-
* coef[0] = C_N, ..., coef[N] = C_0.
|
|
1546
|
-
*
|
|
1547
|
-
*
|
|
1548
|
-
* The function p1evl() assumes that coef[N] = 1.0 and is
|
|
1549
|
-
* omitted from the array. Its calling arguments are
|
|
1550
|
-
* otherwise the same as polevl().
|
|
1551
|
-
*
|
|
1552
|
-
*
|
|
1553
|
-
* The Eigen implementation is templatized. For best speed, store
|
|
1554
|
-
* coef as a const array (constexpr), e.g.
|
|
1555
|
-
*
|
|
1556
|
-
* const double coef[] = {1.0, 2.0, 3.0, ...};
|
|
1557
|
-
*
|
|
1558
|
-
*/
|
|
1559
|
-
template <typename Packet, int N>
|
|
1560
|
-
struct ppolevl {
|
|
1561
|
-
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
|
|
1562
|
-
EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
|
1563
|
-
return pmadd(ppolevl<Packet, N-1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
|
|
2302
|
+
template <typename ScalarExponent>
|
|
2303
|
+
struct exponent_helper<ScalarExponent, true> {
|
|
2304
|
+
// if `exp` is a signed integer type, cast it to its unsigned counterpart to safely store its absolute value
|
|
2305
|
+
// consider the (rare) case where `exp` is an int32_t: abs(-2147483648) != 2147483648
|
|
2306
|
+
using safe_abs_type = typename numext::get_integer_by_size<sizeof(ScalarExponent)>::unsigned_type;
|
|
2307
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE safe_abs_type safe_abs(const ScalarExponent& exp) {
|
|
2308
|
+
ScalarExponent mask = numext::signbit(exp);
|
|
2309
|
+
safe_abs_type result = safe_abs_type(exp ^ mask);
|
|
2310
|
+
return result + safe_abs_type(ScalarExponent(1) & mask);
|
|
2311
|
+
}
|
|
2312
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool is_odd(const safe_abs_type& exp) {
|
|
2313
|
+
return exp % safe_abs_type(2) != safe_abs_type(0);
|
|
2314
|
+
}
|
|
2315
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE safe_abs_type floor_div_two(const safe_abs_type& exp) {
|
|
2316
|
+
return exp >> safe_abs_type(1);
|
|
1564
2317
|
}
|
|
1565
2318
|
};
|
|
1566
2319
|
|
|
1567
|
-
template <typename Packet
|
|
1568
|
-
|
|
1569
|
-
|
|
1570
|
-
|
|
1571
|
-
|
|
2320
|
+
template <typename Packet, typename ScalarExponent,
|
|
2321
|
+
bool ReciprocateIfExponentIsNegative =
|
|
2322
|
+
!NumTraits<typename unpacket_traits<Packet>::type>::IsInteger && NumTraits<ScalarExponent>::IsSigned>
|
|
2323
|
+
struct reciprocate {
|
|
2324
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
|
|
2325
|
+
using Scalar = typename unpacket_traits<Packet>::type;
|
|
2326
|
+
const Packet cst_pos_one = pset1<Packet>(Scalar(1));
|
|
2327
|
+
return exponent < 0 ? pdiv(cst_pos_one, x) : x;
|
|
1572
2328
|
}
|
|
1573
2329
|
};
|
|
1574
2330
|
|
|
1575
|
-
|
|
1576
|
-
|
|
1577
|
-
|
|
1578
|
-
|
|
1579
|
-
|
|
1580
|
-
|
|
1581
|
-
* SYNOPSIS:
|
|
1582
|
-
*
|
|
1583
|
-
* int N;
|
|
1584
|
-
* Scalar x, y, coef[N], chebevl();
|
|
1585
|
-
*
|
|
1586
|
-
* y = chbevl( x, coef, N );
|
|
1587
|
-
*
|
|
1588
|
-
*
|
|
1589
|
-
*
|
|
1590
|
-
* DESCRIPTION:
|
|
1591
|
-
*
|
|
1592
|
-
* Evaluates the series
|
|
1593
|
-
*
|
|
1594
|
-
*
|
|
1595
|
-
*
|
|
1596
|
-
* y = \sum'_{i=0}^{N-1} coef[i] \, T_i(x/2)
|
|
1597
|
-
*
|
|
1598
|
-
*
|
|
1599
|
-
*
|
|
1600
|
-
* of Chebyshev polynomials Ti at argument x/2.
|
|
1601
|
-
*
|
|
1602
|
-
* Coefficients are stored in reverse order, i.e. the zero
|
|
1603
|
-
* order term is last in the array. Note N is the number of
|
|
1604
|
-
* coefficients, not the order.
|
|
1605
|
-
*
|
|
1606
|
-
* If coefficients are for the interval a to b, x must
|
|
1607
|
-
* have been transformed to x -> 2(2x - b - a)/(b-a) before
|
|
1608
|
-
* entering the routine. This maps x from (a, b) to (-1, 1),
|
|
1609
|
-
* over which the Chebyshev polynomials are defined.
|
|
1610
|
-
*
|
|
1611
|
-
* If the coefficients are for the inverted interval, in
|
|
1612
|
-
* which (a, b) is mapped to (1/b, 1/a), the transformation
|
|
1613
|
-
* required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity,
|
|
1614
|
-
* this becomes x -> 4a/x - 1.
|
|
1615
|
-
*
|
|
1616
|
-
*
|
|
1617
|
-
*
|
|
1618
|
-
* SPEED:
|
|
1619
|
-
*
|
|
1620
|
-
* Taking advantage of the recurrence properties of the
|
|
1621
|
-
* Chebyshev polynomials, the routine requires one more
|
|
1622
|
-
* addition per loop than evaluating a nested polynomial of
|
|
1623
|
-
* the same degree.
|
|
1624
|
-
*
|
|
1625
|
-
*/
|
|
2331
|
+
template <typename Packet, typename ScalarExponent>
|
|
2332
|
+
struct reciprocate<Packet, ScalarExponent, false> {
|
|
2333
|
+
// pdiv not defined, nor necessary for integer base types
|
|
2334
|
+
// if the exponent is unsigned, then the exponent cannot be negative
|
|
2335
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent&) { return x; }
|
|
2336
|
+
};
|
|
1626
2337
|
|
|
1627
|
-
template <typename Packet,
|
|
1628
|
-
|
|
1629
|
-
|
|
1630
|
-
|
|
1631
|
-
|
|
1632
|
-
|
|
1633
|
-
|
|
1634
|
-
|
|
2338
|
+
template <typename Packet, typename ScalarExponent>
|
|
2339
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet int_pow(const Packet& x, const ScalarExponent& exponent) {
|
|
2340
|
+
using Scalar = typename unpacket_traits<Packet>::type;
|
|
2341
|
+
using ExponentHelper = exponent_helper<ScalarExponent>;
|
|
2342
|
+
using AbsExponentType = typename ExponentHelper::safe_abs_type;
|
|
2343
|
+
const Packet cst_pos_one = pset1<Packet>(Scalar(1));
|
|
2344
|
+
if (exponent == ScalarExponent(0)) return cst_pos_one;
|
|
2345
|
+
|
|
2346
|
+
Packet result = reciprocate<Packet, ScalarExponent>::run(x, exponent);
|
|
2347
|
+
Packet y = cst_pos_one;
|
|
2348
|
+
AbsExponentType m = ExponentHelper::safe_abs(exponent);
|
|
2349
|
+
|
|
2350
|
+
while (m > 1) {
|
|
2351
|
+
bool odd = ExponentHelper::is_odd(m);
|
|
2352
|
+
if (odd) y = pmul(y, result);
|
|
2353
|
+
result = pmul(result, result);
|
|
2354
|
+
m = ExponentHelper::floor_div_two(m);
|
|
2355
|
+
}
|
|
1635
2356
|
|
|
1636
|
-
|
|
1637
|
-
|
|
1638
|
-
|
|
1639
|
-
|
|
2357
|
+
return pmul(y, result);
|
|
2358
|
+
}
|
|
2359
|
+
|
|
2360
|
+
template <typename Packet>
|
|
2361
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!is_scalar<Packet>::value, Packet> gen_pow(
|
|
2362
|
+
const Packet& x, const typename unpacket_traits<Packet>::type& exponent) {
|
|
2363
|
+
const Packet exponent_packet = pset1<Packet>(exponent);
|
|
2364
|
+
return generic_pow_impl(x, exponent_packet);
|
|
2365
|
+
}
|
|
2366
|
+
|
|
2367
|
+
template <typename Scalar>
|
|
2368
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<is_scalar<Scalar>::value, Scalar> gen_pow(
|
|
2369
|
+
const Scalar& x, const Scalar& exponent) {
|
|
2370
|
+
return numext::pow(x, exponent);
|
|
2371
|
+
}
|
|
2372
|
+
|
|
2373
|
+
template <typename Packet, typename ScalarExponent>
|
|
2374
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_nonint_nonint_errors(const Packet& x, const Packet& powx,
|
|
2375
|
+
const ScalarExponent& exponent) {
|
|
2376
|
+
using Scalar = typename unpacket_traits<Packet>::type;
|
|
2377
|
+
|
|
2378
|
+
// non-integer base and exponent case
|
|
2379
|
+
const Packet cst_pos_zero = pzero(x);
|
|
2380
|
+
const Packet cst_pos_one = pset1<Packet>(Scalar(1));
|
|
2381
|
+
const Packet cst_pos_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
|
|
2382
|
+
const Packet cst_true = ptrue<Packet>(x);
|
|
2383
|
+
|
|
2384
|
+
const bool exponent_is_not_fin = !(numext::isfinite)(exponent);
|
|
2385
|
+
const bool exponent_is_neg = exponent < ScalarExponent(0);
|
|
2386
|
+
const bool exponent_is_pos = exponent > ScalarExponent(0);
|
|
2387
|
+
|
|
2388
|
+
const Packet exp_is_not_fin = exponent_is_not_fin ? cst_true : cst_pos_zero;
|
|
2389
|
+
const Packet exp_is_neg = exponent_is_neg ? cst_true : cst_pos_zero;
|
|
2390
|
+
const Packet exp_is_pos = exponent_is_pos ? cst_true : cst_pos_zero;
|
|
2391
|
+
const Packet exp_is_inf = pand(exp_is_not_fin, por(exp_is_neg, exp_is_pos));
|
|
2392
|
+
const Packet exp_is_nan = pandnot(exp_is_not_fin, por(exp_is_neg, exp_is_pos));
|
|
2393
|
+
|
|
2394
|
+
const Packet x_is_le_zero = pcmp_le(x, cst_pos_zero);
|
|
2395
|
+
const Packet x_is_ge_zero = pcmp_le(cst_pos_zero, x);
|
|
2396
|
+
const Packet x_is_zero = pand(x_is_le_zero, x_is_ge_zero);
|
|
2397
|
+
|
|
2398
|
+
const Packet abs_x = pabs(x);
|
|
2399
|
+
const Packet abs_x_is_le_one = pcmp_le(abs_x, cst_pos_one);
|
|
2400
|
+
const Packet abs_x_is_ge_one = pcmp_le(cst_pos_one, abs_x);
|
|
2401
|
+
const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
|
|
2402
|
+
const Packet abs_x_is_one = pand(abs_x_is_le_one, abs_x_is_ge_one);
|
|
2403
|
+
|
|
2404
|
+
Packet pow_is_inf_if_exp_is_neg = por(x_is_zero, pand(abs_x_is_le_one, exp_is_inf));
|
|
2405
|
+
Packet pow_is_inf_if_exp_is_pos = por(abs_x_is_inf, pand(abs_x_is_ge_one, exp_is_inf));
|
|
2406
|
+
Packet pow_is_one = pand(abs_x_is_one, por(exp_is_inf, x_is_ge_zero));
|
|
2407
|
+
|
|
2408
|
+
Packet result = powx;
|
|
2409
|
+
result = por(x_is_le_zero, result);
|
|
2410
|
+
result = pselect(pow_is_inf_if_exp_is_neg, pand(cst_pos_inf, exp_is_neg), result);
|
|
2411
|
+
result = pselect(pow_is_inf_if_exp_is_pos, pand(cst_pos_inf, exp_is_pos), result);
|
|
2412
|
+
result = por(exp_is_nan, result);
|
|
2413
|
+
result = pselect(pow_is_one, cst_pos_one, result);
|
|
2414
|
+
return result;
|
|
2415
|
+
}
|
|
2416
|
+
|
|
2417
|
+
template <typename Packet, typename ScalarExponent,
|
|
2418
|
+
std::enable_if_t<NumTraits<typename unpacket_traits<Packet>::type>::IsSigned, bool> = true>
|
|
2419
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_negative_exponent(const Packet& x, const ScalarExponent& exponent) {
|
|
2420
|
+
using Scalar = typename unpacket_traits<Packet>::type;
|
|
2421
|
+
|
|
2422
|
+
// signed integer base, signed integer exponent case
|
|
2423
|
+
|
|
2424
|
+
// This routine handles negative exponents.
|
|
2425
|
+
// The return value is either 0, 1, or -1.
|
|
2426
|
+
const Packet cst_pos_one = pset1<Packet>(Scalar(1));
|
|
2427
|
+
const bool exponent_is_odd = exponent % ScalarExponent(2) != ScalarExponent(0);
|
|
2428
|
+
const Packet exp_is_odd = exponent_is_odd ? ptrue<Packet>(x) : pzero<Packet>(x);
|
|
2429
|
+
|
|
2430
|
+
const Packet abs_x = pabs(x);
|
|
2431
|
+
const Packet abs_x_is_one = pcmp_eq(abs_x, cst_pos_one);
|
|
2432
|
+
|
|
2433
|
+
Packet result = pselect(exp_is_odd, x, abs_x);
|
|
2434
|
+
result = pselect(abs_x_is_one, result, pzero<Packet>(x));
|
|
2435
|
+
return result;
|
|
2436
|
+
}
|
|
2437
|
+
|
|
2438
|
+
template <typename Packet, typename ScalarExponent,
|
|
2439
|
+
std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsSigned, bool> = true>
|
|
2440
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_negative_exponent(const Packet& x, const ScalarExponent&) {
|
|
2441
|
+
using Scalar = typename unpacket_traits<Packet>::type;
|
|
2442
|
+
|
|
2443
|
+
// unsigned integer base, signed integer exponent case
|
|
2444
|
+
|
|
2445
|
+
// This routine handles negative exponents.
|
|
2446
|
+
// The return value is either 0 or 1
|
|
2447
|
+
|
|
2448
|
+
const Scalar pos_one = Scalar(1);
|
|
2449
|
+
|
|
2450
|
+
const Packet cst_pos_one = pset1<Packet>(pos_one);
|
|
2451
|
+
|
|
2452
|
+
const Packet x_is_one = pcmp_eq(x, cst_pos_one);
|
|
2453
|
+
|
|
2454
|
+
return pand(x_is_one, x);
|
|
2455
|
+
}
|
|
2456
|
+
|
|
2457
|
+
} // end namespace unary_pow
|
|
2458
|
+
|
|
2459
|
+
template <typename Packet, typename ScalarExponent,
|
|
2460
|
+
bool BaseIsIntegerType = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger,
|
|
2461
|
+
bool ExponentIsIntegerType = NumTraits<ScalarExponent>::IsInteger,
|
|
2462
|
+
bool ExponentIsSigned = NumTraits<ScalarExponent>::IsSigned>
|
|
2463
|
+
struct unary_pow_impl;
|
|
2464
|
+
|
|
2465
|
+
template <typename Packet, typename ScalarExponent, bool ExponentIsSigned>
|
|
2466
|
+
struct unary_pow_impl<Packet, ScalarExponent, false, false, ExponentIsSigned> {
|
|
2467
|
+
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
2468
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
|
|
2469
|
+
const bool exponent_is_integer = (numext::isfinite)(exponent) && numext::round(exponent) == exponent;
|
|
2470
|
+
if (exponent_is_integer) {
|
|
2471
|
+
// The simple recursive doubling implementation is only accurate to 3 ulps
|
|
2472
|
+
// for integer exponents in [-3:7]. Since this is a common case, we
|
|
2473
|
+
// specialize it here.
|
|
2474
|
+
bool use_repeated_squaring =
|
|
2475
|
+
(exponent <= ScalarExponent(7) && (!ExponentIsSigned || exponent >= ScalarExponent(-3)));
|
|
2476
|
+
return use_repeated_squaring ? unary_pow::int_pow(x, exponent) : generic_pow(x, pset1<Packet>(exponent));
|
|
2477
|
+
} else {
|
|
2478
|
+
Packet result = unary_pow::gen_pow(x, exponent);
|
|
2479
|
+
result = unary_pow::handle_nonint_nonint_errors(x, result, exponent);
|
|
2480
|
+
return result;
|
|
1640
2481
|
}
|
|
2482
|
+
}
|
|
2483
|
+
};
|
|
1641
2484
|
|
|
1642
|
-
|
|
2485
|
+
template <typename Packet, typename ScalarExponent, bool ExponentIsSigned>
|
|
2486
|
+
struct unary_pow_impl<Packet, ScalarExponent, false, true, ExponentIsSigned> {
|
|
2487
|
+
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
2488
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
|
|
2489
|
+
return unary_pow::int_pow(x, exponent);
|
|
1643
2490
|
}
|
|
1644
2491
|
};
|
|
1645
2492
|
|
|
1646
|
-
|
|
1647
|
-
|
|
2493
|
+
template <typename Packet, typename ScalarExponent>
|
|
2494
|
+
struct unary_pow_impl<Packet, ScalarExponent, true, true, true> {
|
|
2495
|
+
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
2496
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
|
|
2497
|
+
if (exponent < ScalarExponent(0)) {
|
|
2498
|
+
return unary_pow::handle_negative_exponent(x, exponent);
|
|
2499
|
+
} else {
|
|
2500
|
+
return unary_pow::int_pow(x, exponent);
|
|
2501
|
+
}
|
|
2502
|
+
}
|
|
2503
|
+
};
|
|
2504
|
+
|
|
2505
|
+
template <typename Packet, typename ScalarExponent>
|
|
2506
|
+
struct unary_pow_impl<Packet, ScalarExponent, true, true, false> {
|
|
2507
|
+
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
2508
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
|
|
2509
|
+
return unary_pow::int_pow(x, exponent);
|
|
2510
|
+
}
|
|
2511
|
+
};
|
|
2512
|
+
|
|
2513
|
+
// This function computes exp2(x) = exp(ln(2) * x).
|
|
2514
|
+
// To improve accuracy, the product ln(2)*x is computed using the twoprod
|
|
2515
|
+
// algorithm, such that ln(2) * x = p_hi + p_lo holds exactly. Then exp2(x) is
|
|
2516
|
+
// computed as exp2(x) = exp(p_hi) * exp(p_lo) ~= exp(p_hi) * (1 + p_lo). This
|
|
2517
|
+
// correction step this reduces the maximum absolute error as follows:
|
|
2518
|
+
//
|
|
2519
|
+
// type | max error (simple product) | max error (twoprod) |
|
|
2520
|
+
// -----------------------------------------------------------
|
|
2521
|
+
// float | 35 ulps | 4 ulps |
|
|
2522
|
+
// double | 363 ulps | 110 ulps |
|
|
2523
|
+
//
|
|
2524
|
+
template <typename Packet>
|
|
2525
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_exp2(const Packet& _x) {
|
|
2526
|
+
typedef typename unpacket_traits<Packet>::type Scalar;
|
|
2527
|
+
constexpr int max_exponent = std::numeric_limits<Scalar>::max_exponent;
|
|
2528
|
+
constexpr int digits = std::numeric_limits<Scalar>::digits;
|
|
2529
|
+
constexpr Scalar max_cap = Scalar(max_exponent + 1);
|
|
2530
|
+
constexpr Scalar min_cap = -Scalar(max_exponent + digits - 1);
|
|
2531
|
+
Packet x = pmax(pmin(_x, pset1<Packet>(max_cap)), pset1<Packet>(min_cap));
|
|
2532
|
+
Packet p_hi, p_lo;
|
|
2533
|
+
twoprod(pset1<Packet>(Scalar(EIGEN_LN2)), x, p_hi, p_lo);
|
|
2534
|
+
Packet exp2_hi = pexp(p_hi);
|
|
2535
|
+
Packet exp2_lo = padd(pset1<Packet>(Scalar(1)), p_lo);
|
|
2536
|
+
return pmul(exp2_hi, exp2_lo);
|
|
2537
|
+
}
|
|
2538
|
+
|
|
2539
|
+
template <typename Packet>
|
|
2540
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_rint(const Packet& a) {
|
|
2541
|
+
using Scalar = typename unpacket_traits<Packet>::type;
|
|
2542
|
+
using IntType = typename numext::get_integer_by_size<sizeof(Scalar)>::signed_type;
|
|
2543
|
+
// Adds and subtracts signum(a) * 2^kMantissaBits to force rounding.
|
|
2544
|
+
const IntType kLimit = IntType(1) << (NumTraits<Scalar>::digits() - 1);
|
|
2545
|
+
const Packet cst_limit = pset1<Packet>(static_cast<Scalar>(kLimit));
|
|
2546
|
+
Packet abs_a = pabs(a);
|
|
2547
|
+
Packet sign_a = pandnot(a, abs_a);
|
|
2548
|
+
Packet rint_a = padd(abs_a, cst_limit);
|
|
2549
|
+
// Don't compile-away addition and subtraction.
|
|
2550
|
+
EIGEN_OPTIMIZATION_BARRIER(rint_a);
|
|
2551
|
+
rint_a = psub(rint_a, cst_limit);
|
|
2552
|
+
rint_a = por(rint_a, sign_a);
|
|
2553
|
+
// If greater than limit (or NaN), simply return a.
|
|
2554
|
+
Packet mask = pcmp_lt(abs_a, cst_limit);
|
|
2555
|
+
Packet result = pselect(mask, rint_a, a);
|
|
2556
|
+
return result;
|
|
2557
|
+
}
|
|
2558
|
+
|
|
2559
|
+
template <typename Packet>
|
|
2560
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_floor(const Packet& a) {
|
|
2561
|
+
using Scalar = typename unpacket_traits<Packet>::type;
|
|
2562
|
+
const Packet cst_1 = pset1<Packet>(Scalar(1));
|
|
2563
|
+
Packet rint_a = generic_rint(a);
|
|
2564
|
+
// if a < rint(a), then rint(a) == ceil(a)
|
|
2565
|
+
Packet mask = pcmp_lt(a, rint_a);
|
|
2566
|
+
Packet offset = pand(cst_1, mask);
|
|
2567
|
+
Packet result = psub(rint_a, offset);
|
|
2568
|
+
return result;
|
|
2569
|
+
}
|
|
2570
|
+
|
|
2571
|
+
template <typename Packet>
|
|
2572
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_ceil(const Packet& a) {
|
|
2573
|
+
using Scalar = typename unpacket_traits<Packet>::type;
|
|
2574
|
+
const Packet cst_1 = pset1<Packet>(Scalar(1));
|
|
2575
|
+
const Packet sign_mask = pset1<Packet>(static_cast<Scalar>(-0.0));
|
|
2576
|
+
Packet rint_a = generic_rint(a);
|
|
2577
|
+
// if rint(a) < a, then rint(a) == floor(a)
|
|
2578
|
+
Packet mask = pcmp_lt(rint_a, a);
|
|
2579
|
+
Packet offset = pand(cst_1, mask);
|
|
2580
|
+
Packet result = padd(rint_a, offset);
|
|
2581
|
+
// Signed zero must remain signed (e.g. ceil(-0.02) == -0).
|
|
2582
|
+
result = por(result, pand(sign_mask, a));
|
|
2583
|
+
return result;
|
|
2584
|
+
}
|
|
2585
|
+
|
|
2586
|
+
template <typename Packet>
|
|
2587
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_trunc(const Packet& a) {
|
|
2588
|
+
Packet abs_a = pabs(a);
|
|
2589
|
+
Packet sign_a = pandnot(a, abs_a);
|
|
2590
|
+
Packet floor_abs_a = generic_floor(abs_a);
|
|
2591
|
+
Packet result = por(floor_abs_a, sign_a);
|
|
2592
|
+
return result;
|
|
2593
|
+
}
|
|
2594
|
+
|
|
2595
|
+
template <typename Packet>
|
|
2596
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_round(const Packet& a) {
|
|
2597
|
+
using Scalar = typename unpacket_traits<Packet>::type;
|
|
2598
|
+
const Packet cst_half = pset1<Packet>(Scalar(0.5));
|
|
2599
|
+
const Packet cst_1 = pset1<Packet>(Scalar(1));
|
|
2600
|
+
Packet abs_a = pabs(a);
|
|
2601
|
+
Packet sign_a = pandnot(a, abs_a);
|
|
2602
|
+
Packet floor_abs_a = generic_floor(abs_a);
|
|
2603
|
+
Packet diff = psub(abs_a, floor_abs_a);
|
|
2604
|
+
Packet mask = pcmp_le(cst_half, diff);
|
|
2605
|
+
Packet offset = pand(cst_1, mask);
|
|
2606
|
+
Packet result = padd(floor_abs_a, offset);
|
|
2607
|
+
result = por(result, sign_a);
|
|
2608
|
+
return result;
|
|
2609
|
+
}
|
|
2610
|
+
|
|
2611
|
+
template <typename Packet>
|
|
2612
|
+
struct nearest_integer_packetop_impl<Packet, /*IsScalar*/ false, /*IsInteger*/ false> {
|
|
2613
|
+
using Scalar = typename unpacket_traits<Packet>::type;
|
|
2614
|
+
static_assert(packet_traits<Scalar>::HasRound, "Generic nearest integer functions are disabled for this type.");
|
|
2615
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_floor(const Packet& x) { return generic_floor(x); }
|
|
2616
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_ceil(const Packet& x) { return generic_ceil(x); }
|
|
2617
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_rint(const Packet& x) { return generic_rint(x); }
|
|
2618
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_round(const Packet& x) { return generic_round(x); }
|
|
2619
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_trunc(const Packet& x) { return generic_trunc(x); }
|
|
2620
|
+
};
|
|
2621
|
+
|
|
2622
|
+
template <typename Packet>
|
|
2623
|
+
struct nearest_integer_packetop_impl<Packet, /*IsScalar*/ false, /*IsInteger*/ true> {
|
|
2624
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_floor(const Packet& x) { return x; }
|
|
2625
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_ceil(const Packet& x) { return x; }
|
|
2626
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_rint(const Packet& x) { return x; }
|
|
2627
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_round(const Packet& x) { return x; }
|
|
2628
|
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_trunc(const Packet& x) { return x; }
|
|
2629
|
+
};
|
|
2630
|
+
|
|
2631
|
+
} // end namespace internal
|
|
2632
|
+
} // end namespace Eigen
|
|
1648
2633
|
|
|
1649
|
-
#endif
|
|
2634
|
+
#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
|