tomoto 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/README.md +8 -10
- data/ext/tomoto/ct.cpp +11 -11
- data/ext/tomoto/dmr.cpp +14 -13
- data/ext/tomoto/dt.cpp +14 -14
- data/ext/tomoto/extconf.rb +7 -5
- data/ext/tomoto/gdmr.cpp +7 -7
- data/ext/tomoto/hdp.cpp +9 -9
- data/ext/tomoto/hlda.cpp +13 -13
- data/ext/tomoto/hpa.cpp +5 -5
- data/ext/tomoto/lda.cpp +42 -39
- data/ext/tomoto/llda.cpp +6 -6
- data/ext/tomoto/mglda.cpp +15 -15
- data/ext/tomoto/pa.cpp +6 -6
- data/ext/tomoto/plda.cpp +6 -6
- data/ext/tomoto/slda.cpp +8 -8
- data/ext/tomoto/{ext.cpp → tomoto.cpp} +8 -8
- data/ext/tomoto/utils.h +16 -70
- data/lib/tomoto/version.rb +1 -1
- data/lib/tomoto.rb +5 -1
- data/vendor/EigenRand/EigenRand/Core.h +10 -10
- data/vendor/EigenRand/EigenRand/Dists/Basic.h +208 -9
- data/vendor/EigenRand/EigenRand/Dists/Discrete.h +52 -31
- data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +9 -8
- data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +28 -21
- data/vendor/EigenRand/EigenRand/EigenRand +11 -6
- data/vendor/EigenRand/EigenRand/Macro.h +13 -7
- data/vendor/EigenRand/EigenRand/MorePacketMath.h +348 -740
- data/vendor/EigenRand/EigenRand/MvDists/Multinomial.h +5 -3
- data/vendor/EigenRand/EigenRand/MvDists/MvNormal.h +9 -3
- data/vendor/EigenRand/EigenRand/PacketFilter.h +11 -253
- data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +21 -47
- data/vendor/EigenRand/EigenRand/RandUtils.h +50 -344
- data/vendor/EigenRand/EigenRand/arch/AVX/MorePacketMath.h +619 -0
- data/vendor/EigenRand/EigenRand/arch/AVX/PacketFilter.h +149 -0
- data/vendor/EigenRand/EigenRand/arch/AVX/RandUtils.h +228 -0
- data/vendor/EigenRand/EigenRand/arch/NEON/MorePacketMath.h +473 -0
- data/vendor/EigenRand/EigenRand/arch/NEON/PacketFilter.h +142 -0
- data/vendor/EigenRand/EigenRand/arch/NEON/RandUtils.h +126 -0
- data/vendor/EigenRand/EigenRand/arch/SSE/MorePacketMath.h +501 -0
- data/vendor/EigenRand/EigenRand/arch/SSE/PacketFilter.h +133 -0
- data/vendor/EigenRand/EigenRand/arch/SSE/RandUtils.h +120 -0
- data/vendor/EigenRand/EigenRand/doc.h +24 -12
- data/vendor/EigenRand/README.md +57 -4
- data/vendor/eigen/COPYING.APACHE +203 -0
- data/vendor/eigen/COPYING.BSD +1 -1
- data/vendor/eigen/COPYING.MINPACK +51 -52
- data/vendor/eigen/Eigen/Cholesky +0 -1
- data/vendor/eigen/Eigen/Core +112 -265
- data/vendor/eigen/Eigen/Eigenvalues +2 -3
- data/vendor/eigen/Eigen/Geometry +5 -8
- data/vendor/eigen/Eigen/Householder +0 -1
- data/vendor/eigen/Eigen/Jacobi +0 -1
- data/vendor/eigen/Eigen/KLUSupport +41 -0
- data/vendor/eigen/Eigen/LU +2 -5
- data/vendor/eigen/Eigen/OrderingMethods +0 -3
- data/vendor/eigen/Eigen/PaStiXSupport +1 -0
- data/vendor/eigen/Eigen/PardisoSupport +0 -0
- data/vendor/eigen/Eigen/QR +2 -3
- data/vendor/eigen/Eigen/QtAlignedMalloc +0 -1
- data/vendor/eigen/Eigen/SVD +0 -1
- data/vendor/eigen/Eigen/Sparse +0 -2
- data/vendor/eigen/Eigen/SparseCholesky +0 -8
- data/vendor/eigen/Eigen/SparseLU +4 -0
- data/vendor/eigen/Eigen/SparseQR +0 -1
- data/vendor/eigen/Eigen/src/Cholesky/LDLT.h +42 -27
- data/vendor/eigen/Eigen/src/Cholesky/LLT.h +39 -23
- data/vendor/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +90 -47
- data/vendor/eigen/Eigen/src/Core/ArithmeticSequence.h +413 -0
- data/vendor/eigen/Eigen/src/Core/Array.h +99 -11
- data/vendor/eigen/Eigen/src/Core/ArrayBase.h +3 -3
- data/vendor/eigen/Eigen/src/Core/ArrayWrapper.h +21 -21
- data/vendor/eigen/Eigen/src/Core/Assign.h +1 -1
- data/vendor/eigen/Eigen/src/Core/AssignEvaluator.h +125 -50
- data/vendor/eigen/Eigen/src/Core/Assign_MKL.h +10 -10
- data/vendor/eigen/Eigen/src/Core/BandMatrix.h +16 -16
- data/vendor/eigen/Eigen/src/Core/Block.h +56 -60
- data/vendor/eigen/Eigen/src/Core/BooleanRedux.h +29 -31
- data/vendor/eigen/Eigen/src/Core/CommaInitializer.h +7 -3
- data/vendor/eigen/Eigen/src/Core/CoreEvaluators.h +325 -272
- data/vendor/eigen/Eigen/src/Core/CoreIterators.h +5 -0
- data/vendor/eigen/Eigen/src/Core/CwiseBinaryOp.h +21 -22
- data/vendor/eigen/Eigen/src/Core/CwiseNullaryOp.h +153 -18
- data/vendor/eigen/Eigen/src/Core/CwiseUnaryOp.h +6 -6
- data/vendor/eigen/Eigen/src/Core/CwiseUnaryView.h +14 -10
- data/vendor/eigen/Eigen/src/Core/DenseBase.h +132 -42
- data/vendor/eigen/Eigen/src/Core/DenseCoeffsBase.h +25 -21
- data/vendor/eigen/Eigen/src/Core/DenseStorage.h +153 -71
- data/vendor/eigen/Eigen/src/Core/Diagonal.h +21 -23
- data/vendor/eigen/Eigen/src/Core/DiagonalMatrix.h +50 -2
- data/vendor/eigen/Eigen/src/Core/DiagonalProduct.h +1 -1
- data/vendor/eigen/Eigen/src/Core/Dot.h +10 -10
- data/vendor/eigen/Eigen/src/Core/EigenBase.h +10 -9
- data/vendor/eigen/Eigen/src/Core/ForceAlignedAccess.h +8 -4
- data/vendor/eigen/Eigen/src/Core/Fuzzy.h +3 -3
- data/vendor/eigen/Eigen/src/Core/GeneralProduct.h +20 -10
- data/vendor/eigen/Eigen/src/Core/GenericPacketMath.h +599 -152
- data/vendor/eigen/Eigen/src/Core/GlobalFunctions.h +40 -33
- data/vendor/eigen/Eigen/src/Core/IO.h +40 -7
- data/vendor/eigen/Eigen/src/Core/IndexedView.h +237 -0
- data/vendor/eigen/Eigen/src/Core/Inverse.h +9 -10
- data/vendor/eigen/Eigen/src/Core/Map.h +7 -7
- data/vendor/eigen/Eigen/src/Core/MapBase.h +10 -3
- data/vendor/eigen/Eigen/src/Core/MathFunctions.h +767 -125
- data/vendor/eigen/Eigen/src/Core/MathFunctionsImpl.h +118 -19
- data/vendor/eigen/Eigen/src/Core/Matrix.h +131 -25
- data/vendor/eigen/Eigen/src/Core/MatrixBase.h +21 -3
- data/vendor/eigen/Eigen/src/Core/NestByValue.h +25 -50
- data/vendor/eigen/Eigen/src/Core/NoAlias.h +4 -3
- data/vendor/eigen/Eigen/src/Core/NumTraits.h +107 -20
- data/vendor/eigen/Eigen/src/Core/PartialReduxEvaluator.h +232 -0
- data/vendor/eigen/Eigen/src/Core/PermutationMatrix.h +3 -31
- data/vendor/eigen/Eigen/src/Core/PlainObjectBase.h +152 -59
- data/vendor/eigen/Eigen/src/Core/Product.h +30 -25
- data/vendor/eigen/Eigen/src/Core/ProductEvaluators.h +192 -125
- data/vendor/eigen/Eigen/src/Core/Random.h +37 -1
- data/vendor/eigen/Eigen/src/Core/Redux.h +180 -170
- data/vendor/eigen/Eigen/src/Core/Ref.h +121 -23
- data/vendor/eigen/Eigen/src/Core/Replicate.h +8 -8
- data/vendor/eigen/Eigen/src/Core/Reshaped.h +454 -0
- data/vendor/eigen/Eigen/src/Core/ReturnByValue.h +7 -5
- data/vendor/eigen/Eigen/src/Core/Reverse.h +18 -12
- data/vendor/eigen/Eigen/src/Core/Select.h +8 -6
- data/vendor/eigen/Eigen/src/Core/SelfAdjointView.h +33 -20
- data/vendor/eigen/Eigen/src/Core/Solve.h +14 -14
- data/vendor/eigen/Eigen/src/Core/SolveTriangular.h +16 -16
- data/vendor/eigen/Eigen/src/Core/SolverBase.h +41 -3
- data/vendor/eigen/Eigen/src/Core/StableNorm.h +100 -70
- data/vendor/eigen/Eigen/src/Core/StlIterators.h +463 -0
- data/vendor/eigen/Eigen/src/Core/Stride.h +9 -4
- data/vendor/eigen/Eigen/src/Core/Swap.h +5 -4
- data/vendor/eigen/Eigen/src/Core/Transpose.h +88 -27
- data/vendor/eigen/Eigen/src/Core/Transpositions.h +26 -47
- data/vendor/eigen/Eigen/src/Core/TriangularMatrix.h +93 -75
- data/vendor/eigen/Eigen/src/Core/VectorBlock.h +5 -5
- data/vendor/eigen/Eigen/src/Core/VectorwiseOp.h +159 -70
- data/vendor/eigen/Eigen/src/Core/Visitor.h +137 -29
- data/vendor/eigen/Eigen/src/Core/arch/AVX/Complex.h +50 -129
- data/vendor/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +126 -337
- data/vendor/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +1092 -155
- data/vendor/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +65 -1
- data/vendor/eigen/Eigen/src/Core/arch/AVX512/Complex.h +422 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +207 -236
- data/vendor/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1482 -495
- data/vendor/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +89 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +152 -165
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +19 -251
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2937 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +221 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +629 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2042 -392
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/Complex.h +235 -80
- data/vendor/eigen/Eigen/src/Core/arch/Default/BFloat16.h +700 -0
- data/vendor/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +102 -14
- data/vendor/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1649 -0
- data/vendor/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +110 -0
- data/vendor/eigen/Eigen/src/Core/arch/Default/Half.h +942 -0
- data/vendor/eigen/Eigen/src/Core/arch/Default/Settings.h +1 -1
- data/vendor/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +120 -0
- data/vendor/eigen/Eigen/src/Core/arch/{CUDA → GPU}/MathFunctions.h +16 -4
- data/vendor/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1685 -0
- data/vendor/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +80 -0
- data/vendor/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
- data/vendor/eigen/Eigen/src/Core/arch/MSA/Complex.h +648 -0
- data/vendor/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +387 -0
- data/vendor/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1233 -0
- data/vendor/eigen/Eigen/src/Core/arch/NEON/Complex.h +313 -219
- data/vendor/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +183 -0
- data/vendor/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +54 -70
- data/vendor/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4376 -549
- data/vendor/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1419 -0
- data/vendor/eigen/Eigen/src/Core/arch/SSE/Complex.h +59 -179
- data/vendor/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +65 -428
- data/vendor/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +893 -283
- data/vendor/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +65 -0
- data/vendor/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +44 -0
- data/vendor/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +752 -0
- data/vendor/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +49 -0
- data/vendor/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +232 -0
- data/vendor/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +301 -0
- data/vendor/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +670 -0
- data/vendor/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +694 -0
- data/vendor/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +85 -0
- data/vendor/eigen/Eigen/src/Core/arch/ZVector/Complex.h +212 -183
- data/vendor/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +101 -5
- data/vendor/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +510 -395
- data/vendor/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +11 -2
- data/vendor/eigen/Eigen/src/Core/functors/BinaryFunctors.h +112 -46
- data/vendor/eigen/Eigen/src/Core/functors/NullaryFunctors.h +31 -30
- data/vendor/eigen/Eigen/src/Core/functors/StlFunctors.h +32 -2
- data/vendor/eigen/Eigen/src/Core/functors/UnaryFunctors.h +355 -16
- data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1075 -586
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +49 -24
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +41 -35
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +6 -6
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +4 -2
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +382 -483
- data/vendor/eigen/Eigen/src/Core/products/Parallelizer.h +22 -5
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +53 -30
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +16 -8
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +8 -6
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointProduct.h +4 -4
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +5 -4
- data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +33 -27
- data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +14 -12
- data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +36 -34
- data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +8 -4
- data/vendor/eigen/Eigen/src/Core/products/TriangularSolverVector.h +13 -10
- data/vendor/eigen/Eigen/src/Core/util/BlasUtil.h +304 -119
- data/vendor/eigen/Eigen/src/Core/util/ConfigureVectorization.h +512 -0
- data/vendor/eigen/Eigen/src/Core/util/Constants.h +25 -9
- data/vendor/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +26 -3
- data/vendor/eigen/Eigen/src/Core/util/ForwardDeclarations.h +29 -9
- data/vendor/eigen/Eigen/src/Core/util/IndexedViewHelper.h +186 -0
- data/vendor/eigen/Eigen/src/Core/util/IntegralConstant.h +272 -0
- data/vendor/eigen/Eigen/src/Core/util/MKL_support.h +8 -1
- data/vendor/eigen/Eigen/src/Core/util/Macros.h +709 -246
- data/vendor/eigen/Eigen/src/Core/util/Memory.h +222 -52
- data/vendor/eigen/Eigen/src/Core/util/Meta.h +355 -77
- data/vendor/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +5 -1
- data/vendor/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
- data/vendor/eigen/Eigen/src/Core/util/StaticAssert.h +8 -5
- data/vendor/eigen/Eigen/src/Core/util/SymbolicIndex.h +293 -0
- data/vendor/eigen/Eigen/src/Core/util/XprHelper.h +65 -30
- data/vendor/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +1 -1
- data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +7 -4
- data/vendor/eigen/Eigen/src/Eigenvalues/EigenSolver.h +2 -2
- data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +1 -1
- data/vendor/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +2 -2
- data/vendor/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +2 -2
- data/vendor/eigen/Eigen/src/Eigenvalues/RealQZ.h +9 -6
- data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur.h +21 -9
- data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +77 -43
- data/vendor/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +20 -15
- data/vendor/eigen/Eigen/src/Geometry/AlignedBox.h +99 -5
- data/vendor/eigen/Eigen/src/Geometry/AngleAxis.h +4 -4
- data/vendor/eigen/Eigen/src/Geometry/EulerAngles.h +3 -3
- data/vendor/eigen/Eigen/src/Geometry/Homogeneous.h +15 -11
- data/vendor/eigen/Eigen/src/Geometry/Hyperplane.h +1 -1
- data/vendor/eigen/Eigen/src/Geometry/OrthoMethods.h +3 -2
- data/vendor/eigen/Eigen/src/Geometry/ParametrizedLine.h +39 -2
- data/vendor/eigen/Eigen/src/Geometry/Quaternion.h +70 -14
- data/vendor/eigen/Eigen/src/Geometry/Rotation2D.h +3 -3
- data/vendor/eigen/Eigen/src/Geometry/Scaling.h +23 -5
- data/vendor/eigen/Eigen/src/Geometry/Transform.h +88 -67
- data/vendor/eigen/Eigen/src/Geometry/Translation.h +6 -12
- data/vendor/eigen/Eigen/src/Geometry/Umeyama.h +1 -1
- data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +168 -0
- data/vendor/eigen/Eigen/src/Householder/BlockHouseholder.h +9 -2
- data/vendor/eigen/Eigen/src/Householder/Householder.h +8 -4
- data/vendor/eigen/Eigen/src/Householder/HouseholderSequence.h +123 -48
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +15 -15
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +7 -23
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +5 -22
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +41 -47
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +51 -60
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +70 -20
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +2 -20
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +11 -9
- data/vendor/eigen/Eigen/src/Jacobi/Jacobi.h +31 -10
- data/vendor/eigen/Eigen/src/KLUSupport/KLUSupport.h +358 -0
- data/vendor/eigen/Eigen/src/LU/Determinant.h +35 -19
- data/vendor/eigen/Eigen/src/LU/FullPivLU.h +29 -43
- data/vendor/eigen/Eigen/src/LU/InverseImpl.h +25 -8
- data/vendor/eigen/Eigen/src/LU/PartialPivLU.h +71 -58
- data/vendor/eigen/Eigen/src/LU/arch/InverseSize4.h +351 -0
- data/vendor/eigen/Eigen/src/OrderingMethods/Amd.h +7 -17
- data/vendor/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +297 -277
- data/vendor/eigen/Eigen/src/OrderingMethods/Ordering.h +6 -10
- data/vendor/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +1 -1
- data/vendor/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +9 -7
- data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR.h +41 -20
- data/vendor/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +100 -27
- data/vendor/eigen/Eigen/src/QR/FullPivHouseholderQR.h +59 -22
- data/vendor/eigen/Eigen/src/QR/HouseholderQR.h +48 -23
- data/vendor/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +25 -3
- data/vendor/eigen/Eigen/src/SVD/BDCSVD.h +183 -63
- data/vendor/eigen/Eigen/src/SVD/JacobiSVD.h +22 -14
- data/vendor/eigen/Eigen/src/SVD/SVDBase.h +83 -22
- data/vendor/eigen/Eigen/src/SVD/UpperBidiagonalization.h +3 -3
- data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +17 -9
- data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +12 -37
- data/vendor/eigen/Eigen/src/SparseCore/AmbiVector.h +3 -2
- data/vendor/eigen/Eigen/src/SparseCore/CompressedStorage.h +16 -0
- data/vendor/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +6 -6
- data/vendor/eigen/Eigen/src/SparseCore/SparseAssign.h +81 -27
- data/vendor/eigen/Eigen/src/SparseCore/SparseBlock.h +25 -57
- data/vendor/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +40 -11
- data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +11 -15
- data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +4 -2
- data/vendor/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +30 -8
- data/vendor/eigen/Eigen/src/SparseCore/SparseMatrix.h +126 -11
- data/vendor/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +5 -12
- data/vendor/eigen/Eigen/src/SparseCore/SparseProduct.h +13 -1
- data/vendor/eigen/Eigen/src/SparseCore/SparseRef.h +7 -7
- data/vendor/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +5 -2
- data/vendor/eigen/Eigen/src/SparseCore/SparseUtil.h +8 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseVector.h +1 -1
- data/vendor/eigen/Eigen/src/SparseCore/SparseView.h +1 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU.h +162 -12
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +1 -1
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +76 -2
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +2 -2
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +1 -1
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +1 -1
- data/vendor/eigen/Eigen/src/SparseQR/SparseQR.h +19 -6
- data/vendor/eigen/Eigen/src/StlSupport/StdDeque.h +2 -12
- data/vendor/eigen/Eigen/src/StlSupport/StdList.h +2 -2
- data/vendor/eigen/Eigen/src/StlSupport/StdVector.h +2 -2
- data/vendor/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +6 -8
- data/vendor/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +175 -39
- data/vendor/eigen/Eigen/src/misc/lapacke.h +5 -4
- data/vendor/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +28 -2
- data/vendor/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +155 -11
- data/vendor/eigen/Eigen/src/plugins/BlockMethods.h +626 -242
- data/vendor/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +14 -0
- data/vendor/eigen/Eigen/src/plugins/IndexedViewMethods.h +262 -0
- data/vendor/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +4 -4
- data/vendor/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +10 -0
- data/vendor/eigen/Eigen/src/plugins/ReshapedMethods.h +149 -0
- data/vendor/eigen/README.md +2 -0
- data/vendor/eigen/bench/btl/README +1 -1
- data/vendor/eigen/bench/tensors/README +6 -7
- data/vendor/eigen/ci/README.md +56 -0
- data/vendor/eigen/demos/mix_eigen_and_c/README +1 -1
- data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +213 -158
- data/vendor/eigen/unsupported/README.txt +1 -1
- data/vendor/tomotopy/README.kr.rst +78 -0
- data/vendor/tomotopy/README.rst +75 -0
- data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +2 -2
- data/vendor/tomotopy/src/Labeling/Phraser.hpp +4 -4
- data/vendor/tomotopy/src/TopicModel/CTModel.hpp +7 -3
- data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +7 -3
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +6 -3
- data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +2 -2
- data/vendor/tomotopy/src/TopicModel/HDP.h +1 -0
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +57 -6
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +6 -3
- data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +3 -2
- data/vendor/tomotopy/src/TopicModel/LDA.h +3 -3
- data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +5 -5
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +50 -19
- data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +6 -2
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +3 -2
- data/vendor/tomotopy/src/TopicModel/PAModel.hpp +1 -1
- data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +6 -2
- data/vendor/tomotopy/src/TopicModel/PT.h +3 -1
- data/vendor/tomotopy/src/TopicModel/PTModel.hpp +36 -3
- data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +6 -3
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +55 -26
- data/vendor/tomotopy/src/Utils/AliasMethod.hpp +5 -4
- data/vendor/tomotopy/src/Utils/Dictionary.h +2 -2
- data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +36 -1
- data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +1 -1
- data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +1 -1
- data/vendor/tomotopy/src/Utils/exception.h +6 -0
- data/vendor/tomotopy/src/Utils/math.h +2 -2
- data/vendor/tomotopy/src/Utils/sample.hpp +14 -12
- data/vendor/tomotopy/src/Utils/serializer.hpp +30 -5
- data/vendor/tomotopy/src/Utils/sse_gamma.h +0 -3
- metadata +64 -18
- data/vendor/eigen/Eigen/CMakeLists.txt +0 -19
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -674
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
- data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
- data/vendor/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
|
@@ -2,10 +2,10 @@
|
|
|
2
2
|
* @file MorePacketMath.h
|
|
3
3
|
* @author bab2min (bab2min@gmail.com)
|
|
4
4
|
* @brief
|
|
5
|
-
* @version 0.3.
|
|
6
|
-
* @date
|
|
5
|
+
* @version 0.3.3
|
|
6
|
+
* @date 2021-03-31
|
|
7
7
|
*
|
|
8
|
-
* @copyright Copyright (c) 2020
|
|
8
|
+
* @copyright Copyright (c) 2020-2021
|
|
9
9
|
*
|
|
10
10
|
*/
|
|
11
11
|
|
|
@@ -14,14 +14,26 @@
|
|
|
14
14
|
|
|
15
15
|
#include <Eigen/Dense>
|
|
16
16
|
|
|
17
|
+
#define EIGENRAND_PRINT_PACKET(p) do { using _MTy = typename std::remove_const<typename std::remove_reference<decltype(p)>::type>::type; typename std::conditional<Eigen::internal::IsFloatPacket<_MTy>::value, float, typename std::conditional<Eigen::internal::IsDoublePacket<_MTy>::value, double, int>::type>::type f[4]; Eigen::internal::pstore(f, p); std::cout << #p " " << f[0] << " " << f[1] << " " << f[2] << " " << f[3] << std::endl; } while(0)
|
|
18
|
+
|
|
17
19
|
namespace Eigen
|
|
18
20
|
{
|
|
19
21
|
namespace internal
|
|
20
22
|
{
|
|
23
|
+
template<typename Ty>
|
|
24
|
+
struct IsIntPacket : std::false_type {};
|
|
25
|
+
|
|
26
|
+
template<typename Ty>
|
|
27
|
+
struct IsFloatPacket : std::false_type {};
|
|
28
|
+
|
|
29
|
+
template<typename Ty>
|
|
30
|
+
struct IsDoublePacket : std::false_type {};
|
|
31
|
+
|
|
32
|
+
template<typename Ty>
|
|
33
|
+
struct HalfPacket;
|
|
34
|
+
|
|
21
35
|
template<typename Packet>
|
|
22
|
-
struct reinterpreter
|
|
23
|
-
{
|
|
24
|
-
};
|
|
36
|
+
struct reinterpreter{};
|
|
25
37
|
|
|
26
38
|
template<typename Packet>
|
|
27
39
|
inline auto reinterpret_to_float(const Packet& x)
|
|
@@ -44,13 +56,40 @@ namespace Eigen
|
|
|
44
56
|
return reinterpreter<Packet>{}.to_int(x);
|
|
45
57
|
}
|
|
46
58
|
|
|
59
|
+
template<typename Packet>
|
|
60
|
+
EIGEN_STRONG_INLINE void split_two(const Packet& p, typename HalfPacket<Packet>::type& a, typename HalfPacket<Packet>::type& b);
|
|
61
|
+
|
|
47
62
|
template<typename Packet>
|
|
48
63
|
EIGEN_STRONG_INLINE Packet pseti64(uint64_t a);
|
|
49
64
|
|
|
65
|
+
template<typename Packet>
|
|
66
|
+
EIGEN_STRONG_INLINE Packet padd64(const Packet& a, const Packet& b);
|
|
67
|
+
|
|
68
|
+
template<typename Packet>
|
|
69
|
+
EIGEN_STRONG_INLINE Packet psub64(const Packet& a, const Packet& b);
|
|
70
|
+
|
|
71
|
+
template <typename SrcPacket, typename TgtPacket>
|
|
72
|
+
EIGEN_STRONG_INLINE TgtPacket pcast64(const SrcPacket& a);
|
|
73
|
+
|
|
50
74
|
template<typename Packet>
|
|
51
75
|
EIGEN_STRONG_INLINE Packet pcmpeq(const Packet& a, const Packet& b);
|
|
52
76
|
|
|
53
77
|
template<typename Packet>
|
|
78
|
+
struct BitShifter {};
|
|
79
|
+
|
|
80
|
+
template<int b, typename Packet>
|
|
81
|
+
EIGEN_STRONG_INLINE Packet psll(const Packet& a);
|
|
82
|
+
|
|
83
|
+
template<int _b, typename Packet>
|
|
84
|
+
EIGEN_STRONG_INLINE Packet psrl(const Packet& a, int b = _b);
|
|
85
|
+
|
|
86
|
+
template<int b, typename Packet>
|
|
87
|
+
EIGEN_STRONG_INLINE Packet psll64(const Packet& a);
|
|
88
|
+
|
|
89
|
+
template<int b, typename Packet>
|
|
90
|
+
EIGEN_STRONG_INLINE Packet psrl64(const Packet& a);
|
|
91
|
+
|
|
92
|
+
/*template<typename Packet>
|
|
54
93
|
EIGEN_STRONG_INLINE Packet psll(const Packet& a, int b);
|
|
55
94
|
|
|
56
95
|
template<typename Packet>
|
|
@@ -60,12 +99,34 @@ namespace Eigen
|
|
|
60
99
|
EIGEN_STRONG_INLINE Packet psll64(const Packet& a, int b);
|
|
61
100
|
|
|
62
101
|
template<typename Packet>
|
|
63
|
-
EIGEN_STRONG_INLINE Packet psrl64(const Packet& a, int b)
|
|
102
|
+
EIGEN_STRONG_INLINE Packet psrl64(const Packet& a, int b);*/
|
|
64
103
|
|
|
65
104
|
template<typename Packet>
|
|
66
105
|
EIGEN_STRONG_INLINE int pmovemask(const Packet& a);
|
|
67
106
|
|
|
68
|
-
template
|
|
107
|
+
template<typename Packet>
|
|
108
|
+
EIGEN_STRONG_INLINE typename std::enable_if<
|
|
109
|
+
IsFloatPacket<Packet>::value, Packet
|
|
110
|
+
>::type pext_sign(const Packet& a)
|
|
111
|
+
{
|
|
112
|
+
using IntPacket = decltype(reinterpret_to_int(a));
|
|
113
|
+
return reinterpret_to_float(
|
|
114
|
+
pand(reinterpret_to_int(a), pset1<IntPacket>(0x80000000))
|
|
115
|
+
);
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
template<typename Packet>
|
|
119
|
+
EIGEN_STRONG_INLINE typename std::enable_if<
|
|
120
|
+
IsDoublePacket<Packet>::value, Packet
|
|
121
|
+
>::type pext_sign(const Packet& a)
|
|
122
|
+
{
|
|
123
|
+
using IntPacket = decltype(reinterpret_to_int(a));
|
|
124
|
+
return reinterpret_to_double(
|
|
125
|
+
pand(reinterpret_to_int(a), pseti64<IntPacket>(0x8000000000000000))
|
|
126
|
+
);
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
/*template<>
|
|
69
130
|
EIGEN_STRONG_INLINE uint64_t psll64<uint64_t>(const uint64_t& a, int b)
|
|
70
131
|
{
|
|
71
132
|
return a << b;
|
|
@@ -75,109 +136,11 @@ namespace Eigen
|
|
|
75
136
|
EIGEN_STRONG_INLINE uint64_t psrl64<uint64_t>(const uint64_t& a, int b)
|
|
76
137
|
{
|
|
77
138
|
return a >> b;
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
template<typename Packet>
|
|
81
|
-
EIGEN_STRONG_INLINE void psincos(Packet x, Packet &s, Packet &c)
|
|
82
|
-
{
|
|
83
|
-
Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
|
|
84
|
-
using IntPacket = decltype(reinterpret_to_int(x));
|
|
85
|
-
IntPacket emm0, emm2, emm4;
|
|
86
|
-
|
|
87
|
-
sign_bit_sin = x;
|
|
88
|
-
/* take the absolute value */
|
|
89
|
-
x = pabs(x);
|
|
90
|
-
/* extract the sign bit (upper one) */
|
|
91
|
-
sign_bit_sin = reinterpret_to_float(
|
|
92
|
-
pand(reinterpret_to_int(sign_bit_sin), pset1<IntPacket>(0x80000000))
|
|
93
|
-
);
|
|
94
|
-
|
|
95
|
-
/* scale by 4/Pi */
|
|
96
|
-
y = pmul(x, pset1<Packet>(1.27323954473516));
|
|
97
|
-
|
|
98
|
-
/* store the integer part of y in emm2 */
|
|
99
|
-
emm2 = pcast<Packet, IntPacket>(y);
|
|
100
|
-
|
|
101
|
-
/* j=(j+1) & (~1) (see the cephes sources) */
|
|
102
|
-
emm2 = padd(emm2, pset1<IntPacket>(1));
|
|
103
|
-
emm2 = pand(emm2, pset1<IntPacket>(~1));
|
|
104
|
-
y = pcast<IntPacket, Packet>(emm2);
|
|
105
|
-
|
|
106
|
-
emm4 = emm2;
|
|
107
|
-
|
|
108
|
-
/* get the swap sign flag for the sine */
|
|
109
|
-
emm0 = pand(emm2, pset1<IntPacket>(4));
|
|
110
|
-
emm0 = psll(emm0, 29);
|
|
111
|
-
Packet swap_sign_bit_sin = reinterpret_to_float(emm0);
|
|
112
|
-
|
|
113
|
-
/* get the polynom selection mask for the sine*/
|
|
114
|
-
emm2 = pand(emm2, pset1<IntPacket>(2));
|
|
115
|
-
|
|
116
|
-
emm2 = pcmpeq(emm2, pset1<IntPacket>(0));
|
|
117
|
-
Packet poly_mask = reinterpret_to_float(emm2);
|
|
118
|
-
|
|
119
|
-
/* The magic pass: "Extended precision modular arithmetic"
|
|
120
|
-
x = ((x - y * DP1) - y * DP2) - y * DP3; */
|
|
121
|
-
xmm1 = pset1<Packet>(-0.78515625);
|
|
122
|
-
xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
|
|
123
|
-
xmm3 = pset1<Packet>(-3.77489497744594108e-8);
|
|
124
|
-
xmm1 = pmul(y, xmm1);
|
|
125
|
-
xmm2 = pmul(y, xmm2);
|
|
126
|
-
xmm3 = pmul(y, xmm3);
|
|
127
|
-
x = padd(x, xmm1);
|
|
128
|
-
x = padd(x, xmm2);
|
|
129
|
-
x = padd(x, xmm3);
|
|
130
|
-
|
|
131
|
-
emm4 = psub(emm4, pset1<IntPacket>(2));
|
|
132
|
-
emm4 = pandnot(emm4, pset1<IntPacket>(4));
|
|
133
|
-
emm4 = psll(emm4, 29);
|
|
134
|
-
Packet sign_bit_cos = reinterpret_to_float(emm4);
|
|
135
|
-
sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
/* Evaluate the first polynom (0 <= x <= Pi/4) */
|
|
139
|
-
Packet z = pmul(x, x);
|
|
140
|
-
y = pset1<Packet>(2.443315711809948E-005);
|
|
141
|
-
|
|
142
|
-
y = pmul(y, z);
|
|
143
|
-
y = padd(y, pset1<Packet>(-1.388731625493765E-003));
|
|
144
|
-
y = pmul(y, z);
|
|
145
|
-
y = padd(y, pset1<Packet>(4.166664568298827E-002));
|
|
146
|
-
y = pmul(y, z);
|
|
147
|
-
y = pmul(y, z);
|
|
148
|
-
Packet tmp = pmul(z, pset1<Packet>(0.5));
|
|
149
|
-
y = psub(y, tmp);
|
|
150
|
-
y = padd(y, pset1<Packet>(1));
|
|
151
|
-
|
|
152
|
-
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
|
|
153
|
-
|
|
154
|
-
Packet y2 = pset1<Packet>(-1.9515295891E-4);
|
|
155
|
-
y2 = pmul(y2, z);
|
|
156
|
-
y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
|
|
157
|
-
y2 = pmul(y2, z);
|
|
158
|
-
y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
|
|
159
|
-
y2 = pmul(y2, z);
|
|
160
|
-
y2 = pmul(y2, x);
|
|
161
|
-
y2 = padd(y2, x);
|
|
162
|
-
|
|
163
|
-
/* select the correct result from the two polynoms */
|
|
164
|
-
xmm3 = poly_mask;
|
|
165
|
-
Packet ysin2 = pand(xmm3, y2);
|
|
166
|
-
Packet ysin1 = pandnot(xmm3, y);
|
|
167
|
-
y2 = psub(y2, ysin2);
|
|
168
|
-
y = psub(y, ysin1);
|
|
169
|
-
|
|
170
|
-
xmm1 = padd(ysin1, ysin2);
|
|
171
|
-
xmm2 = padd(y, y2);
|
|
172
|
-
|
|
173
|
-
/* update the sign */
|
|
174
|
-
s = pxor(xmm1, sign_bit_sin);
|
|
175
|
-
c = pxor(xmm2, sign_bit_cos);
|
|
176
|
-
}
|
|
139
|
+
}*/
|
|
177
140
|
|
|
178
141
|
// approximation : lgamma(z) ~= (z+2.5)ln(z+3) - z - 3 + 0.5 ln (2pi) + 1/12/(z + 3) - ln (z(z+1)(z+2))
|
|
179
142
|
template<typename Packet>
|
|
180
|
-
EIGEN_STRONG_INLINE Packet
|
|
143
|
+
EIGEN_STRONG_INLINE Packet plgamma_approx(const Packet& x)
|
|
181
144
|
{
|
|
182
145
|
auto x_3 = padd(x, pset1<Packet>(3));
|
|
183
146
|
auto ret = pmul(padd(x_3, pset1<Packet>(-0.5)), plog(x_3));
|
|
@@ -195,6 +158,9 @@ namespace Eigen
|
|
|
195
158
|
template<typename Packet>
|
|
196
159
|
EIGEN_STRONG_INLINE Packet pcmple(const Packet& a, const Packet& b);
|
|
197
160
|
|
|
161
|
+
template<typename Packet>
|
|
162
|
+
EIGEN_STRONG_INLINE Packet pbitnot(const Packet& a);
|
|
163
|
+
|
|
198
164
|
template<typename PacketIf, typename Packet>
|
|
199
165
|
EIGEN_STRONG_INLINE Packet pblendv(const PacketIf& ifPacket, const Packet& thenPacket, const Packet& elsePacket);
|
|
200
166
|
|
|
@@ -213,6 +179,9 @@ namespace Eigen
|
|
|
213
179
|
template<typename Packet>
|
|
214
180
|
EIGEN_STRONG_INLINE Packet pcmpeq64(const Packet& a, const Packet& b);
|
|
215
181
|
|
|
182
|
+
template<typename Packet>
|
|
183
|
+
EIGEN_STRONG_INLINE Packet pcmplt64(const Packet& a, const Packet& b);
|
|
184
|
+
|
|
216
185
|
template<typename Packet>
|
|
217
186
|
EIGEN_STRONG_INLINE Packet pmuluadd64(const Packet& a, uint64_t b, uint64_t c);
|
|
218
187
|
|
|
@@ -241,10 +210,10 @@ namespace Eigen
|
|
|
241
210
|
}
|
|
242
211
|
|
|
243
212
|
template<typename _Scalar>
|
|
244
|
-
struct
|
|
213
|
+
struct BitScalar;
|
|
245
214
|
|
|
246
215
|
template<>
|
|
247
|
-
struct
|
|
216
|
+
struct BitScalar<float>
|
|
248
217
|
{
|
|
249
218
|
float to_ur(uint32_t x)
|
|
250
219
|
{
|
|
@@ -264,7 +233,7 @@ namespace Eigen
|
|
|
264
233
|
};
|
|
265
234
|
|
|
266
235
|
template<>
|
|
267
|
-
struct
|
|
236
|
+
struct BitScalar<double>
|
|
268
237
|
{
|
|
269
238
|
double to_ur(uint64_t x)
|
|
270
239
|
{
|
|
@@ -291,720 +260,359 @@ namespace Eigen
|
|
|
291
260
|
|
|
292
261
|
EIGEN_STRONG_INLINE float2 bit_to_ur_float(uint64_t x)
|
|
293
262
|
{
|
|
294
|
-
|
|
263
|
+
BitScalar<float> bs;
|
|
295
264
|
float2 ret;
|
|
296
265
|
ret.f[0] = bs.to_ur(x & 0xFFFFFFFF);
|
|
297
266
|
ret.f[1] = bs.to_ur(x >> 32);
|
|
298
267
|
return ret;
|
|
299
268
|
}
|
|
300
|
-
}
|
|
301
|
-
}
|
|
302
|
-
|
|
303
|
-
#ifdef EIGEN_VECTORIZE_AVX
|
|
304
|
-
#include <immintrin.h>
|
|
305
269
|
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
template<>
|
|
311
|
-
struct reinterpreter<Packet8i>
|
|
270
|
+
template<typename Packet>
|
|
271
|
+
EIGEN_STRONG_INLINE typename std::enable_if<
|
|
272
|
+
IsFloatPacket<Packet>::value
|
|
273
|
+
>::type psincos(Packet x, Packet& s, Packet& c)
|
|
312
274
|
{
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
}
|
|
317
|
-
|
|
318
|
-
EIGEN_STRONG_INLINE Packet4d to_double(const Packet8i& x)
|
|
319
|
-
{
|
|
320
|
-
return _mm256_castsi256_pd(x);
|
|
321
|
-
}
|
|
275
|
+
Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
|
|
276
|
+
using IntPacket = decltype(reinterpret_to_int(x));
|
|
277
|
+
IntPacket emm0, emm2, emm4;
|
|
322
278
|
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
279
|
+
sign_bit_sin = x;
|
|
280
|
+
/* take the absolute value */
|
|
281
|
+
x = pabs(x);
|
|
282
|
+
/* extract the sign bit (upper one) */
|
|
283
|
+
sign_bit_sin = pext_sign(sign_bit_sin);
|
|
328
284
|
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
{
|
|
332
|
-
EIGEN_STRONG_INLINE Packet8f to_float(const Packet8f& x)
|
|
333
|
-
{
|
|
334
|
-
return x;
|
|
335
|
-
}
|
|
285
|
+
/* scale by 4/Pi */
|
|
286
|
+
y = pmul(x, pset1<Packet>(1.27323954473516));
|
|
336
287
|
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
return _mm256_castps_pd(x);
|
|
340
|
-
}
|
|
288
|
+
/* store the integer part of y in emm2 */
|
|
289
|
+
emm2 = pcast<Packet, IntPacket>(y);
|
|
341
290
|
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
};
|
|
291
|
+
/* j=(j+1) & (~1) (see the cephes sources) */
|
|
292
|
+
emm2 = padd(emm2, pset1<IntPacket>(1));
|
|
293
|
+
emm2 = pand(emm2, pset1<IntPacket>(~1));
|
|
294
|
+
y = pcast<IntPacket, Packet>(emm2);
|
|
347
295
|
|
|
348
|
-
|
|
349
|
-
struct reinterpreter<Packet4d>
|
|
350
|
-
{
|
|
351
|
-
EIGEN_STRONG_INLINE Packet8f to_float(const Packet4d& x)
|
|
352
|
-
{
|
|
353
|
-
return _mm256_castpd_ps(x);
|
|
354
|
-
}
|
|
296
|
+
emm4 = emm2;
|
|
355
297
|
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
298
|
+
/* get the swap sign flag for the sine */
|
|
299
|
+
emm0 = pand(emm2, pset1<IntPacket>(4));
|
|
300
|
+
emm0 = psll<29>(emm0);
|
|
301
|
+
Packet swap_sign_bit_sin = reinterpret_to_float(emm0);
|
|
360
302
|
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
return _mm256_castpd_si256(x);
|
|
364
|
-
}
|
|
365
|
-
};
|
|
303
|
+
/* get the polynom selection mask for the sine*/
|
|
304
|
+
emm2 = pand(emm2, pset1<IntPacket>(2));
|
|
366
305
|
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
a = _mm256_extractf128_si256(x, 0);
|
|
370
|
-
b = _mm256_extractf128_si256(x, 1);
|
|
371
|
-
}
|
|
306
|
+
emm2 = pcmpeq(emm2, pset1<IntPacket>(0));
|
|
307
|
+
Packet poly_mask = reinterpret_to_float(emm2);
|
|
372
308
|
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
309
|
+
/* The magic pass: "Extended precision modular arithmetic"
|
|
310
|
+
x = ((x - y * DP1) - y * DP2) - y * DP3; */
|
|
311
|
+
xmm1 = pset1<Packet>(-0.78515625);
|
|
312
|
+
xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
|
|
313
|
+
xmm3 = pset1<Packet>(-3.77489497744594108e-8);
|
|
314
|
+
xmm1 = pmul(y, xmm1);
|
|
315
|
+
xmm2 = pmul(y, xmm2);
|
|
316
|
+
xmm3 = pmul(y, xmm3);
|
|
317
|
+
x = padd(x, xmm1);
|
|
318
|
+
x = padd(x, xmm2);
|
|
319
|
+
x = padd(x, xmm3);
|
|
377
320
|
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
321
|
+
emm4 = psub(emm4, pset1<IntPacket>(2));
|
|
322
|
+
#if defined(EIGEN_VECTORIZE_NEON) || defined(EIGENRAND_EIGEN_34_MODE)
|
|
323
|
+
emm4 = pandnot(pset1<IntPacket>(4), emm4);
|
|
324
|
+
#else
|
|
325
|
+
emm4 = pandnot(emm4, pset1<IntPacket>(4));
|
|
326
|
+
#endif
|
|
327
|
+
emm4 = psll<29>(emm4);
|
|
328
|
+
Packet sign_bit_cos = reinterpret_to_float(emm4);
|
|
329
|
+
sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
|
|
383
330
|
|
|
384
|
-
EIGEN_STRONG_INLINE Packet8f combine_two(const Packet4f& a, const Packet4f& b)
|
|
385
|
-
{
|
|
386
|
-
return _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1);
|
|
387
|
-
}
|
|
388
331
|
|
|
332
|
+
/* Evaluate the first polynom (0 <= x <= Pi/4) */
|
|
333
|
+
Packet z = pmul(x, x);
|
|
334
|
+
y = pset1<Packet>(2.443315711809948E-005);
|
|
389
335
|
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
336
|
+
y = pmul(y, z);
|
|
337
|
+
y = padd(y, pset1<Packet>(-1.388731625493765E-003));
|
|
338
|
+
y = pmul(y, z);
|
|
339
|
+
y = padd(y, pset1<Packet>(4.166664568298827E-002));
|
|
340
|
+
y = pmul(y, z);
|
|
341
|
+
y = pmul(y, z);
|
|
342
|
+
Packet tmp = pmul(z, pset1<Packet>(0.5));
|
|
343
|
+
y = psub(y, tmp);
|
|
344
|
+
y = padd(y, pset1<Packet>(1));
|
|
399
345
|
|
|
400
|
-
|
|
401
|
-
EIGEN_STRONG_INLINE Packet8i pseti64<Packet8i>(uint64_t a)
|
|
402
|
-
{
|
|
403
|
-
return _mm256_set1_epi64x(a);
|
|
404
|
-
}
|
|
346
|
+
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
|
|
405
347
|
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
split_two(b, b1, b2);
|
|
415
|
-
return combine_two((Packet4i)_mm_cmpeq_epi32(a1, b1), (Packet4i)_mm_cmpeq_epi32(a2, b2));
|
|
416
|
-
#endif
|
|
417
|
-
}
|
|
348
|
+
Packet y2 = pset1<Packet>(-1.9515295891E-4);
|
|
349
|
+
y2 = pmul(y2, z);
|
|
350
|
+
y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
|
|
351
|
+
y2 = pmul(y2, z);
|
|
352
|
+
y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
|
|
353
|
+
y2 = pmul(y2, z);
|
|
354
|
+
y2 = pmul(y2, x);
|
|
355
|
+
y2 = padd(y2, x);
|
|
418
356
|
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
#
|
|
423
|
-
|
|
424
|
-
#else
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
}
|
|
357
|
+
/* select the correct result from the two polynoms */
|
|
358
|
+
xmm3 = poly_mask;
|
|
359
|
+
Packet ysin2 = pand(xmm3, y2);
|
|
360
|
+
#if defined(EIGEN_VECTORIZE_NEON) || defined(EIGENRAND_EIGEN_34_MODE)
|
|
361
|
+
Packet ysin1 = pandnot(y, xmm3);
|
|
362
|
+
#else
|
|
363
|
+
Packet ysin1 = pandnot(xmm3, y);
|
|
364
|
+
#endif
|
|
365
|
+
y2 = psub(y2, ysin2);
|
|
366
|
+
y = psub(y, ysin1);
|
|
430
367
|
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
{
|
|
434
|
-
#ifdef EIGEN_VECTORIZE_AVX2
|
|
435
|
-
return _mm256_srli_epi32(a, b);
|
|
436
|
-
#else
|
|
437
|
-
Packet4i a1, a2;
|
|
438
|
-
split_two(a, a1, a2);
|
|
439
|
-
return combine_two((Packet4i)_mm_srli_epi32(a1, b), (Packet4i)_mm_srli_epi32(a2, b));
|
|
440
|
-
#endif
|
|
441
|
-
}
|
|
368
|
+
xmm1 = padd(ysin1, ysin2);
|
|
369
|
+
xmm2 = padd(y, y2);
|
|
442
370
|
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
#ifdef EIGEN_VECTORIZE_AVX2
|
|
447
|
-
return _mm256_slli_epi64(a, b);
|
|
448
|
-
#else
|
|
449
|
-
Packet4i a1, a2;
|
|
450
|
-
split_two(a, a1, a2);
|
|
451
|
-
return combine_two((Packet4i)_mm_slli_epi64(a1, b), (Packet4i)_mm_slli_epi64(a2, b));
|
|
452
|
-
#endif
|
|
453
|
-
}
|
|
454
|
-
|
|
455
|
-
template<>
|
|
456
|
-
EIGEN_STRONG_INLINE Packet8i psrl64<Packet8i>(const Packet8i& a, int b)
|
|
457
|
-
{
|
|
458
|
-
#ifdef EIGEN_VECTORIZE_AVX2
|
|
459
|
-
return _mm256_srli_epi64(a, b);
|
|
460
|
-
#else
|
|
461
|
-
Packet4i a1, a2;
|
|
462
|
-
split_two(a, a1, a2);
|
|
463
|
-
return combine_two((Packet4i)_mm_srli_epi64(a1, b), (Packet4i)_mm_srli_epi64(a2, b));
|
|
464
|
-
#endif
|
|
465
|
-
}
|
|
466
|
-
|
|
467
|
-
template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b)
|
|
468
|
-
{
|
|
469
|
-
#ifdef EIGEN_VECTORIZE_AVX2
|
|
470
|
-
return _mm256_add_epi32(a, b);
|
|
471
|
-
#else
|
|
472
|
-
Packet4i a1, a2, b1, b2;
|
|
473
|
-
split_two(a, a1, a2);
|
|
474
|
-
split_two(b, b1, b2);
|
|
475
|
-
return combine_two((Packet4i)_mm_add_epi32(a1, b1), (Packet4i)_mm_add_epi32(a2, b2));
|
|
476
|
-
#endif
|
|
477
|
-
}
|
|
478
|
-
|
|
479
|
-
template<> EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(const Packet8i& a, const Packet8i& b)
|
|
480
|
-
{
|
|
481
|
-
#ifdef EIGEN_VECTORIZE_AVX2
|
|
482
|
-
return _mm256_sub_epi32(a, b);
|
|
483
|
-
#else
|
|
484
|
-
Packet4i a1, a2, b1, b2;
|
|
485
|
-
split_two(a, a1, a2);
|
|
486
|
-
split_two(b, b1, b2);
|
|
487
|
-
return combine_two((Packet4i)_mm_sub_epi32(a1, b1), (Packet4i)_mm_sub_epi32(a2, b2));
|
|
488
|
-
#endif
|
|
489
|
-
}
|
|
490
|
-
|
|
491
|
-
template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b)
|
|
492
|
-
{
|
|
493
|
-
#ifdef EIGEN_VECTORIZE_AVX2
|
|
494
|
-
return _mm256_and_si256(a, b);
|
|
495
|
-
#else
|
|
496
|
-
return reinterpret_to_int((Packet8f)_mm256_and_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
|
|
497
|
-
#endif
|
|
498
|
-
}
|
|
499
|
-
|
|
500
|
-
template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b)
|
|
501
|
-
{
|
|
502
|
-
#ifdef EIGEN_VECTORIZE_AVX2
|
|
503
|
-
return _mm256_andnot_si256(a, b);
|
|
504
|
-
#else
|
|
505
|
-
return reinterpret_to_int((Packet8f)_mm256_andnot_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
|
|
506
|
-
#endif
|
|
507
|
-
}
|
|
508
|
-
|
|
509
|
-
template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b)
|
|
510
|
-
{
|
|
511
|
-
#ifdef EIGEN_VECTORIZE_AVX2
|
|
512
|
-
return _mm256_or_si256(a, b);
|
|
513
|
-
#else
|
|
514
|
-
return reinterpret_to_int((Packet8f)_mm256_or_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
|
|
515
|
-
#endif
|
|
516
|
-
}
|
|
517
|
-
|
|
518
|
-
template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b)
|
|
519
|
-
{
|
|
520
|
-
#ifdef EIGEN_VECTORIZE_AVX2
|
|
521
|
-
return _mm256_xor_si256(a, b);
|
|
522
|
-
#else
|
|
523
|
-
return reinterpret_to_int((Packet8f)_mm256_xor_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
|
|
524
|
-
#endif
|
|
525
|
-
}
|
|
526
|
-
|
|
527
|
-
template<>
|
|
528
|
-
EIGEN_STRONG_INLINE Packet8i pcmplt<Packet8i>(const Packet8i& a, const Packet8i& b)
|
|
529
|
-
{
|
|
530
|
-
#ifdef EIGEN_VECTORIZE_AVX2
|
|
531
|
-
return _mm256_cmpgt_epi32(b, a);
|
|
532
|
-
#else
|
|
533
|
-
Packet4i a1, a2, b1, b2;
|
|
534
|
-
split_two(a, a1, a2);
|
|
535
|
-
split_two(b, b1, b2);
|
|
536
|
-
return combine_two((Packet4i)_mm_cmpgt_epi32(b1, a1), (Packet4i)_mm_cmpgt_epi32(b2, a2));
|
|
537
|
-
#endif
|
|
538
|
-
}
|
|
539
|
-
|
|
540
|
-
template<>
|
|
541
|
-
EIGEN_STRONG_INLINE Packet8f pcmplt<Packet8f>(const Packet8f& a, const Packet8f& b)
|
|
542
|
-
{
|
|
543
|
-
return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
|
|
544
|
-
}
|
|
545
|
-
|
|
546
|
-
template<>
|
|
547
|
-
EIGEN_STRONG_INLINE Packet8f pcmple<Packet8f>(const Packet8f& a, const Packet8f& b)
|
|
548
|
-
{
|
|
549
|
-
return _mm256_cmp_ps(a, b, _CMP_LE_OQ);
|
|
550
|
-
}
|
|
551
|
-
|
|
552
|
-
template<>
|
|
553
|
-
EIGEN_STRONG_INLINE Packet4d pcmplt<Packet4d>(const Packet4d& a, const Packet4d& b)
|
|
554
|
-
{
|
|
555
|
-
return _mm256_cmp_pd(a, b, _CMP_LT_OQ);
|
|
556
|
-
}
|
|
557
|
-
|
|
558
|
-
template<>
|
|
559
|
-
EIGEN_STRONG_INLINE Packet4d pcmple<Packet4d>(const Packet4d& a, const Packet4d& b)
|
|
560
|
-
{
|
|
561
|
-
return _mm256_cmp_pd(a, b, _CMP_LE_OQ);
|
|
562
|
-
}
|
|
563
|
-
|
|
564
|
-
template<>
|
|
565
|
-
EIGEN_STRONG_INLINE Packet8f pblendv(const Packet8f& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket)
|
|
566
|
-
{
|
|
567
|
-
return _mm256_blendv_ps(elsePacket, thenPacket, ifPacket);
|
|
568
|
-
}
|
|
569
|
-
|
|
570
|
-
template<>
|
|
571
|
-
EIGEN_STRONG_INLINE Packet8f pblendv(const Packet8i& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket)
|
|
572
|
-
{
|
|
573
|
-
return pblendv(_mm256_castsi256_ps(ifPacket), thenPacket, elsePacket);
|
|
574
|
-
}
|
|
575
|
-
|
|
576
|
-
template<>
|
|
577
|
-
EIGEN_STRONG_INLINE Packet8i pblendv(const Packet8i& ifPacket, const Packet8i& thenPacket, const Packet8i& elsePacket)
|
|
578
|
-
{
|
|
579
|
-
return _mm256_castps_si256(_mm256_blendv_ps(
|
|
580
|
-
_mm256_castsi256_ps(elsePacket),
|
|
581
|
-
_mm256_castsi256_ps(thenPacket),
|
|
582
|
-
_mm256_castsi256_ps(ifPacket)
|
|
583
|
-
));
|
|
584
|
-
}
|
|
585
|
-
|
|
586
|
-
template<>
|
|
587
|
-
EIGEN_STRONG_INLINE Packet4d pblendv(const Packet4d& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket)
|
|
588
|
-
{
|
|
589
|
-
return _mm256_blendv_pd(elsePacket, thenPacket, ifPacket);
|
|
590
|
-
}
|
|
591
|
-
|
|
592
|
-
template<>
|
|
593
|
-
EIGEN_STRONG_INLINE Packet4d pblendv(const Packet8i& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket)
|
|
594
|
-
{
|
|
595
|
-
return pblendv(_mm256_castsi256_pd(ifPacket), thenPacket, elsePacket);
|
|
596
|
-
}
|
|
597
|
-
|
|
598
|
-
template<>
|
|
599
|
-
EIGEN_STRONG_INLINE Packet8i pgather<Packet8i>(const int* addr, const Packet8i& index)
|
|
600
|
-
{
|
|
601
|
-
#ifdef EIGEN_VECTORIZE_AVX2
|
|
602
|
-
return _mm256_i32gather_epi32(addr, index, 4);
|
|
603
|
-
#else
|
|
604
|
-
uint32_t u[8];
|
|
605
|
-
_mm256_storeu_si256((Packet8i*)u, index);
|
|
606
|
-
return _mm256_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
|
|
607
|
-
addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
|
|
608
|
-
#endif
|
|
609
|
-
}
|
|
610
|
-
|
|
611
|
-
template<>
|
|
612
|
-
EIGEN_STRONG_INLINE Packet8f pgather<Packet8i>(const float *addr, const Packet8i& index)
|
|
613
|
-
{
|
|
614
|
-
#ifdef EIGEN_VECTORIZE_AVX2
|
|
615
|
-
return _mm256_i32gather_ps(addr, index, 4);
|
|
616
|
-
#else
|
|
617
|
-
uint32_t u[8];
|
|
618
|
-
_mm256_storeu_si256((Packet8i*)u, index);
|
|
619
|
-
return _mm256_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
|
|
620
|
-
addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
|
|
621
|
-
#endif
|
|
622
|
-
}
|
|
623
|
-
|
|
624
|
-
template<>
|
|
625
|
-
EIGEN_STRONG_INLINE Packet4d pgather<Packet8i>(const double *addr, const Packet8i& index, bool upperhalf)
|
|
626
|
-
{
|
|
627
|
-
#ifdef EIGEN_VECTORIZE_AVX2
|
|
628
|
-
return _mm256_i32gather_pd(addr, _mm256_castsi256_si128(index), 8);
|
|
629
|
-
#else
|
|
630
|
-
uint32_t u[8];
|
|
631
|
-
_mm256_storeu_si256((Packet8i*)u, index);
|
|
632
|
-
if (upperhalf)
|
|
633
|
-
{
|
|
634
|
-
return _mm256_setr_pd(addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
|
|
635
|
-
}
|
|
636
|
-
else
|
|
637
|
-
{
|
|
638
|
-
return _mm256_setr_pd(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
|
|
639
|
-
}
|
|
640
|
-
#endif
|
|
371
|
+
/* update the sign */
|
|
372
|
+
s = pxor(xmm1, sign_bit_sin);
|
|
373
|
+
c = pxor(xmm2, sign_bit_cos);
|
|
641
374
|
}
|
|
642
375
|
|
|
643
|
-
template
|
|
644
|
-
EIGEN_STRONG_INLINE
|
|
376
|
+
template<typename Packet>
|
|
377
|
+
EIGEN_STRONG_INLINE typename std::enable_if<
|
|
378
|
+
IsDoublePacket<Packet>::value
|
|
379
|
+
>::type psincos(Packet x, Packet& s, Packet& c)
|
|
645
380
|
{
|
|
646
|
-
|
|
647
|
-
|
|
381
|
+
Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
|
|
382
|
+
using IntPacket = decltype(reinterpret_to_int(x));
|
|
383
|
+
IntPacket emm0, emm2, emm4;
|
|
648
384
|
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
385
|
+
sign_bit_sin = x;
|
|
386
|
+
/* take the absolute value */
|
|
387
|
+
x = pabs(x);
|
|
388
|
+
/* extract the sign bit (upper one) */
|
|
389
|
+
sign_bit_sin = pext_sign(sign_bit_sin);
|
|
654
390
|
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
{
|
|
658
|
-
return pmovemask(_mm256_castsi256_ps(a));
|
|
659
|
-
}
|
|
391
|
+
/* scale by 4/Pi */
|
|
392
|
+
y = pmul(x, pset1<Packet>(1.27323954473516));
|
|
660
393
|
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
{
|
|
664
|
-
return _mm256_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
|
|
665
|
-
}
|
|
394
|
+
/* store the integer part of y in emm2 */
|
|
395
|
+
emm2 = pcast64<Packet, IntPacket>(y);
|
|
666
396
|
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
}
|
|
397
|
+
/* j=(j+1) & (~1) (see the cephes sources) */
|
|
398
|
+
emm2 = padd64(emm2, pseti64<IntPacket>(1));
|
|
399
|
+
emm2 = pand(emm2, pseti64<IntPacket>(~1ll));
|
|
400
|
+
y = pcast64<IntPacket, Packet>(emm2);
|
|
672
401
|
|
|
673
|
-
|
|
674
|
-
EIGEN_STRONG_INLINE Packet8i pcmpeq64<Packet8i>(const Packet8i& a, const Packet8i& b)
|
|
675
|
-
{
|
|
676
|
-
#ifdef EIGEN_VECTORIZE_AVX2
|
|
677
|
-
return _mm256_cmpeq_epi64(a, b);
|
|
678
|
-
#else
|
|
679
|
-
Packet4i a1, a2, b1, b2;
|
|
680
|
-
split_two(a, a1, a2);
|
|
681
|
-
split_two(b, b1, b2);
|
|
682
|
-
return combine_two((Packet4i)_mm_cmpeq_epi64(a1, b1), (Packet4i)_mm_cmpeq_epi64(a2, b2));
|
|
683
|
-
#endif
|
|
684
|
-
}
|
|
402
|
+
emm4 = emm2;
|
|
685
403
|
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
_mm256_storeu_si256((__m256i*)u, a);
|
|
691
|
-
u[0] = u[0] * b + c;
|
|
692
|
-
u[1] = u[1] * b + c;
|
|
693
|
-
u[2] = u[2] * b + c;
|
|
694
|
-
u[3] = u[3] * b + c;
|
|
695
|
-
return _mm256_loadu_si256((__m256i*)u);
|
|
696
|
-
}
|
|
697
|
-
}
|
|
698
|
-
}
|
|
699
|
-
#endif
|
|
404
|
+
/* get the swap sign flag for the sine */
|
|
405
|
+
emm0 = pand(emm2, pseti64<IntPacket>(4));
|
|
406
|
+
emm0 = psll64<61>(emm0);
|
|
407
|
+
Packet swap_sign_bit_sin = reinterpret_to_double(emm0);
|
|
700
408
|
|
|
701
|
-
|
|
702
|
-
|
|
409
|
+
/* get the polynom selection mask for the sine*/
|
|
410
|
+
emm2 = pand(emm2, pseti64<IntPacket>(2));
|
|
703
411
|
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
namespace internal
|
|
707
|
-
{
|
|
708
|
-
template<>
|
|
709
|
-
struct reinterpreter<Packet4i>
|
|
710
|
-
{
|
|
711
|
-
EIGEN_STRONG_INLINE Packet4f to_float(const Packet4i& x)
|
|
712
|
-
{
|
|
713
|
-
return _mm_castsi128_ps(x);
|
|
714
|
-
}
|
|
412
|
+
emm2 = pcmpeq64(emm2, pseti64<IntPacket>(0));
|
|
413
|
+
Packet poly_mask = reinterpret_to_double(emm2);
|
|
715
414
|
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
415
|
+
/* The magic pass: "Extended precision modular arithmetic"
|
|
416
|
+
x = ((x - y * DP1) - y * DP2) - y * DP3; */
|
|
417
|
+
xmm1 = pset1<Packet>(-0.78515625);
|
|
418
|
+
xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
|
|
419
|
+
xmm3 = pset1<Packet>(-3.77489497744594108e-8);
|
|
420
|
+
xmm1 = pmul(y, xmm1);
|
|
421
|
+
xmm2 = pmul(y, xmm2);
|
|
422
|
+
xmm3 = pmul(y, xmm3);
|
|
423
|
+
x = padd(x, xmm1);
|
|
424
|
+
x = padd(x, xmm2);
|
|
425
|
+
x = padd(x, xmm3);
|
|
720
426
|
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
427
|
+
emm4 = psub64(emm4, pseti64<IntPacket>(2));
|
|
428
|
+
#if defined(EIGEN_VECTORIZE_NEON) || defined(EIGENRAND_EIGEN_34_MODE)
|
|
429
|
+
emm4 = pandnot(pseti64<IntPacket>(4), emm4);
|
|
430
|
+
#else
|
|
431
|
+
emm4 = pandnot(emm4, pseti64<IntPacket>(4));
|
|
432
|
+
#endif
|
|
433
|
+
emm4 = psll64<61>(emm4);
|
|
434
|
+
Packet sign_bit_cos = reinterpret_to_double(emm4);
|
|
435
|
+
sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
|
|
726
436
|
|
|
727
|
-
template<>
|
|
728
|
-
struct reinterpreter<Packet4f>
|
|
729
|
-
{
|
|
730
|
-
EIGEN_STRONG_INLINE Packet4f to_float(const Packet4f& x)
|
|
731
|
-
{
|
|
732
|
-
return x;
|
|
733
|
-
}
|
|
734
437
|
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
}
|
|
438
|
+
/* Evaluate the first polynom (0 <= x <= Pi/4) */
|
|
439
|
+
Packet z = pmul(x, x);
|
|
440
|
+
y = pset1<Packet>(2.443315711809948E-005);
|
|
739
441
|
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
442
|
+
y = pmul(y, z);
|
|
443
|
+
y = padd(y, pset1<Packet>(-1.388731625493765E-003));
|
|
444
|
+
y = pmul(y, z);
|
|
445
|
+
y = padd(y, pset1<Packet>(4.166664568298827E-002));
|
|
446
|
+
y = pmul(y, z);
|
|
447
|
+
y = pmul(y, z);
|
|
448
|
+
Packet tmp = pmul(z, pset1<Packet>(0.5));
|
|
449
|
+
y = psub(y, tmp);
|
|
450
|
+
y = padd(y, pset1<Packet>(1));
|
|
745
451
|
|
|
746
|
-
|
|
747
|
-
struct reinterpreter<Packet2d>
|
|
748
|
-
{
|
|
749
|
-
EIGEN_STRONG_INLINE Packet4f to_float(const Packet2d& x)
|
|
750
|
-
{
|
|
751
|
-
return _mm_castpd_ps(x);
|
|
752
|
-
}
|
|
452
|
+
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
|
|
753
453
|
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
454
|
+
Packet y2 = pset1<Packet>(-1.9515295891E-4);
|
|
455
|
+
y2 = pmul(y2, z);
|
|
456
|
+
y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
|
|
457
|
+
y2 = pmul(y2, z);
|
|
458
|
+
y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
|
|
459
|
+
y2 = pmul(y2, z);
|
|
460
|
+
y2 = pmul(y2, x);
|
|
461
|
+
y2 = padd(y2, x);
|
|
758
462
|
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
463
|
+
/* select the correct result from the two polynoms */
|
|
464
|
+
xmm3 = poly_mask;
|
|
465
|
+
Packet ysin2 = pand(xmm3, y2);
|
|
466
|
+
#if defined(EIGEN_VECTORIZE_NEON) || defined(EIGENRAND_EIGEN_34_MODE)
|
|
467
|
+
Packet ysin1 = pandnot(y, xmm3);
|
|
468
|
+
#else
|
|
469
|
+
Packet ysin1 = pandnot(xmm3, y);
|
|
470
|
+
#endif
|
|
471
|
+
y2 = psub(y2, ysin2);
|
|
472
|
+
y = psub(y, ysin1);
|
|
764
473
|
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
#ifdef EIGEN_VECTORIZE_SSE4_1
|
|
768
|
-
a = _mm_extract_epi64(x, 0);
|
|
769
|
-
b = _mm_extract_epi64(x, 1);
|
|
770
|
-
#else
|
|
771
|
-
uint64_t u[2];
|
|
772
|
-
_mm_storeu_si128((__m128i*)u, x);
|
|
773
|
-
a = u[0];
|
|
774
|
-
b = u[1];
|
|
775
|
-
#endif
|
|
776
|
-
}
|
|
474
|
+
xmm1 = padd(ysin1, ysin2);
|
|
475
|
+
xmm2 = padd(y, y2);
|
|
777
476
|
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
auto sb = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 3, 1));
|
|
782
|
-
sa = _mm_and_si128(sa, _mm_setr_epi32(-1, -1, 0, 0));
|
|
783
|
-
sb = _mm_and_si128(sb, _mm_setr_epi32(0, 0, -1, -1));
|
|
784
|
-
return _mm_or_si128(sa, sb);
|
|
477
|
+
/* update the sign */
|
|
478
|
+
s = pxor(xmm1, sign_bit_sin);
|
|
479
|
+
c = pxor(xmm2, sign_bit_cos);
|
|
785
480
|
}
|
|
786
481
|
|
|
787
|
-
template
|
|
788
|
-
EIGEN_STRONG_INLINE
|
|
482
|
+
template<typename Packet>
|
|
483
|
+
EIGEN_STRONG_INLINE typename std::enable_if<
|
|
484
|
+
IsDoublePacket<Packet>::value, Packet
|
|
485
|
+
>::type _psin(Packet x)
|
|
789
486
|
{
|
|
790
|
-
|
|
791
|
-
|
|
487
|
+
Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
|
|
488
|
+
using IntPacket = decltype(reinterpret_to_int(x));
|
|
489
|
+
IntPacket emm0, emm2;
|
|
792
490
|
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
491
|
+
sign_bit_sin = x;
|
|
492
|
+
/* take the absolute value */
|
|
493
|
+
x = pabs(x);
|
|
494
|
+
/* extract the sign bit (upper one) */
|
|
495
|
+
sign_bit_sin = pext_sign(sign_bit_sin);
|
|
798
496
|
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
{
|
|
802
|
-
return _mm_slli_epi32(a, b);
|
|
803
|
-
}
|
|
497
|
+
/* scale by 4/Pi */
|
|
498
|
+
y = pmul(x, pset1<Packet>(1.27323954473516));
|
|
804
499
|
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
{
|
|
808
|
-
return _mm_srli_epi32(a, b);
|
|
809
|
-
}
|
|
500
|
+
/* store the integer part of y in emm2 */
|
|
501
|
+
emm2 = pcast64<Packet, IntPacket>(y);
|
|
810
502
|
|
|
503
|
+
/* j=(j+1) & (~1) (see the cephes sources) */
|
|
504
|
+
emm2 = padd64(emm2, pseti64<IntPacket>(1));
|
|
505
|
+
emm2 = pand(emm2, pseti64<IntPacket>(~1ll));
|
|
506
|
+
y = pcast64<IntPacket, Packet>(emm2);
|
|
811
507
|
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
}
|
|
508
|
+
/* get the swap sign flag for the sine */
|
|
509
|
+
emm0 = pand(emm2, pseti64<IntPacket>(4));
|
|
510
|
+
emm0 = psll64<61>(emm0);
|
|
511
|
+
Packet swap_sign_bit_sin = reinterpret_to_double(emm0);
|
|
817
512
|
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
{
|
|
821
|
-
return _mm_srli_epi64(a, b);
|
|
822
|
-
}
|
|
513
|
+
/* get the polynom selection mask for the sine*/
|
|
514
|
+
emm2 = pand(emm2, pseti64<IntPacket>(2));
|
|
823
515
|
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
{
|
|
827
|
-
return _mm_cmplt_epi32(a, b);
|
|
828
|
-
}
|
|
516
|
+
emm2 = pcmpeq64(emm2, pseti64<IntPacket>(0));
|
|
517
|
+
Packet poly_mask = reinterpret_to_double(emm2);
|
|
829
518
|
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
519
|
+
/* The magic pass: "Extended precision modular arithmetic"
|
|
520
|
+
x = ((x - y * DP1) - y * DP2) - y * DP3; */
|
|
521
|
+
xmm1 = pset1<Packet>(-0.78515625);
|
|
522
|
+
xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
|
|
523
|
+
xmm3 = pset1<Packet>(-3.77489497744594108e-8);
|
|
524
|
+
xmm1 = pmul(y, xmm1);
|
|
525
|
+
xmm2 = pmul(y, xmm2);
|
|
526
|
+
xmm3 = pmul(y, xmm3);
|
|
527
|
+
x = padd(x, xmm1);
|
|
528
|
+
x = padd(x, xmm2);
|
|
529
|
+
x = padd(x, xmm3);
|
|
835
530
|
|
|
836
|
-
|
|
837
|
-
EIGEN_STRONG_INLINE Packet4f pcmple<Packet4f>(const Packet4f& a, const Packet4f& b)
|
|
838
|
-
{
|
|
839
|
-
return _mm_cmple_ps(a, b);
|
|
840
|
-
}
|
|
531
|
+
sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
|
|
841
532
|
|
|
842
|
-
template<>
|
|
843
|
-
EIGEN_STRONG_INLINE Packet2d pcmplt<Packet2d>(const Packet2d& a, const Packet2d& b)
|
|
844
|
-
{
|
|
845
|
-
return _mm_cmplt_pd(a, b);
|
|
846
|
-
}
|
|
847
533
|
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
return _mm_cmple_pd(a, b);
|
|
852
|
-
}
|
|
534
|
+
/* Evaluate the first polynom (0 <= x <= Pi/4) */
|
|
535
|
+
Packet z = pmul(x, x);
|
|
536
|
+
y = pset1<Packet>(2.443315711809948E-005);
|
|
853
537
|
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
538
|
+
y = pmul(y, z);
|
|
539
|
+
y = padd(y, pset1<Packet>(-1.388731625493765E-003));
|
|
540
|
+
y = pmul(y, z);
|
|
541
|
+
y = padd(y, pset1<Packet>(4.166664568298827E-002));
|
|
542
|
+
y = pmul(y, z);
|
|
543
|
+
y = pmul(y, z);
|
|
544
|
+
Packet tmp = pmul(z, pset1<Packet>(0.5));
|
|
545
|
+
y = psub(y, tmp);
|
|
546
|
+
y = padd(y, pset1<Packet>(1));
|
|
863
547
|
|
|
864
|
-
|
|
865
|
-
EIGEN_STRONG_INLINE Packet4f pblendv(const Packet4i& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket)
|
|
866
|
-
{
|
|
867
|
-
return pblendv(_mm_castsi128_ps(ifPacket), thenPacket, elsePacket);
|
|
868
|
-
}
|
|
548
|
+
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
|
|
869
549
|
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
}
|
|
550
|
+
Packet y2 = pset1<Packet>(-1.9515295891E-4);
|
|
551
|
+
y2 = pmul(y2, z);
|
|
552
|
+
y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
|
|
553
|
+
y2 = pmul(y2, z);
|
|
554
|
+
y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
|
|
555
|
+
y2 = pmul(y2, z);
|
|
556
|
+
y2 = pmul(y2, x);
|
|
557
|
+
y2 = padd(y2, x);
|
|
879
558
|
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
#
|
|
884
|
-
|
|
885
|
-
#else
|
|
886
|
-
|
|
887
|
-
#endif
|
|
888
|
-
}
|
|
559
|
+
/* select the correct result from the two polynoms */
|
|
560
|
+
xmm3 = poly_mask;
|
|
561
|
+
Packet ysin2 = pand(xmm3, y2);
|
|
562
|
+
#if defined(EIGEN_VECTORIZE_NEON) || defined(EIGENRAND_EIGEN_34_MODE)
|
|
563
|
+
Packet ysin1 = pandnot(y, xmm3);
|
|
564
|
+
#else
|
|
565
|
+
Packet ysin1 = pandnot(xmm3, y);
|
|
566
|
+
#endif
|
|
889
567
|
|
|
568
|
+
xmm1 = padd(ysin1, ysin2);
|
|
890
569
|
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
{
|
|
894
|
-
return pblendv(_mm_castsi128_pd(ifPacket), thenPacket, elsePacket);
|
|
570
|
+
/* update the sign */
|
|
571
|
+
return pxor(xmm1, sign_bit_sin);
|
|
895
572
|
}
|
|
573
|
+
}
|
|
574
|
+
}
|
|
896
575
|
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
{
|
|
900
|
-
#ifdef EIGEN_VECTORIZE_AVX2
|
|
901
|
-
return _mm_i32gather_epi32(addr, index, 4);
|
|
902
|
-
#else
|
|
903
|
-
uint32_t u[4];
|
|
904
|
-
_mm_storeu_si128((__m128i*)u, index);
|
|
905
|
-
return _mm_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
|
|
576
|
+
#ifdef EIGEN_VECTORIZE_AVX
|
|
577
|
+
#include "arch/AVX/MorePacketMath.h"
|
|
906
578
|
#endif
|
|
907
|
-
}
|
|
908
579
|
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
{
|
|
912
|
-
#ifdef EIGEN_VECTORIZE_AVX2
|
|
913
|
-
return _mm_i32gather_ps(addr, index, 4);
|
|
914
|
-
#else
|
|
915
|
-
uint32_t u[4];
|
|
916
|
-
_mm_storeu_si128((__m128i*)u, index);
|
|
917
|
-
return _mm_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
|
|
580
|
+
#ifdef EIGEN_VECTORIZE_SSE2
|
|
581
|
+
#include "arch/SSE/MorePacketMath.h"
|
|
918
582
|
#endif
|
|
919
|
-
}
|
|
920
583
|
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
{
|
|
924
|
-
#ifdef EIGEN_VECTORIZE_AVX2
|
|
925
|
-
return _mm_i32gather_pd(addr, index, 8);
|
|
926
|
-
#else
|
|
927
|
-
uint32_t u[4];
|
|
928
|
-
_mm_storeu_si128((__m128i*)u, index);
|
|
929
|
-
if (upperhalf)
|
|
930
|
-
{
|
|
931
|
-
return _mm_setr_pd(addr[u[2]], addr[u[3]]);
|
|
932
|
-
}
|
|
933
|
-
else
|
|
934
|
-
{
|
|
935
|
-
return _mm_setr_pd(addr[u[0]], addr[u[1]]);
|
|
936
|
-
}
|
|
584
|
+
#ifdef EIGEN_VECTORIZE_NEON
|
|
585
|
+
#include "arch/NEON/MorePacketMath.h"
|
|
937
586
|
#endif
|
|
938
|
-
}
|
|
939
|
-
|
|
940
|
-
template<>
|
|
941
|
-
EIGEN_STRONG_INLINE int pmovemask<Packet4f>(const Packet4f& a)
|
|
942
|
-
{
|
|
943
|
-
return _mm_movemask_ps(a);
|
|
944
|
-
}
|
|
945
|
-
|
|
946
|
-
template<>
|
|
947
|
-
EIGEN_STRONG_INLINE int pmovemask<Packet2d>(const Packet2d& a)
|
|
948
|
-
{
|
|
949
|
-
return _mm_movemask_pd(a);
|
|
950
|
-
}
|
|
951
587
|
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
template<>
|
|
959
|
-
EIGEN_STRONG_INLINE Packet4f ptruncate<Packet4f>(const Packet4f& a)
|
|
588
|
+
namespace Eigen
|
|
589
|
+
{
|
|
590
|
+
namespace internal
|
|
591
|
+
{
|
|
592
|
+
template<int b, typename Packet>
|
|
593
|
+
EIGEN_STRONG_INLINE Packet psll(const Packet& a)
|
|
960
594
|
{
|
|
961
|
-
|
|
962
|
-
return _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
|
|
963
|
-
#else
|
|
964
|
-
auto round = _MM_GET_ROUNDING_MODE();
|
|
965
|
-
_MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
|
|
966
|
-
auto ret = _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
|
|
967
|
-
_MM_SET_ROUNDING_MODE(round);
|
|
968
|
-
return ret;
|
|
969
|
-
#endif
|
|
595
|
+
return BitShifter<Packet>{}.template sll<b>(a);
|
|
970
596
|
}
|
|
971
597
|
|
|
972
|
-
template
|
|
973
|
-
EIGEN_STRONG_INLINE
|
|
598
|
+
template<int _b, typename Packet>
|
|
599
|
+
EIGEN_STRONG_INLINE Packet psrl(const Packet& a, int b)
|
|
974
600
|
{
|
|
975
|
-
|
|
976
|
-
return _mm_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
|
|
977
|
-
#else
|
|
978
|
-
auto round = _MM_GET_ROUNDING_MODE();
|
|
979
|
-
_MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
|
|
980
|
-
auto ret = _mm_cvtepi32_pd(_mm_cvtpd_epi32(a));
|
|
981
|
-
_MM_SET_ROUNDING_MODE(round);
|
|
982
|
-
return ret;
|
|
983
|
-
#endif
|
|
601
|
+
return BitShifter<Packet>{}.template srl<_b>(a, b);
|
|
984
602
|
}
|
|
985
603
|
|
|
986
|
-
template
|
|
987
|
-
EIGEN_STRONG_INLINE
|
|
604
|
+
template<int b, typename Packet>
|
|
605
|
+
EIGEN_STRONG_INLINE Packet psll64(const Packet& a)
|
|
988
606
|
{
|
|
989
|
-
|
|
990
|
-
return _mm_cmpeq_epi64(a, b);
|
|
991
|
-
#else
|
|
992
|
-
Packet4i c = _mm_cmpeq_epi32(a, b);
|
|
993
|
-
return pand(c, (Packet4i)_mm_shuffle_epi32(c, _MM_SHUFFLE(2, 3, 0, 1)));
|
|
994
|
-
#endif
|
|
607
|
+
return BitShifter<Packet>{}.template sll64<b>(a);
|
|
995
608
|
}
|
|
996
609
|
|
|
997
|
-
template
|
|
998
|
-
EIGEN_STRONG_INLINE
|
|
610
|
+
template<int b, typename Packet>
|
|
611
|
+
EIGEN_STRONG_INLINE Packet psrl64(const Packet& a)
|
|
999
612
|
{
|
|
1000
|
-
|
|
1001
|
-
_mm_storeu_si128((__m128i*)u, a);
|
|
1002
|
-
u[0] = u[0] * b + c;
|
|
1003
|
-
u[1] = u[1] * b + c;
|
|
1004
|
-
return _mm_loadu_si128((__m128i*)u);
|
|
613
|
+
return BitShifter<Packet>{}.template srl64<b>(a);
|
|
1005
614
|
}
|
|
1006
615
|
}
|
|
1007
616
|
}
|
|
1008
|
-
#endif
|
|
1009
617
|
|
|
1010
|
-
#endif
|
|
618
|
+
#endif
|