tomoto 0.2.3 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +8 -10
- data/ext/tomoto/extconf.rb +6 -2
- data/ext/tomoto/{ext.cpp → tomoto.cpp} +1 -1
- data/lib/tomoto/version.rb +1 -1
- data/lib/tomoto.rb +5 -1
- data/vendor/EigenRand/EigenRand/Core.h +10 -10
- data/vendor/EigenRand/EigenRand/Dists/Basic.h +208 -9
- data/vendor/EigenRand/EigenRand/Dists/Discrete.h +52 -31
- data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +9 -8
- data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +28 -21
- data/vendor/EigenRand/EigenRand/EigenRand +11 -6
- data/vendor/EigenRand/EigenRand/Macro.h +13 -7
- data/vendor/EigenRand/EigenRand/MorePacketMath.h +348 -740
- data/vendor/EigenRand/EigenRand/MvDists/Multinomial.h +5 -3
- data/vendor/EigenRand/EigenRand/MvDists/MvNormal.h +9 -3
- data/vendor/EigenRand/EigenRand/PacketFilter.h +11 -253
- data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +21 -47
- data/vendor/EigenRand/EigenRand/RandUtils.h +50 -344
- data/vendor/EigenRand/EigenRand/arch/AVX/MorePacketMath.h +619 -0
- data/vendor/EigenRand/EigenRand/arch/AVX/PacketFilter.h +149 -0
- data/vendor/EigenRand/EigenRand/arch/AVX/RandUtils.h +228 -0
- data/vendor/EigenRand/EigenRand/arch/NEON/MorePacketMath.h +473 -0
- data/vendor/EigenRand/EigenRand/arch/NEON/PacketFilter.h +142 -0
- data/vendor/EigenRand/EigenRand/arch/NEON/RandUtils.h +126 -0
- data/vendor/EigenRand/EigenRand/arch/SSE/MorePacketMath.h +501 -0
- data/vendor/EigenRand/EigenRand/arch/SSE/PacketFilter.h +133 -0
- data/vendor/EigenRand/EigenRand/arch/SSE/RandUtils.h +120 -0
- data/vendor/EigenRand/EigenRand/doc.h +24 -12
- data/vendor/EigenRand/README.md +57 -4
- data/vendor/eigen/COPYING.APACHE +203 -0
- data/vendor/eigen/COPYING.BSD +1 -1
- data/vendor/eigen/COPYING.MINPACK +51 -52
- data/vendor/eigen/Eigen/Cholesky +0 -1
- data/vendor/eigen/Eigen/Core +112 -265
- data/vendor/eigen/Eigen/Eigenvalues +2 -3
- data/vendor/eigen/Eigen/Geometry +5 -8
- data/vendor/eigen/Eigen/Householder +0 -1
- data/vendor/eigen/Eigen/Jacobi +0 -1
- data/vendor/eigen/Eigen/KLUSupport +41 -0
- data/vendor/eigen/Eigen/LU +2 -5
- data/vendor/eigen/Eigen/OrderingMethods +0 -3
- data/vendor/eigen/Eigen/PaStiXSupport +1 -0
- data/vendor/eigen/Eigen/PardisoSupport +0 -0
- data/vendor/eigen/Eigen/QR +2 -3
- data/vendor/eigen/Eigen/QtAlignedMalloc +0 -1
- data/vendor/eigen/Eigen/SVD +0 -1
- data/vendor/eigen/Eigen/Sparse +0 -2
- data/vendor/eigen/Eigen/SparseCholesky +0 -8
- data/vendor/eigen/Eigen/SparseLU +4 -0
- data/vendor/eigen/Eigen/SparseQR +0 -1
- data/vendor/eigen/Eigen/src/Cholesky/LDLT.h +42 -27
- data/vendor/eigen/Eigen/src/Cholesky/LLT.h +39 -23
- data/vendor/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +90 -47
- data/vendor/eigen/Eigen/src/Core/ArithmeticSequence.h +413 -0
- data/vendor/eigen/Eigen/src/Core/Array.h +99 -11
- data/vendor/eigen/Eigen/src/Core/ArrayBase.h +3 -3
- data/vendor/eigen/Eigen/src/Core/ArrayWrapper.h +21 -21
- data/vendor/eigen/Eigen/src/Core/Assign.h +1 -1
- data/vendor/eigen/Eigen/src/Core/AssignEvaluator.h +125 -50
- data/vendor/eigen/Eigen/src/Core/Assign_MKL.h +10 -10
- data/vendor/eigen/Eigen/src/Core/BandMatrix.h +16 -16
- data/vendor/eigen/Eigen/src/Core/Block.h +56 -60
- data/vendor/eigen/Eigen/src/Core/BooleanRedux.h +29 -31
- data/vendor/eigen/Eigen/src/Core/CommaInitializer.h +7 -3
- data/vendor/eigen/Eigen/src/Core/CoreEvaluators.h +325 -272
- data/vendor/eigen/Eigen/src/Core/CoreIterators.h +5 -0
- data/vendor/eigen/Eigen/src/Core/CwiseBinaryOp.h +21 -22
- data/vendor/eigen/Eigen/src/Core/CwiseNullaryOp.h +153 -18
- data/vendor/eigen/Eigen/src/Core/CwiseUnaryOp.h +6 -6
- data/vendor/eigen/Eigen/src/Core/CwiseUnaryView.h +14 -10
- data/vendor/eigen/Eigen/src/Core/DenseBase.h +132 -42
- data/vendor/eigen/Eigen/src/Core/DenseCoeffsBase.h +25 -21
- data/vendor/eigen/Eigen/src/Core/DenseStorage.h +153 -71
- data/vendor/eigen/Eigen/src/Core/Diagonal.h +21 -23
- data/vendor/eigen/Eigen/src/Core/DiagonalMatrix.h +50 -2
- data/vendor/eigen/Eigen/src/Core/DiagonalProduct.h +1 -1
- data/vendor/eigen/Eigen/src/Core/Dot.h +10 -10
- data/vendor/eigen/Eigen/src/Core/EigenBase.h +10 -9
- data/vendor/eigen/Eigen/src/Core/ForceAlignedAccess.h +8 -4
- data/vendor/eigen/Eigen/src/Core/Fuzzy.h +3 -3
- data/vendor/eigen/Eigen/src/Core/GeneralProduct.h +20 -10
- data/vendor/eigen/Eigen/src/Core/GenericPacketMath.h +599 -152
- data/vendor/eigen/Eigen/src/Core/GlobalFunctions.h +40 -33
- data/vendor/eigen/Eigen/src/Core/IO.h +40 -7
- data/vendor/eigen/Eigen/src/Core/IndexedView.h +237 -0
- data/vendor/eigen/Eigen/src/Core/Inverse.h +9 -10
- data/vendor/eigen/Eigen/src/Core/Map.h +7 -7
- data/vendor/eigen/Eigen/src/Core/MapBase.h +10 -3
- data/vendor/eigen/Eigen/src/Core/MathFunctions.h +767 -125
- data/vendor/eigen/Eigen/src/Core/MathFunctionsImpl.h +118 -19
- data/vendor/eigen/Eigen/src/Core/Matrix.h +131 -25
- data/vendor/eigen/Eigen/src/Core/MatrixBase.h +21 -3
- data/vendor/eigen/Eigen/src/Core/NestByValue.h +25 -50
- data/vendor/eigen/Eigen/src/Core/NoAlias.h +4 -3
- data/vendor/eigen/Eigen/src/Core/NumTraits.h +107 -20
- data/vendor/eigen/Eigen/src/Core/PartialReduxEvaluator.h +232 -0
- data/vendor/eigen/Eigen/src/Core/PermutationMatrix.h +3 -31
- data/vendor/eigen/Eigen/src/Core/PlainObjectBase.h +152 -59
- data/vendor/eigen/Eigen/src/Core/Product.h +30 -25
- data/vendor/eigen/Eigen/src/Core/ProductEvaluators.h +192 -125
- data/vendor/eigen/Eigen/src/Core/Random.h +37 -1
- data/vendor/eigen/Eigen/src/Core/Redux.h +180 -170
- data/vendor/eigen/Eigen/src/Core/Ref.h +121 -23
- data/vendor/eigen/Eigen/src/Core/Replicate.h +8 -8
- data/vendor/eigen/Eigen/src/Core/Reshaped.h +454 -0
- data/vendor/eigen/Eigen/src/Core/ReturnByValue.h +7 -5
- data/vendor/eigen/Eigen/src/Core/Reverse.h +18 -12
- data/vendor/eigen/Eigen/src/Core/Select.h +8 -6
- data/vendor/eigen/Eigen/src/Core/SelfAdjointView.h +33 -20
- data/vendor/eigen/Eigen/src/Core/Solve.h +14 -14
- data/vendor/eigen/Eigen/src/Core/SolveTriangular.h +16 -16
- data/vendor/eigen/Eigen/src/Core/SolverBase.h +41 -3
- data/vendor/eigen/Eigen/src/Core/StableNorm.h +100 -70
- data/vendor/eigen/Eigen/src/Core/StlIterators.h +463 -0
- data/vendor/eigen/Eigen/src/Core/Stride.h +9 -4
- data/vendor/eigen/Eigen/src/Core/Swap.h +5 -4
- data/vendor/eigen/Eigen/src/Core/Transpose.h +88 -27
- data/vendor/eigen/Eigen/src/Core/Transpositions.h +26 -47
- data/vendor/eigen/Eigen/src/Core/TriangularMatrix.h +93 -75
- data/vendor/eigen/Eigen/src/Core/VectorBlock.h +5 -5
- data/vendor/eigen/Eigen/src/Core/VectorwiseOp.h +159 -70
- data/vendor/eigen/Eigen/src/Core/Visitor.h +137 -29
- data/vendor/eigen/Eigen/src/Core/arch/AVX/Complex.h +50 -129
- data/vendor/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +126 -337
- data/vendor/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +1092 -155
- data/vendor/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +65 -1
- data/vendor/eigen/Eigen/src/Core/arch/AVX512/Complex.h +422 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +207 -236
- data/vendor/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1482 -495
- data/vendor/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +89 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +152 -165
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +19 -251
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2937 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +221 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +629 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2042 -392
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/Complex.h +235 -80
- data/vendor/eigen/Eigen/src/Core/arch/Default/BFloat16.h +700 -0
- data/vendor/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +102 -14
- data/vendor/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1649 -0
- data/vendor/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +110 -0
- data/vendor/eigen/Eigen/src/Core/arch/Default/Half.h +942 -0
- data/vendor/eigen/Eigen/src/Core/arch/Default/Settings.h +1 -1
- data/vendor/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +120 -0
- data/vendor/eigen/Eigen/src/Core/arch/{CUDA → GPU}/MathFunctions.h +16 -4
- data/vendor/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1685 -0
- data/vendor/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +80 -0
- data/vendor/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
- data/vendor/eigen/Eigen/src/Core/arch/MSA/Complex.h +648 -0
- data/vendor/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +387 -0
- data/vendor/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1233 -0
- data/vendor/eigen/Eigen/src/Core/arch/NEON/Complex.h +313 -219
- data/vendor/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +183 -0
- data/vendor/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +54 -70
- data/vendor/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4376 -549
- data/vendor/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1419 -0
- data/vendor/eigen/Eigen/src/Core/arch/SSE/Complex.h +59 -179
- data/vendor/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +65 -428
- data/vendor/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +893 -283
- data/vendor/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +65 -0
- data/vendor/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +44 -0
- data/vendor/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +752 -0
- data/vendor/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +49 -0
- data/vendor/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +232 -0
- data/vendor/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +301 -0
- data/vendor/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +670 -0
- data/vendor/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +694 -0
- data/vendor/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +85 -0
- data/vendor/eigen/Eigen/src/Core/arch/ZVector/Complex.h +212 -183
- data/vendor/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +101 -5
- data/vendor/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +510 -395
- data/vendor/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +11 -2
- data/vendor/eigen/Eigen/src/Core/functors/BinaryFunctors.h +112 -46
- data/vendor/eigen/Eigen/src/Core/functors/NullaryFunctors.h +31 -30
- data/vendor/eigen/Eigen/src/Core/functors/StlFunctors.h +32 -2
- data/vendor/eigen/Eigen/src/Core/functors/UnaryFunctors.h +355 -16
- data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1075 -586
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +49 -24
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +41 -35
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +6 -6
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +4 -2
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +382 -483
- data/vendor/eigen/Eigen/src/Core/products/Parallelizer.h +22 -5
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +53 -30
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +16 -8
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +8 -6
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointProduct.h +4 -4
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +5 -4
- data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +33 -27
- data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +14 -12
- data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +36 -34
- data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +8 -4
- data/vendor/eigen/Eigen/src/Core/products/TriangularSolverVector.h +13 -10
- data/vendor/eigen/Eigen/src/Core/util/BlasUtil.h +304 -119
- data/vendor/eigen/Eigen/src/Core/util/ConfigureVectorization.h +512 -0
- data/vendor/eigen/Eigen/src/Core/util/Constants.h +25 -9
- data/vendor/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +26 -3
- data/vendor/eigen/Eigen/src/Core/util/ForwardDeclarations.h +29 -9
- data/vendor/eigen/Eigen/src/Core/util/IndexedViewHelper.h +186 -0
- data/vendor/eigen/Eigen/src/Core/util/IntegralConstant.h +272 -0
- data/vendor/eigen/Eigen/src/Core/util/MKL_support.h +8 -1
- data/vendor/eigen/Eigen/src/Core/util/Macros.h +709 -246
- data/vendor/eigen/Eigen/src/Core/util/Memory.h +222 -52
- data/vendor/eigen/Eigen/src/Core/util/Meta.h +355 -77
- data/vendor/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +5 -1
- data/vendor/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
- data/vendor/eigen/Eigen/src/Core/util/StaticAssert.h +8 -5
- data/vendor/eigen/Eigen/src/Core/util/SymbolicIndex.h +293 -0
- data/vendor/eigen/Eigen/src/Core/util/XprHelper.h +65 -30
- data/vendor/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +1 -1
- data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +7 -4
- data/vendor/eigen/Eigen/src/Eigenvalues/EigenSolver.h +2 -2
- data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +1 -1
- data/vendor/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +2 -2
- data/vendor/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +2 -2
- data/vendor/eigen/Eigen/src/Eigenvalues/RealQZ.h +9 -6
- data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur.h +21 -9
- data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +77 -43
- data/vendor/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +20 -15
- data/vendor/eigen/Eigen/src/Geometry/AlignedBox.h +99 -5
- data/vendor/eigen/Eigen/src/Geometry/AngleAxis.h +4 -4
- data/vendor/eigen/Eigen/src/Geometry/EulerAngles.h +3 -3
- data/vendor/eigen/Eigen/src/Geometry/Homogeneous.h +15 -11
- data/vendor/eigen/Eigen/src/Geometry/Hyperplane.h +1 -1
- data/vendor/eigen/Eigen/src/Geometry/OrthoMethods.h +3 -2
- data/vendor/eigen/Eigen/src/Geometry/ParametrizedLine.h +39 -2
- data/vendor/eigen/Eigen/src/Geometry/Quaternion.h +70 -14
- data/vendor/eigen/Eigen/src/Geometry/Rotation2D.h +3 -3
- data/vendor/eigen/Eigen/src/Geometry/Scaling.h +23 -5
- data/vendor/eigen/Eigen/src/Geometry/Transform.h +88 -67
- data/vendor/eigen/Eigen/src/Geometry/Translation.h +6 -12
- data/vendor/eigen/Eigen/src/Geometry/Umeyama.h +1 -1
- data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +168 -0
- data/vendor/eigen/Eigen/src/Householder/BlockHouseholder.h +9 -2
- data/vendor/eigen/Eigen/src/Householder/Householder.h +8 -4
- data/vendor/eigen/Eigen/src/Householder/HouseholderSequence.h +123 -48
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +15 -15
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +7 -23
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +5 -22
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +41 -47
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +51 -60
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +70 -20
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +2 -20
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +11 -9
- data/vendor/eigen/Eigen/src/Jacobi/Jacobi.h +31 -10
- data/vendor/eigen/Eigen/src/KLUSupport/KLUSupport.h +358 -0
- data/vendor/eigen/Eigen/src/LU/Determinant.h +35 -19
- data/vendor/eigen/Eigen/src/LU/FullPivLU.h +29 -43
- data/vendor/eigen/Eigen/src/LU/InverseImpl.h +25 -8
- data/vendor/eigen/Eigen/src/LU/PartialPivLU.h +71 -58
- data/vendor/eigen/Eigen/src/LU/arch/InverseSize4.h +351 -0
- data/vendor/eigen/Eigen/src/OrderingMethods/Amd.h +7 -17
- data/vendor/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +297 -277
- data/vendor/eigen/Eigen/src/OrderingMethods/Ordering.h +6 -10
- data/vendor/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +1 -1
- data/vendor/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +9 -7
- data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR.h +41 -20
- data/vendor/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +100 -27
- data/vendor/eigen/Eigen/src/QR/FullPivHouseholderQR.h +59 -22
- data/vendor/eigen/Eigen/src/QR/HouseholderQR.h +48 -23
- data/vendor/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +25 -3
- data/vendor/eigen/Eigen/src/SVD/BDCSVD.h +183 -63
- data/vendor/eigen/Eigen/src/SVD/JacobiSVD.h +22 -14
- data/vendor/eigen/Eigen/src/SVD/SVDBase.h +83 -22
- data/vendor/eigen/Eigen/src/SVD/UpperBidiagonalization.h +3 -3
- data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +17 -9
- data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +12 -37
- data/vendor/eigen/Eigen/src/SparseCore/AmbiVector.h +3 -2
- data/vendor/eigen/Eigen/src/SparseCore/CompressedStorage.h +16 -0
- data/vendor/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +6 -6
- data/vendor/eigen/Eigen/src/SparseCore/SparseAssign.h +81 -27
- data/vendor/eigen/Eigen/src/SparseCore/SparseBlock.h +25 -57
- data/vendor/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +40 -11
- data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +11 -15
- data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +4 -2
- data/vendor/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +30 -8
- data/vendor/eigen/Eigen/src/SparseCore/SparseMatrix.h +126 -11
- data/vendor/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +5 -12
- data/vendor/eigen/Eigen/src/SparseCore/SparseProduct.h +13 -1
- data/vendor/eigen/Eigen/src/SparseCore/SparseRef.h +7 -7
- data/vendor/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +5 -2
- data/vendor/eigen/Eigen/src/SparseCore/SparseUtil.h +8 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseVector.h +1 -1
- data/vendor/eigen/Eigen/src/SparseCore/SparseView.h +1 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU.h +162 -12
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +1 -1
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +76 -2
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +2 -2
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +1 -1
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +1 -1
- data/vendor/eigen/Eigen/src/SparseQR/SparseQR.h +19 -6
- data/vendor/eigen/Eigen/src/StlSupport/StdDeque.h +2 -12
- data/vendor/eigen/Eigen/src/StlSupport/StdList.h +2 -2
- data/vendor/eigen/Eigen/src/StlSupport/StdVector.h +2 -2
- data/vendor/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +6 -8
- data/vendor/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +175 -39
- data/vendor/eigen/Eigen/src/misc/lapacke.h +5 -4
- data/vendor/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +28 -2
- data/vendor/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +155 -11
- data/vendor/eigen/Eigen/src/plugins/BlockMethods.h +626 -242
- data/vendor/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +14 -0
- data/vendor/eigen/Eigen/src/plugins/IndexedViewMethods.h +262 -0
- data/vendor/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +4 -4
- data/vendor/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +10 -0
- data/vendor/eigen/Eigen/src/plugins/ReshapedMethods.h +149 -0
- data/vendor/eigen/README.md +2 -0
- data/vendor/eigen/bench/btl/README +1 -1
- data/vendor/eigen/bench/tensors/README +6 -7
- data/vendor/eigen/ci/README.md +56 -0
- data/vendor/eigen/demos/mix_eigen_and_c/README +1 -1
- data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +213 -158
- data/vendor/eigen/unsupported/README.txt +1 -1
- data/vendor/tomotopy/README.kr.rst +21 -0
- data/vendor/tomotopy/README.rst +20 -0
- data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +2 -2
- data/vendor/tomotopy/src/Labeling/Phraser.hpp +1 -1
- data/vendor/tomotopy/src/TopicModel/CTModel.hpp +2 -1
- data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +2 -1
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +1 -1
- data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +2 -2
- data/vendor/tomotopy/src/TopicModel/HDP.h +1 -0
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +53 -2
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +1 -1
- data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +2 -2
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +16 -5
- data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/PT.h +3 -1
- data/vendor/tomotopy/src/TopicModel/PTModel.hpp +31 -1
- data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +2 -2
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +7 -5
- data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +36 -1
- data/vendor/tomotopy/src/Utils/exception.h +6 -0
- data/vendor/tomotopy/src/Utils/sample.hpp +14 -12
- data/vendor/tomotopy/src/Utils/sse_gamma.h +0 -3
- metadata +60 -14
- data/vendor/eigen/Eigen/CMakeLists.txt +0 -19
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -674
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
- data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
- data/vendor/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
@@ -10,26 +10,20 @@
|
|
10
10
|
#ifndef EIGEN_PACKET_MATH_ZVECTOR_H
|
11
11
|
#define EIGEN_PACKET_MATH_ZVECTOR_H
|
12
12
|
|
13
|
-
#include <stdint.h>
|
14
|
-
|
15
13
|
namespace Eigen {
|
16
14
|
|
17
15
|
namespace internal {
|
18
16
|
|
19
17
|
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
|
20
|
-
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
|
18
|
+
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
|
21
19
|
#endif
|
22
20
|
|
23
21
|
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
24
22
|
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
25
23
|
#endif
|
26
24
|
|
27
|
-
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
|
28
|
-
#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
|
29
|
-
#endif
|
30
|
-
|
31
25
|
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
|
32
|
-
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
|
26
|
+
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
|
33
27
|
#endif
|
34
28
|
|
35
29
|
typedef __vector int Packet4i;
|
@@ -41,21 +35,30 @@ typedef __vector double Packet2d;
|
|
41
35
|
typedef __vector unsigned long long Packet2ul;
|
42
36
|
typedef __vector long long Packet2l;
|
43
37
|
|
38
|
+
// Z14 has builtin support for float vectors
|
39
|
+
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
|
40
|
+
typedef __vector float Packet4f;
|
41
|
+
#else
|
44
42
|
typedef struct {
|
45
43
|
Packet2d v4f[2];
|
46
44
|
} Packet4f;
|
45
|
+
#endif
|
47
46
|
|
48
47
|
typedef union {
|
49
|
-
int32_t i[4];
|
50
|
-
uint32_t ui[4];
|
51
|
-
int64_t l[2];
|
52
|
-
uint64_t ul[2];
|
48
|
+
numext::int32_t i[4];
|
49
|
+
numext::uint32_t ui[4];
|
50
|
+
numext::int64_t l[2];
|
51
|
+
numext::uint64_t ul[2];
|
53
52
|
double d[2];
|
53
|
+
float f[4];
|
54
54
|
Packet4i v4i;
|
55
55
|
Packet4ui v4ui;
|
56
56
|
Packet2l v2l;
|
57
57
|
Packet2ul v2ul;
|
58
58
|
Packet2d v2d;
|
59
|
+
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
|
60
|
+
Packet4f v4f;
|
61
|
+
#endif
|
59
62
|
} Packet;
|
60
63
|
|
61
64
|
// We don't want to write the same code all the time, but we need to reuse the constants
|
@@ -80,15 +83,31 @@ typedef union {
|
|
80
83
|
Packet2l p2l_##NAME = pset1<Packet2l>(X)
|
81
84
|
|
82
85
|
// These constants are endian-agnostic
|
83
|
-
|
86
|
+
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
|
84
87
|
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
|
85
88
|
|
86
89
|
static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0);
|
87
90
|
static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0);
|
88
91
|
static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1);
|
89
92
|
|
90
|
-
static Packet2d p2d_ONE = { 1.0, 1.0 };
|
91
|
-
static Packet2d p2d_ZERO_ = {
|
93
|
+
static Packet2d p2d_ONE = { 1.0, 1.0 };
|
94
|
+
static Packet2d p2d_ZERO_ = { numext::bit_cast<double>(0x8000000000000000ull),
|
95
|
+
numext::bit_cast<double>(0x8000000000000000ull) };
|
96
|
+
|
97
|
+
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
|
98
|
+
#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
|
99
|
+
Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
|
100
|
+
|
101
|
+
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
|
102
|
+
Packet4f p4f_##NAME = pset1<Packet4f>(X)
|
103
|
+
|
104
|
+
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
|
105
|
+
const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
|
106
|
+
|
107
|
+
static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
|
108
|
+
static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
|
109
|
+
static Packet4f p4f_MZERO = { 0x80000000, 0x80000000, 0x80000000, 0x80000000};
|
110
|
+
#endif
|
92
111
|
|
93
112
|
static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
|
94
113
|
static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
|
@@ -120,9 +139,9 @@ static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0
|
|
120
139
|
static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
|
121
140
|
static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
|
122
141
|
|
123
|
-
|
142
|
+
static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
|
124
143
|
|
125
|
-
|
144
|
+
static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
|
126
145
|
|
127
146
|
|
128
147
|
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
|
@@ -149,29 +168,31 @@ template<> struct packet_traits<int> : default_packet_traits
|
|
149
168
|
};
|
150
169
|
};
|
151
170
|
|
152
|
-
template<>
|
153
|
-
{
|
171
|
+
template <>
|
172
|
+
struct packet_traits<float> : default_packet_traits {
|
154
173
|
typedef Packet4f type;
|
155
174
|
typedef Packet4f half;
|
156
175
|
enum {
|
157
176
|
Vectorizable = 1,
|
158
177
|
AlignedOnScalar = 1,
|
159
|
-
size=4,
|
178
|
+
size = 4,
|
160
179
|
HasHalfPacket = 0,
|
161
180
|
|
162
|
-
HasAdd
|
163
|
-
HasSub
|
164
|
-
HasMul
|
165
|
-
HasDiv
|
166
|
-
HasMin
|
167
|
-
HasMax
|
168
|
-
HasAbs
|
169
|
-
HasSin
|
170
|
-
HasCos
|
171
|
-
HasLog
|
172
|
-
HasExp
|
181
|
+
HasAdd = 1,
|
182
|
+
HasSub = 1,
|
183
|
+
HasMul = 1,
|
184
|
+
HasDiv = 1,
|
185
|
+
HasMin = 1,
|
186
|
+
HasMax = 1,
|
187
|
+
HasAbs = 1,
|
188
|
+
HasSin = 0,
|
189
|
+
HasCos = 0,
|
190
|
+
HasLog = 0,
|
191
|
+
HasExp = 1,
|
173
192
|
HasSqrt = 1,
|
174
193
|
HasRsqrt = 1,
|
194
|
+
HasTanh = 1,
|
195
|
+
HasErf = 1,
|
175
196
|
HasRound = 1,
|
176
197
|
HasFloor = 1,
|
177
198
|
HasCeil = 1,
|
@@ -211,9 +232,9 @@ template<> struct packet_traits<double> : default_packet_traits
|
|
211
232
|
};
|
212
233
|
};
|
213
234
|
|
214
|
-
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
|
215
|
-
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
|
216
|
-
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
|
235
|
+
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4i half; };
|
236
|
+
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4f half; };
|
237
|
+
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; };
|
217
238
|
|
218
239
|
/* Forward declaration */
|
219
240
|
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel);
|
@@ -258,82 +279,15 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
|
|
258
279
|
return s;
|
259
280
|
}
|
260
281
|
|
261
|
-
|
262
|
-
|
263
|
-
template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from)
|
282
|
+
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
|
283
|
+
inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
|
264
284
|
{
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
splat.v4f[1] = splat.v4f[0];
|
270
|
-
break;
|
271
|
-
case 1:
|
272
|
-
splat.v4f[0] = vec_splat(from.v4f[0], 1);
|
273
|
-
splat.v4f[1] = splat.v4f[0];
|
274
|
-
break;
|
275
|
-
case 2:
|
276
|
-
splat.v4f[0] = vec_splat(from.v4f[1], 0);
|
277
|
-
splat.v4f[1] = splat.v4f[0];
|
278
|
-
break;
|
279
|
-
case 3:
|
280
|
-
splat.v4f[0] = vec_splat(from.v4f[1], 1);
|
281
|
-
splat.v4f[1] = splat.v4f[0];
|
282
|
-
break;
|
283
|
-
}
|
284
|
-
return splat;
|
285
|
+
Packet vt;
|
286
|
+
vt.v4f = v;
|
287
|
+
s << vt.f[0] << ", " << vt.f[1] << ", " << vt.f[2] << ", " << vt.f[3];
|
288
|
+
return s;
|
285
289
|
}
|
286
|
-
|
287
|
-
template<int Offset>
|
288
|
-
struct palign_impl<Offset,Packet4i>
|
289
|
-
{
|
290
|
-
static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
|
291
|
-
{
|
292
|
-
switch (Offset % 4) {
|
293
|
-
case 1:
|
294
|
-
first = vec_sld(first, second, 4); break;
|
295
|
-
case 2:
|
296
|
-
first = vec_sld(first, second, 8); break;
|
297
|
-
case 3:
|
298
|
-
first = vec_sld(first, second, 12); break;
|
299
|
-
}
|
300
|
-
}
|
301
|
-
};
|
302
|
-
|
303
|
-
/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double
|
304
|
-
*/
|
305
|
-
template<int Offset>
|
306
|
-
struct palign_impl<Offset,Packet4f>
|
307
|
-
{
|
308
|
-
static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
|
309
|
-
{
|
310
|
-
switch (Offset % 4) {
|
311
|
-
case 1:
|
312
|
-
first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8);
|
313
|
-
first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8);
|
314
|
-
break;
|
315
|
-
case 2:
|
316
|
-
first.v4f[0] = first.v4f[1];
|
317
|
-
first.v4f[1] = second.v4f[0];
|
318
|
-
break;
|
319
|
-
case 3:
|
320
|
-
first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8);
|
321
|
-
first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8);
|
322
|
-
break;
|
323
|
-
}
|
324
|
-
}
|
325
|
-
};
|
326
|
-
|
327
|
-
|
328
|
-
template<int Offset>
|
329
|
-
struct palign_impl<Offset,Packet2d>
|
330
|
-
{
|
331
|
-
static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
|
332
|
-
{
|
333
|
-
if (Offset == 1)
|
334
|
-
first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(first), reinterpret_cast<Packet4i>(second), 8));
|
335
|
-
}
|
336
|
-
};
|
290
|
+
#endif
|
337
291
|
|
338
292
|
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
|
339
293
|
{
|
@@ -344,16 +298,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
|
|
344
298
|
return vfrom->v4i;
|
345
299
|
}
|
346
300
|
|
347
|
-
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
|
348
|
-
{
|
349
|
-
// FIXME: No intrinsic yet
|
350
|
-
EIGEN_DEBUG_ALIGNED_LOAD
|
351
|
-
Packet4f vfrom;
|
352
|
-
vfrom.v4f[0] = vec_ld2f(&from[0]);
|
353
|
-
vfrom.v4f[1] = vec_ld2f(&from[2]);
|
354
|
-
return vfrom;
|
355
|
-
}
|
356
|
-
|
357
301
|
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
|
358
302
|
{
|
359
303
|
// FIXME: No intrinsic yet
|
@@ -372,15 +316,6 @@ template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& f
|
|
372
316
|
vto->v4i = from;
|
373
317
|
}
|
374
318
|
|
375
|
-
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
|
376
|
-
{
|
377
|
-
// FIXME: No intrinsic yet
|
378
|
-
EIGEN_DEBUG_ALIGNED_STORE
|
379
|
-
vec_st2f(from.v4f[0], &to[0]);
|
380
|
-
vec_st2f(from.v4f[1], &to[2]);
|
381
|
-
}
|
382
|
-
|
383
|
-
|
384
319
|
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
|
385
320
|
{
|
386
321
|
// FIXME: No intrinsic yet
|
@@ -397,13 +332,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from)
|
|
397
332
|
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
|
398
333
|
return vec_splats(from);
|
399
334
|
}
|
400
|
-
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
|
401
|
-
{
|
402
|
-
Packet4f to;
|
403
|
-
to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
|
404
|
-
to.v4f[1] = to.v4f[0];
|
405
|
-
return to;
|
406
|
-
}
|
407
335
|
|
408
336
|
template<> EIGEN_STRONG_INLINE void
|
409
337
|
pbroadcast4<Packet4i>(const int *a,
|
@@ -416,17 +344,6 @@ pbroadcast4<Packet4i>(const int *a,
|
|
416
344
|
a3 = vec_splat(a3, 3);
|
417
345
|
}
|
418
346
|
|
419
|
-
template<> EIGEN_STRONG_INLINE void
|
420
|
-
pbroadcast4<Packet4f>(const float *a,
|
421
|
-
Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
|
422
|
-
{
|
423
|
-
a3 = pload<Packet4f>(a);
|
424
|
-
a0 = vec_splat_packet4f<0>(a3);
|
425
|
-
a1 = vec_splat_packet4f<1>(a3);
|
426
|
-
a2 = vec_splat_packet4f<2>(a3);
|
427
|
-
a3 = vec_splat_packet4f<3>(a3);
|
428
|
-
}
|
429
|
-
|
430
347
|
template<> EIGEN_STRONG_INLINE void
|
431
348
|
pbroadcast4<Packet2d>(const double *a,
|
432
349
|
Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
|
@@ -449,16 +366,6 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* f
|
|
449
366
|
return pload<Packet4i>(ai);
|
450
367
|
}
|
451
368
|
|
452
|
-
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
|
453
|
-
{
|
454
|
-
float EIGEN_ALIGN16 ai[4];
|
455
|
-
ai[0] = from[0*stride];
|
456
|
-
ai[1] = from[1*stride];
|
457
|
-
ai[2] = from[2*stride];
|
458
|
-
ai[3] = from[3*stride];
|
459
|
-
return pload<Packet4f>(ai);
|
460
|
-
}
|
461
|
-
|
462
369
|
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
|
463
370
|
{
|
464
371
|
double EIGEN_ALIGN16 af[2];
|
@@ -477,16 +384,6 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
|
|
477
384
|
to[3*stride] = ai[3];
|
478
385
|
}
|
479
386
|
|
480
|
-
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
|
481
|
-
{
|
482
|
-
float EIGEN_ALIGN16 ai[4];
|
483
|
-
pstore<float>((float *)ai, from);
|
484
|
-
to[0*stride] = ai[0];
|
485
|
-
to[1*stride] = ai[1];
|
486
|
-
to[2*stride] = ai[2];
|
487
|
-
to[3*stride] = ai[3];
|
488
|
-
}
|
489
|
-
|
490
387
|
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
|
491
388
|
{
|
492
389
|
double EIGEN_ALIGN16 af[2];
|
@@ -496,160 +393,52 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to,
|
|
496
393
|
}
|
497
394
|
|
498
395
|
template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a + b); }
|
499
|
-
template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
|
500
|
-
{
|
501
|
-
Packet4f c;
|
502
|
-
c.v4f[0] = a.v4f[0] + b.v4f[0];
|
503
|
-
c.v4f[1] = a.v4f[1] + b.v4f[1];
|
504
|
-
return c;
|
505
|
-
}
|
506
396
|
template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a + b); }
|
507
397
|
|
508
398
|
template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a - b); }
|
509
|
-
template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
|
510
|
-
{
|
511
|
-
Packet4f c;
|
512
|
-
c.v4f[0] = a.v4f[0] - b.v4f[0];
|
513
|
-
c.v4f[1] = a.v4f[1] - b.v4f[1];
|
514
|
-
return c;
|
515
|
-
}
|
516
399
|
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a - b); }
|
517
400
|
|
518
401
|
template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a * b); }
|
519
|
-
template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
|
520
|
-
{
|
521
|
-
Packet4f c;
|
522
|
-
c.v4f[0] = a.v4f[0] * b.v4f[0];
|
523
|
-
c.v4f[1] = a.v4f[1] * b.v4f[1];
|
524
|
-
return c;
|
525
|
-
}
|
526
402
|
template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a * b); }
|
527
403
|
|
528
404
|
template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a / b); }
|
529
|
-
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
|
530
|
-
{
|
531
|
-
Packet4f c;
|
532
|
-
c.v4f[0] = a.v4f[0] / b.v4f[0];
|
533
|
-
c.v4f[1] = a.v4f[1] / b.v4f[1];
|
534
|
-
return c;
|
535
|
-
}
|
536
405
|
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a / b); }
|
537
406
|
|
538
407
|
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); }
|
539
|
-
template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
|
540
|
-
{
|
541
|
-
Packet4f c;
|
542
|
-
c.v4f[0] = -a.v4f[0];
|
543
|
-
c.v4f[1] = -a.v4f[1];
|
544
|
-
return c;
|
545
|
-
}
|
546
408
|
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); }
|
547
409
|
|
548
410
|
template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
|
549
|
-
template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
|
550
411
|
template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
|
551
412
|
|
552
413
|
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd<Packet4i>(pmul<Packet4i>(a, b), c); }
|
553
|
-
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
|
554
|
-
{
|
555
|
-
Packet4f res;
|
556
|
-
res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
|
557
|
-
res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
|
558
|
-
return res;
|
559
|
-
}
|
560
414
|
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
|
561
415
|
|
562
416
|
template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN); }
|
563
|
-
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
|
564
417
|
template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN); }
|
565
418
|
|
566
419
|
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
|
567
420
|
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
|
568
|
-
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
|
569
|
-
{
|
570
|
-
Packet4f res;
|
571
|
-
res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
|
572
|
-
res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
|
573
|
-
return res;
|
574
|
-
}
|
575
421
|
|
576
422
|
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
|
577
423
|
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
|
578
|
-
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
|
579
|
-
{
|
580
|
-
Packet4f res;
|
581
|
-
res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
|
582
|
-
res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
|
583
|
-
return res;
|
584
|
-
}
|
585
424
|
|
586
425
|
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
|
587
426
|
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
|
588
|
-
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
|
589
|
-
{
|
590
|
-
Packet4f res;
|
591
|
-
res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
|
592
|
-
res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
|
593
|
-
return res;
|
594
|
-
}
|
595
427
|
|
596
428
|
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
|
597
429
|
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
|
598
|
-
template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
|
599
|
-
{
|
600
|
-
Packet4f res;
|
601
|
-
res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
|
602
|
-
res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
|
603
|
-
return res;
|
604
|
-
}
|
605
430
|
|
606
431
|
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
|
607
432
|
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
|
608
|
-
template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
|
609
|
-
{
|
610
|
-
Packet4f res;
|
611
|
-
res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
|
612
|
-
res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
|
613
|
-
return res;
|
614
|
-
}
|
615
433
|
|
616
434
|
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return pand<Packet4i>(a, vec_nor(b, b)); }
|
617
435
|
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
|
618
|
-
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
|
619
|
-
{
|
620
|
-
Packet4f res;
|
621
|
-
res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
|
622
|
-
res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
|
623
|
-
return res;
|
624
|
-
}
|
625
436
|
|
626
|
-
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
|
627
|
-
{
|
628
|
-
Packet4f res;
|
629
|
-
res.v4f[0] = vec_round(a.v4f[0]);
|
630
|
-
res.v4f[1] = vec_round(a.v4f[1]);
|
631
|
-
return res;
|
632
|
-
}
|
633
437
|
template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
|
634
|
-
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
|
635
|
-
{
|
636
|
-
Packet4f res;
|
637
|
-
res.v4f[0] = vec_ceil(a.v4f[0]);
|
638
|
-
res.v4f[1] = vec_ceil(a.v4f[1]);
|
639
|
-
return res;
|
640
|
-
}
|
641
438
|
template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); }
|
642
|
-
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
|
643
|
-
{
|
644
|
-
Packet4f res;
|
645
|
-
res.v4f[0] = vec_floor(a.v4f[0]);
|
646
|
-
res.v4f[1] = vec_floor(a.v4f[1]);
|
647
|
-
return res;
|
648
|
-
}
|
649
439
|
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
|
650
440
|
|
651
441
|
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { return pload<Packet4i>(from); }
|
652
|
-
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { return pload<Packet4f>(from); }
|
653
442
|
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { return pload<Packet2d>(from); }
|
654
443
|
|
655
444
|
|
@@ -659,14 +448,6 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
|
|
659
448
|
return vec_perm(p, p, p16uc_DUPLICATE32_HI);
|
660
449
|
}
|
661
450
|
|
662
|
-
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
|
663
|
-
{
|
664
|
-
Packet4f p = pload<Packet4f>(from);
|
665
|
-
p.v4f[1] = vec_splat(p.v4f[0], 1);
|
666
|
-
p.v4f[0] = vec_splat(p.v4f[0], 0);
|
667
|
-
return p;
|
668
|
-
}
|
669
|
-
|
670
451
|
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
|
671
452
|
{
|
672
453
|
Packet2d p = pload<Packet2d>(from);
|
@@ -674,15 +455,12 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
|
|
674
455
|
}
|
675
456
|
|
676
457
|
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { pstore<int>(to, from); }
|
677
|
-
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { pstore<float>(to, from); }
|
678
458
|
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { pstore<double>(to, from); }
|
679
459
|
|
680
460
|
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
|
681
|
-
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
|
682
461
|
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
|
683
462
|
|
684
463
|
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
|
685
|
-
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
|
686
464
|
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
|
687
465
|
|
688
466
|
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
|
@@ -695,23 +473,8 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
|
|
695
473
|
return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
|
696
474
|
}
|
697
475
|
|
698
|
-
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
|
699
|
-
{
|
700
|
-
Packet4f rev;
|
701
|
-
rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
|
702
|
-
rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
|
703
|
-
return rev;
|
704
|
-
}
|
705
|
-
|
706
476
|
template<> EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) { return vec_abs(a); }
|
707
477
|
template<> EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) { return vec_abs(a); }
|
708
|
-
template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
|
709
|
-
{
|
710
|
-
Packet4f res;
|
711
|
-
res.v4f[0] = pabs(a.v4f[0]);
|
712
|
-
res.v4f[1] = pabs(a.v4f[1]);
|
713
|
-
return res;
|
714
|
-
}
|
715
478
|
|
716
479
|
template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
|
717
480
|
{
|
@@ -730,71 +493,10 @@ template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
|
|
730
493
|
sum = padd<Packet2d>(a, b);
|
731
494
|
return pfirst(sum);
|
732
495
|
}
|
733
|
-
|
734
|
-
|
735
|
-
|
736
|
-
|
737
|
-
double first = predux<Packet2d>(sum);
|
738
|
-
return static_cast<float>(first);
|
739
|
-
}
|
740
|
-
|
741
|
-
template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
|
742
|
-
{
|
743
|
-
Packet4i v[4], sum[4];
|
744
|
-
|
745
|
-
// It's easier and faster to transpose then add as columns
|
746
|
-
// Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
|
747
|
-
// Do the transpose, first set of moves
|
748
|
-
v[0] = vec_mergeh(vecs[0], vecs[2]);
|
749
|
-
v[1] = vec_mergel(vecs[0], vecs[2]);
|
750
|
-
v[2] = vec_mergeh(vecs[1], vecs[3]);
|
751
|
-
v[3] = vec_mergel(vecs[1], vecs[3]);
|
752
|
-
// Get the resulting vectors
|
753
|
-
sum[0] = vec_mergeh(v[0], v[2]);
|
754
|
-
sum[1] = vec_mergel(v[0], v[2]);
|
755
|
-
sum[2] = vec_mergeh(v[1], v[3]);
|
756
|
-
sum[3] = vec_mergel(v[1], v[3]);
|
757
|
-
|
758
|
-
// Now do the summation:
|
759
|
-
// Lines 0+1
|
760
|
-
sum[0] = padd<Packet4i>(sum[0], sum[1]);
|
761
|
-
// Lines 2+3
|
762
|
-
sum[1] = padd<Packet4i>(sum[2], sum[3]);
|
763
|
-
// Add the results
|
764
|
-
sum[0] = padd<Packet4i>(sum[0], sum[1]);
|
765
|
-
|
766
|
-
return sum[0];
|
767
|
-
}
|
768
|
-
|
769
|
-
template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
|
770
|
-
{
|
771
|
-
Packet2d v[2], sum;
|
772
|
-
v[0] = padd<Packet2d>(vecs[0], reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(vecs[0]), reinterpret_cast<Packet4ui>(vecs[0]), 8)));
|
773
|
-
v[1] = padd<Packet2d>(vecs[1], reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(vecs[1]), reinterpret_cast<Packet4ui>(vecs[1]), 8)));
|
774
|
-
|
775
|
-
sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v[0]), reinterpret_cast<Packet4ui>(v[1]), 8));
|
776
|
-
|
777
|
-
return sum;
|
778
|
-
}
|
779
|
-
|
780
|
-
template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
|
781
|
-
{
|
782
|
-
PacketBlock<Packet4f,4> transpose;
|
783
|
-
transpose.packet[0] = vecs[0];
|
784
|
-
transpose.packet[1] = vecs[1];
|
785
|
-
transpose.packet[2] = vecs[2];
|
786
|
-
transpose.packet[3] = vecs[3];
|
787
|
-
ptranspose(transpose);
|
788
|
-
|
789
|
-
Packet4f sum = padd(transpose.packet[0], transpose.packet[1]);
|
790
|
-
sum = padd(sum, transpose.packet[2]);
|
791
|
-
sum = padd(sum, transpose.packet[3]);
|
792
|
-
return sum;
|
793
|
-
}
|
794
|
-
|
795
|
-
// Other reduction functions:
|
796
|
-
// mul
|
797
|
-
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
|
496
|
+
|
497
|
+
// Other reduction functions:
|
498
|
+
// mul
|
499
|
+
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
|
798
500
|
{
|
799
501
|
EIGEN_ALIGN16 int aux[4];
|
800
502
|
pstore(aux, a);
|
@@ -806,12 +508,6 @@ template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
|
|
806
508
|
return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
|
807
509
|
}
|
808
510
|
|
809
|
-
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
|
810
|
-
{
|
811
|
-
// Return predux_mul<Packet2d> of the subvectors product
|
812
|
-
return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
|
813
|
-
}
|
814
|
-
|
815
511
|
// min
|
816
512
|
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
|
817
513
|
{
|
@@ -826,14 +522,6 @@ template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
|
|
826
522
|
return pfirst(pmin<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
|
827
523
|
}
|
828
524
|
|
829
|
-
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
|
830
|
-
{
|
831
|
-
Packet2d b, res;
|
832
|
-
b = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
|
833
|
-
res = pmin<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
|
834
|
-
return static_cast<float>(pfirst(res));
|
835
|
-
}
|
836
|
-
|
837
525
|
// max
|
838
526
|
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
|
839
527
|
{
|
@@ -849,14 +537,6 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
|
|
849
537
|
return pfirst(pmax<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
|
850
538
|
}
|
851
539
|
|
852
|
-
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
|
853
|
-
{
|
854
|
-
Packet2d b, res;
|
855
|
-
b = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
|
856
|
-
res = pmax<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
|
857
|
-
return static_cast<float>(pfirst(res));
|
858
|
-
}
|
859
|
-
|
860
540
|
EIGEN_DEVICE_FUNC inline void
|
861
541
|
ptranspose(PacketBlock<Packet4i,4>& kernel) {
|
862
542
|
Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
|
@@ -877,6 +557,282 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
|
|
877
557
|
kernel.packet[1] = t1;
|
878
558
|
}
|
879
559
|
|
560
|
+
template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
|
561
|
+
Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
|
562
|
+
Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
|
563
|
+
return vec_sel(elsePacket, thenPacket, mask);
|
564
|
+
}
|
565
|
+
|
566
|
+
|
567
|
+
template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
|
568
|
+
Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
|
569
|
+
Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
|
570
|
+
return vec_sel(elsePacket, thenPacket, mask);
|
571
|
+
}
|
572
|
+
|
573
|
+
/* z13 has no vector float support so we emulate that with double
|
574
|
+
z14 has proper vector float support.
|
575
|
+
*/
|
576
|
+
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
|
577
|
+
/* Helper function to simulate a vec_splat_packet4f
|
578
|
+
*/
|
579
|
+
template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from)
|
580
|
+
{
|
581
|
+
Packet4f splat;
|
582
|
+
switch (element) {
|
583
|
+
case 0:
|
584
|
+
splat.v4f[0] = vec_splat(from.v4f[0], 0);
|
585
|
+
splat.v4f[1] = splat.v4f[0];
|
586
|
+
break;
|
587
|
+
case 1:
|
588
|
+
splat.v4f[0] = vec_splat(from.v4f[0], 1);
|
589
|
+
splat.v4f[1] = splat.v4f[0];
|
590
|
+
break;
|
591
|
+
case 2:
|
592
|
+
splat.v4f[0] = vec_splat(from.v4f[1], 0);
|
593
|
+
splat.v4f[1] = splat.v4f[0];
|
594
|
+
break;
|
595
|
+
case 3:
|
596
|
+
splat.v4f[0] = vec_splat(from.v4f[1], 1);
|
597
|
+
splat.v4f[1] = splat.v4f[0];
|
598
|
+
break;
|
599
|
+
}
|
600
|
+
return splat;
|
601
|
+
}
|
602
|
+
|
603
|
+
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
|
604
|
+
{
|
605
|
+
// FIXME: No intrinsic yet
|
606
|
+
EIGEN_DEBUG_ALIGNED_LOAD
|
607
|
+
Packet4f vfrom;
|
608
|
+
vfrom.v4f[0] = vec_ld2f(&from[0]);
|
609
|
+
vfrom.v4f[1] = vec_ld2f(&from[2]);
|
610
|
+
return vfrom;
|
611
|
+
}
|
612
|
+
|
613
|
+
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
|
614
|
+
{
|
615
|
+
// FIXME: No intrinsic yet
|
616
|
+
EIGEN_DEBUG_ALIGNED_STORE
|
617
|
+
vec_st2f(from.v4f[0], &to[0]);
|
618
|
+
vec_st2f(from.v4f[1], &to[2]);
|
619
|
+
}
|
620
|
+
|
621
|
+
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
|
622
|
+
{
|
623
|
+
Packet4f to;
|
624
|
+
to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
|
625
|
+
to.v4f[1] = to.v4f[0];
|
626
|
+
return to;
|
627
|
+
}
|
628
|
+
|
629
|
+
template<> EIGEN_STRONG_INLINE void
|
630
|
+
pbroadcast4<Packet4f>(const float *a,
|
631
|
+
Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
|
632
|
+
{
|
633
|
+
a3 = pload<Packet4f>(a);
|
634
|
+
a0 = vec_splat_packet4f<0>(a3);
|
635
|
+
a1 = vec_splat_packet4f<1>(a3);
|
636
|
+
a2 = vec_splat_packet4f<2>(a3);
|
637
|
+
a3 = vec_splat_packet4f<3>(a3);
|
638
|
+
}
|
639
|
+
|
640
|
+
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
|
641
|
+
{
|
642
|
+
float EIGEN_ALIGN16 ai[4];
|
643
|
+
ai[0] = from[0*stride];
|
644
|
+
ai[1] = from[1*stride];
|
645
|
+
ai[2] = from[2*stride];
|
646
|
+
ai[3] = from[3*stride];
|
647
|
+
return pload<Packet4f>(ai);
|
648
|
+
}
|
649
|
+
|
650
|
+
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
|
651
|
+
{
|
652
|
+
float EIGEN_ALIGN16 ai[4];
|
653
|
+
pstore<float>((float *)ai, from);
|
654
|
+
to[0*stride] = ai[0];
|
655
|
+
to[1*stride] = ai[1];
|
656
|
+
to[2*stride] = ai[2];
|
657
|
+
to[3*stride] = ai[3];
|
658
|
+
}
|
659
|
+
|
660
|
+
template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
|
661
|
+
{
|
662
|
+
Packet4f c;
|
663
|
+
c.v4f[0] = a.v4f[0] + b.v4f[0];
|
664
|
+
c.v4f[1] = a.v4f[1] + b.v4f[1];
|
665
|
+
return c;
|
666
|
+
}
|
667
|
+
|
668
|
+
template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
|
669
|
+
{
|
670
|
+
Packet4f c;
|
671
|
+
c.v4f[0] = a.v4f[0] - b.v4f[0];
|
672
|
+
c.v4f[1] = a.v4f[1] - b.v4f[1];
|
673
|
+
return c;
|
674
|
+
}
|
675
|
+
|
676
|
+
template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
|
677
|
+
{
|
678
|
+
Packet4f c;
|
679
|
+
c.v4f[0] = a.v4f[0] * b.v4f[0];
|
680
|
+
c.v4f[1] = a.v4f[1] * b.v4f[1];
|
681
|
+
return c;
|
682
|
+
}
|
683
|
+
|
684
|
+
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
|
685
|
+
{
|
686
|
+
Packet4f c;
|
687
|
+
c.v4f[0] = a.v4f[0] / b.v4f[0];
|
688
|
+
c.v4f[1] = a.v4f[1] / b.v4f[1];
|
689
|
+
return c;
|
690
|
+
}
|
691
|
+
|
692
|
+
template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
|
693
|
+
{
|
694
|
+
Packet4f c;
|
695
|
+
c.v4f[0] = -a.v4f[0];
|
696
|
+
c.v4f[1] = -a.v4f[1];
|
697
|
+
return c;
|
698
|
+
}
|
699
|
+
|
700
|
+
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
|
701
|
+
{
|
702
|
+
Packet4f res;
|
703
|
+
res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
|
704
|
+
res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
|
705
|
+
return res;
|
706
|
+
}
|
707
|
+
|
708
|
+
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
|
709
|
+
{
|
710
|
+
Packet4f res;
|
711
|
+
res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
|
712
|
+
res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
|
713
|
+
return res;
|
714
|
+
}
|
715
|
+
|
716
|
+
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
|
717
|
+
{
|
718
|
+
Packet4f res;
|
719
|
+
res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
|
720
|
+
res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
|
721
|
+
return res;
|
722
|
+
}
|
723
|
+
|
724
|
+
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
|
725
|
+
{
|
726
|
+
Packet4f res;
|
727
|
+
res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
|
728
|
+
res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
|
729
|
+
return res;
|
730
|
+
}
|
731
|
+
|
732
|
+
template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
|
733
|
+
{
|
734
|
+
Packet4f res;
|
735
|
+
res.v4f[0] = por(a.v4f[0], b.v4f[0]);
|
736
|
+
res.v4f[1] = por(a.v4f[1], b.v4f[1]);
|
737
|
+
return res;
|
738
|
+
}
|
739
|
+
|
740
|
+
template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
|
741
|
+
{
|
742
|
+
Packet4f res;
|
743
|
+
res.v4f[0] = pxor(a.v4f[0], b.v4f[0]);
|
744
|
+
res.v4f[1] = pxor(a.v4f[1], b.v4f[1]);
|
745
|
+
return res;
|
746
|
+
}
|
747
|
+
|
748
|
+
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
|
749
|
+
{
|
750
|
+
Packet4f res;
|
751
|
+
res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
|
752
|
+
res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
|
753
|
+
return res;
|
754
|
+
}
|
755
|
+
|
756
|
+
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
|
757
|
+
{
|
758
|
+
Packet4f res;
|
759
|
+
res.v4f[0] = vec_round(a.v4f[0]);
|
760
|
+
res.v4f[1] = vec_round(a.v4f[1]);
|
761
|
+
return res;
|
762
|
+
}
|
763
|
+
|
764
|
+
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
|
765
|
+
{
|
766
|
+
Packet4f res;
|
767
|
+
res.v4f[0] = vec_ceil(a.v4f[0]);
|
768
|
+
res.v4f[1] = vec_ceil(a.v4f[1]);
|
769
|
+
return res;
|
770
|
+
}
|
771
|
+
|
772
|
+
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
|
773
|
+
{
|
774
|
+
Packet4f res;
|
775
|
+
res.v4f[0] = vec_floor(a.v4f[0]);
|
776
|
+
res.v4f[1] = vec_floor(a.v4f[1]);
|
777
|
+
return res;
|
778
|
+
}
|
779
|
+
|
780
|
+
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
|
781
|
+
{
|
782
|
+
Packet4f p = pload<Packet4f>(from);
|
783
|
+
p.v4f[1] = vec_splat(p.v4f[0], 1);
|
784
|
+
p.v4f[0] = vec_splat(p.v4f[0], 0);
|
785
|
+
return p;
|
786
|
+
}
|
787
|
+
|
788
|
+
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
|
789
|
+
|
790
|
+
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
|
791
|
+
{
|
792
|
+
Packet4f rev;
|
793
|
+
rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
|
794
|
+
rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
|
795
|
+
return rev;
|
796
|
+
}
|
797
|
+
|
798
|
+
template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
|
799
|
+
{
|
800
|
+
Packet4f res;
|
801
|
+
res.v4f[0] = pabs(a.v4f[0]);
|
802
|
+
res.v4f[1] = pabs(a.v4f[1]);
|
803
|
+
return res;
|
804
|
+
}
|
805
|
+
|
806
|
+
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
|
807
|
+
{
|
808
|
+
Packet2d sum;
|
809
|
+
sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
|
810
|
+
double first = predux<Packet2d>(sum);
|
811
|
+
return static_cast<float>(first);
|
812
|
+
}
|
813
|
+
|
814
|
+
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
|
815
|
+
{
|
816
|
+
// Return predux_mul<Packet2d> of the subvectors product
|
817
|
+
return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
|
818
|
+
}
|
819
|
+
|
820
|
+
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
|
821
|
+
{
|
822
|
+
Packet2d b, res;
|
823
|
+
b = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
|
824
|
+
res = pmin<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
|
825
|
+
return static_cast<float>(pfirst(res));
|
826
|
+
}
|
827
|
+
|
828
|
+
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
|
829
|
+
{
|
830
|
+
Packet2d b, res;
|
831
|
+
b = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
|
832
|
+
res = pmax<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
|
833
|
+
return static_cast<float>(pfirst(res));
|
834
|
+
}
|
835
|
+
|
880
836
|
/* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one
|
881
837
|
*/
|
882
838
|
EIGEN_DEVICE_FUNC inline void
|
@@ -915,12 +871,6 @@ ptranspose(PacketBlock<Packet4f,4>& kernel) {
|
|
915
871
|
kernel.packet[3].v4f[1] = t3.packet[1];
|
916
872
|
}
|
917
873
|
|
918
|
-
template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
|
919
|
-
Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
|
920
|
-
Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
|
921
|
-
return vec_sel(elsePacket, thenPacket, mask);
|
922
|
-
}
|
923
|
-
|
924
874
|
template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
|
925
875
|
Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] };
|
926
876
|
Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] };
|
@@ -932,12 +882,177 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons
|
|
932
882
|
return result;
|
933
883
|
}
|
934
884
|
|
935
|
-
template<> EIGEN_STRONG_INLINE
|
936
|
-
|
937
|
-
|
885
|
+
template<> Packet4f EIGEN_STRONG_INLINE pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b)
|
886
|
+
{
|
887
|
+
Packet4f res;
|
888
|
+
res.v4f[0] = pcmp_le(a.v4f[0], b.v4f[0]);
|
889
|
+
res.v4f[1] = pcmp_le(a.v4f[1], b.v4f[1]);
|
890
|
+
return res;
|
891
|
+
}
|
892
|
+
|
893
|
+
// Lane-wise a < b for the split Packet4f representation: the comparison is
// forwarded to each of the two v4f halves independently.
template<> Packet4f EIGEN_STRONG_INLINE pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  Packet4f out;
  for (int half = 0; half < 2; ++half) {
    out.v4f[half] = pcmp_lt(a.v4f[half], b.v4f[half]);
  }
  return out;
}
|
900
|
+
|
901
|
+
// Lane-wise a == b for the split Packet4f representation: the comparison is
// forwarded to each of the two v4f halves independently.
template<> Packet4f EIGEN_STRONG_INLINE pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  Packet4f out;
  for (int half = 0; half < 2; ++half) {
    out.v4f[half] = pcmp_eq(a.v4f[half], b.v4f[half]);
  }
  return out;
}
|
908
|
+
|
909
|
+
#else
|
910
|
+
// Aligned load of four floats starting at `from`.
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
{
  // FIXME: No intrinsic yet
  EIGEN_DEBUG_ALIGNED_LOAD
  // Read the memory through the Packet union and return its v4f member.
  Packet* const src = (Packet *) from;
  return src->v4f;
}
|
918
|
+
|
919
|
+
// Aligned store of the four floats in `from` to the memory at `to`.
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
{
  // FIXME: No intrinsic yet
  EIGEN_DEBUG_ALIGNED_STORE
  // Write the destination through the Packet union by assigning its v4f member.
  Packet* const dst = (Packet *) to;
  dst->v4f = from;
}
|
927
|
+
|
928
|
+
// Builds a Packet4f from a single scalar by handing it to vec_splats.
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
{
  const Packet4f replicated = vec_splats(from);
  return replicated;
}
|
932
|
+
|
933
|
+
// Loads four consecutive floats from `a` and splats element i into output ai.
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4f>(const float *a,
                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
{
  // a3 temporarily holds the loaded quad, so it must be overwritten last.
  a3 = pload<Packet4f>(a);
  a0 = vec_splat(a3, 0);
  a1 = vec_splat(a3, 1);
  a2 = vec_splat(a3, 2);
  // Finally replace the scratch contents with the splat of element 3.
  a3 = vec_splat(a3, 3);
}
|
943
|
+
|
944
|
+
// Strided gather: collects from[0], from[stride], from[2*stride] and
// from[3*stride] into one Packet4f.
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{
  // Stage the elements in an aligned buffer so the aligned pload can be used.
  float EIGEN_ALIGN16 staged[4];
  for (Index lane = 0; lane < 4; ++lane) {
    staged[lane] = from[lane*stride];
  }
  return pload<Packet4f>(staged);
}
|
953
|
+
|
954
|
+
// Strided scatter: writes the four lanes of `from` to to[0], to[stride],
// to[2*stride] and to[3*stride].
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
{
  // Spill the packet to an aligned buffer, then copy each lane to its slot.
  float EIGEN_ALIGN16 staged[4];
  pstore<float>(staged, from);
  for (Index lane = 0; lane < 4; ++lane) {
    to[lane*stride] = staged[lane];
  }
}
|
963
|
+
|
964
|
+
// ---- Arithmetic, expressed with the vector type's built-in operators ----

// a + b
template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return a + b;
}

// a - b
template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return a - b;
}

// a * b
template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return a * b;
}

// a / b
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return a / b;
}

// -a
template<> EIGEN_STRONG_INLINE Packet4f pnegate<Packet4f>(const Packet4f& a)
{
  return -a;
}

// Conjugate: the input is returned unchanged.
template<> EIGEN_STRONG_INLINE Packet4f pconj<Packet4f>(const Packet4f& a)
{
  return a;
}

// ---- Thin wrappers, each forwarding to the matching vec_* intrinsic ----

// Multiply-add of (a, b, c) via vec_madd.
template<> EIGEN_STRONG_INLINE Packet4f pmadd<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& c)
{
  return vec_madd(a, b, c);
}

template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return vec_min(a, b);
}

template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return vec_max(a, b);
}

template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return vec_and(a, b);
}

template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return vec_or(a, b);
}

template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return vec_xor(a, b);
}

// a AND (NOT b); the complement of b is formed as vec_nor(b, b).
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  const Packet4f not_b = vec_nor(b, b);
  return vec_and(a, not_b);
}

template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
{
  return vec_round(a);
}

template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
{
  return vec_ceil(a);
}

template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
{
  return vec_floor(a);
}

template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
{
  return vec_abs(a);
}

// Extracts element 0 by spilling the packet to an aligned buffer.
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a)
{
  float EIGEN_ALIGN16 lanes[4];
  pstore(lanes, a);
  return lanes[0];
}
|
982
|
+
|
983
|
+
// Loads a packet from `from` and permutes it with itself using the
// p16uc_DUPLICATE32_HI byte pattern (defined elsewhere in this file).
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
  const Packet4f loaded = pload<Packet4f>(from);
  return vec_perm(loaded, loaded, p16uc_DUPLICATE32_HI);
}
|
988
|
+
|
989
|
+
// Permutes the packet with itself using the p16uc_REVERSE32 byte pattern
// (defined elsewhere in this file); the permute runs on the byte view.
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
{
  const Packet16uc bytes = reinterpret_cast<Packet16uc>(a);
  const Packet16uc permuted = vec_perm(bytes, bytes, p16uc_REVERSE32);
  return reinterpret_cast<Packet4f>(permuted);
}
|
993
|
+
|
994
|
+
// Horizontal sum of the four lanes, done in two shift-and-add steps.
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
  // Step 1: add the packet to itself shifted by 8 bytes (two floats).
  const Packet4f pair_sums = padd<Packet4f>(a, vec_sld(a, a, 8));
  // Step 2: add that to itself shifted by 4 bytes (one float).
  const Packet4f total = padd<Packet4f>(pair_sums, vec_sld(pair_sums, pair_sums, 4));
  // The reduced value is read from element 0.
  return pfirst(total);
}
|
1003
|
+
|
1004
|
+
// Other reduction functions:
|
1005
|
+
// mul
|
1006
|
+
// Horizontal product of the four lanes, done in two shift-and-multiply steps.
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
{
  // Step 1: multiply the packet by itself shifted by 8 bytes (two floats).
  const Packet4f shifted_by_two = vec_sld(a, a, 8);
  const Packet4f pair_products = pmul(a, shifted_by_two);
  // Step 2: multiply that by itself shifted by 4 bytes (one float).
  const Packet4f shifted_by_one = vec_sld(pair_products, pair_products, 4);
  return pfirst(pmul(pair_products, shifted_by_one));
}
|
1012
|
+
|
1013
|
+
// min
|
1014
|
+
// Horizontal minimum of the four lanes, done in two shift-and-min steps.
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
{
  // Step 1: min of the packet and itself shifted by 8 bytes (two floats).
  const Packet4f pair_mins = pmin<Packet4f>(a, vec_sld(a, a, 8));
  // Step 2: min of that and itself shifted by 4 bytes (one float).
  const Packet4f overall = pmin<Packet4f>(pair_mins, vec_sld(pair_mins, pair_mins, 4));
  return pfirst(overall);
}
|
1021
|
+
|
1022
|
+
// max
|
1023
|
+
// Horizontal maximum of the four lanes, done in two shift-and-max steps.
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
{
  // Step 1: max of the packet and itself shifted by 8 bytes (two floats).
  const Packet4f pair_maxes = pmax<Packet4f>(a, vec_sld(a, a, 8));
  // Step 2: max of that and itself shifted by 4 bytes (one float).
  const Packet4f overall = pmax<Packet4f>(pair_maxes, vec_sld(pair_maxes, pair_maxes, 4));
  return pfirst(overall);
}
|
1030
|
+
|
1031
|
+
// In-place transpose of a 4x4 block of floats held as four Packet4f rows,
// built from two rounds of vec_mergeh / vec_mergel.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4f,4>& kernel) {
  // Round 1: merge rows 0 with 2 and rows 1 with 3.
  const Packet4f rows02_hi = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  const Packet4f rows02_lo = vec_mergel(kernel.packet[0], kernel.packet[2]);
  const Packet4f rows13_hi = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  const Packet4f rows13_lo = vec_mergel(kernel.packet[1], kernel.packet[3]);

  // Round 2: merge the intermediates pairwise to produce the output rows.
  kernel.packet[0] = vec_mergeh(rows02_hi, rows13_hi);
  kernel.packet[1] = vec_mergel(rows02_hi, rows13_hi);
  kernel.packet[2] = vec_mergeh(rows02_lo, rows13_lo);
  kernel.packet[3] = vec_mergel(rows02_lo, rows13_lo);
}
|
1042
|
+
|
1043
|
+
// Per-lane select between two Packet4f values: lanes whose selector entry
// equals one take thenPacket, all other lanes take elsePacket.
template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
  // Copy the four selector entries into an unsigned vector.
  Packet4ui flags = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
  // Compare each entry against the p4i_ONE constant to build the lane mask.
  Packet4ui take_then = vec_cmpeq(flags, reinterpret_cast<Packet4ui>(p4i_ONE));
  return vec_sel(elsePacket, thenPacket, take_then);
}
|
940
1048
|
|
1049
|
+
#endif
|
1050
|
+
|
1051
|
+
// Prefetch hint for `addr`, issued through the EIGEN_ZVECTOR_PREFETCH macro.
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr)
{
  EIGEN_ZVECTOR_PREFETCH(addr);
}

// Unaligned load: forwards to pload<Packet4f>.
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
{
  return pload<Packet4f>(from);
}

// Unaligned store: forwards to pstore<float>.
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
{
  pstore<float>(to, from);
}

// Broadcasts `a` and adds the p4f_COUNTDOWN constant (defined elsewhere in
// this file; presumably the per-lane offsets 0..3 - confirm at its definition).
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
{
  const Packet4f base = pset1<Packet4f>(a);
  return padd<Packet4f>(base, p4f_COUNTDOWN);
}
|
1055
|
+
|
941
1056
|
} // end namespace internal
|
942
1057
|
|
943
1058
|
} // end namespace Eigen
|