tomoto 0.1.0
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +123 -0
- data/ext/tomoto/ext.cpp +245 -0
- data/ext/tomoto/extconf.rb +28 -0
- data/lib/tomoto.rb +12 -0
- data/lib/tomoto/ct.rb +11 -0
- data/lib/tomoto/hdp.rb +11 -0
- data/lib/tomoto/lda.rb +67 -0
- data/lib/tomoto/version.rb +3 -0
- data/vendor/EigenRand/EigenRand/Core.h +1139 -0
- data/vendor/EigenRand/EigenRand/Dists/Basic.h +111 -0
- data/vendor/EigenRand/EigenRand/Dists/Discrete.h +877 -0
- data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +108 -0
- data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +626 -0
- data/vendor/EigenRand/EigenRand/EigenRand +19 -0
- data/vendor/EigenRand/EigenRand/Macro.h +24 -0
- data/vendor/EigenRand/EigenRand/MorePacketMath.h +978 -0
- data/vendor/EigenRand/EigenRand/PacketFilter.h +286 -0
- data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +624 -0
- data/vendor/EigenRand/EigenRand/RandUtils.h +413 -0
- data/vendor/EigenRand/EigenRand/doc.h +220 -0
- data/vendor/EigenRand/LICENSE +21 -0
- data/vendor/EigenRand/README.md +288 -0
- data/vendor/eigen/COPYING.BSD +26 -0
- data/vendor/eigen/COPYING.GPL +674 -0
- data/vendor/eigen/COPYING.LGPL +502 -0
- data/vendor/eigen/COPYING.MINPACK +52 -0
- data/vendor/eigen/COPYING.MPL2 +373 -0
- data/vendor/eigen/COPYING.README +18 -0
- data/vendor/eigen/Eigen/CMakeLists.txt +19 -0
- data/vendor/eigen/Eigen/Cholesky +46 -0
- data/vendor/eigen/Eigen/CholmodSupport +48 -0
- data/vendor/eigen/Eigen/Core +537 -0
- data/vendor/eigen/Eigen/Dense +7 -0
- data/vendor/eigen/Eigen/Eigen +2 -0
- data/vendor/eigen/Eigen/Eigenvalues +61 -0
- data/vendor/eigen/Eigen/Geometry +62 -0
- data/vendor/eigen/Eigen/Householder +30 -0
- data/vendor/eigen/Eigen/IterativeLinearSolvers +48 -0
- data/vendor/eigen/Eigen/Jacobi +33 -0
- data/vendor/eigen/Eigen/LU +50 -0
- data/vendor/eigen/Eigen/MetisSupport +35 -0
- data/vendor/eigen/Eigen/OrderingMethods +73 -0
- data/vendor/eigen/Eigen/PaStiXSupport +48 -0
- data/vendor/eigen/Eigen/PardisoSupport +35 -0
- data/vendor/eigen/Eigen/QR +51 -0
- data/vendor/eigen/Eigen/QtAlignedMalloc +40 -0
- data/vendor/eigen/Eigen/SPQRSupport +34 -0
- data/vendor/eigen/Eigen/SVD +51 -0
- data/vendor/eigen/Eigen/Sparse +36 -0
- data/vendor/eigen/Eigen/SparseCholesky +45 -0
- data/vendor/eigen/Eigen/SparseCore +69 -0
- data/vendor/eigen/Eigen/SparseLU +46 -0
- data/vendor/eigen/Eigen/SparseQR +37 -0
- data/vendor/eigen/Eigen/StdDeque +27 -0
- data/vendor/eigen/Eigen/StdList +26 -0
- data/vendor/eigen/Eigen/StdVector +27 -0
- data/vendor/eigen/Eigen/SuperLUSupport +64 -0
- data/vendor/eigen/Eigen/UmfPackSupport +40 -0
- data/vendor/eigen/Eigen/src/Cholesky/LDLT.h +673 -0
- data/vendor/eigen/Eigen/src/Cholesky/LLT.h +542 -0
- data/vendor/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +99 -0
- data/vendor/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +639 -0
- data/vendor/eigen/Eigen/src/Core/Array.h +329 -0
- data/vendor/eigen/Eigen/src/Core/ArrayBase.h +226 -0
- data/vendor/eigen/Eigen/src/Core/ArrayWrapper.h +209 -0
- data/vendor/eigen/Eigen/src/Core/Assign.h +90 -0
- data/vendor/eigen/Eigen/src/Core/AssignEvaluator.h +935 -0
- data/vendor/eigen/Eigen/src/Core/Assign_MKL.h +178 -0
- data/vendor/eigen/Eigen/src/Core/BandMatrix.h +353 -0
- data/vendor/eigen/Eigen/src/Core/Block.h +452 -0
- data/vendor/eigen/Eigen/src/Core/BooleanRedux.h +164 -0
- data/vendor/eigen/Eigen/src/Core/CommaInitializer.h +160 -0
- data/vendor/eigen/Eigen/src/Core/ConditionEstimator.h +175 -0
- data/vendor/eigen/Eigen/src/Core/CoreEvaluators.h +1688 -0
- data/vendor/eigen/Eigen/src/Core/CoreIterators.h +127 -0
- data/vendor/eigen/Eigen/src/Core/CwiseBinaryOp.h +184 -0
- data/vendor/eigen/Eigen/src/Core/CwiseNullaryOp.h +866 -0
- data/vendor/eigen/Eigen/src/Core/CwiseTernaryOp.h +197 -0
- data/vendor/eigen/Eigen/src/Core/CwiseUnaryOp.h +103 -0
- data/vendor/eigen/Eigen/src/Core/CwiseUnaryView.h +128 -0
- data/vendor/eigen/Eigen/src/Core/DenseBase.h +611 -0
- data/vendor/eigen/Eigen/src/Core/DenseCoeffsBase.h +681 -0
- data/vendor/eigen/Eigen/src/Core/DenseStorage.h +570 -0
- data/vendor/eigen/Eigen/src/Core/Diagonal.h +260 -0
- data/vendor/eigen/Eigen/src/Core/DiagonalMatrix.h +343 -0
- data/vendor/eigen/Eigen/src/Core/DiagonalProduct.h +28 -0
- data/vendor/eigen/Eigen/src/Core/Dot.h +318 -0
- data/vendor/eigen/Eigen/src/Core/EigenBase.h +159 -0
- data/vendor/eigen/Eigen/src/Core/ForceAlignedAccess.h +146 -0
- data/vendor/eigen/Eigen/src/Core/Fuzzy.h +155 -0
- data/vendor/eigen/Eigen/src/Core/GeneralProduct.h +455 -0
- data/vendor/eigen/Eigen/src/Core/GenericPacketMath.h +593 -0
- data/vendor/eigen/Eigen/src/Core/GlobalFunctions.h +187 -0
- data/vendor/eigen/Eigen/src/Core/IO.h +225 -0
- data/vendor/eigen/Eigen/src/Core/Inverse.h +118 -0
- data/vendor/eigen/Eigen/src/Core/Map.h +171 -0
- data/vendor/eigen/Eigen/src/Core/MapBase.h +303 -0
- data/vendor/eigen/Eigen/src/Core/MathFunctions.h +1415 -0
- data/vendor/eigen/Eigen/src/Core/MathFunctionsImpl.h +101 -0
- data/vendor/eigen/Eigen/src/Core/Matrix.h +459 -0
- data/vendor/eigen/Eigen/src/Core/MatrixBase.h +529 -0
- data/vendor/eigen/Eigen/src/Core/NestByValue.h +110 -0
- data/vendor/eigen/Eigen/src/Core/NoAlias.h +108 -0
- data/vendor/eigen/Eigen/src/Core/NumTraits.h +248 -0
- data/vendor/eigen/Eigen/src/Core/PermutationMatrix.h +633 -0
- data/vendor/eigen/Eigen/src/Core/PlainObjectBase.h +1035 -0
- data/vendor/eigen/Eigen/src/Core/Product.h +186 -0
- data/vendor/eigen/Eigen/src/Core/ProductEvaluators.h +1112 -0
- data/vendor/eigen/Eigen/src/Core/Random.h +182 -0
- data/vendor/eigen/Eigen/src/Core/Redux.h +505 -0
- data/vendor/eigen/Eigen/src/Core/Ref.h +283 -0
- data/vendor/eigen/Eigen/src/Core/Replicate.h +142 -0
- data/vendor/eigen/Eigen/src/Core/ReturnByValue.h +117 -0
- data/vendor/eigen/Eigen/src/Core/Reverse.h +211 -0
- data/vendor/eigen/Eigen/src/Core/Select.h +162 -0
- data/vendor/eigen/Eigen/src/Core/SelfAdjointView.h +352 -0
- data/vendor/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +47 -0
- data/vendor/eigen/Eigen/src/Core/Solve.h +188 -0
- data/vendor/eigen/Eigen/src/Core/SolveTriangular.h +235 -0
- data/vendor/eigen/Eigen/src/Core/SolverBase.h +130 -0
- data/vendor/eigen/Eigen/src/Core/StableNorm.h +221 -0
- data/vendor/eigen/Eigen/src/Core/Stride.h +111 -0
- data/vendor/eigen/Eigen/src/Core/Swap.h +67 -0
- data/vendor/eigen/Eigen/src/Core/Transpose.h +403 -0
- data/vendor/eigen/Eigen/src/Core/Transpositions.h +407 -0
- data/vendor/eigen/Eigen/src/Core/TriangularMatrix.h +983 -0
- data/vendor/eigen/Eigen/src/Core/VectorBlock.h +96 -0
- data/vendor/eigen/Eigen/src/Core/VectorwiseOp.h +695 -0
- data/vendor/eigen/Eigen/src/Core/Visitor.h +273 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX/Complex.h +451 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +439 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +637 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +51 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +391 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1316 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +430 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +322 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +1061 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/Complex.h +103 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/Half.h +674 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +91 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +333 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +1124 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +212 -0
- data/vendor/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +29 -0
- data/vendor/eigen/Eigen/src/Core/arch/Default/Settings.h +49 -0
- data/vendor/eigen/Eigen/src/Core/arch/NEON/Complex.h +490 -0
- data/vendor/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +91 -0
- data/vendor/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +760 -0
- data/vendor/eigen/Eigen/src/Core/arch/SSE/Complex.h +471 -0
- data/vendor/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +562 -0
- data/vendor/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +895 -0
- data/vendor/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +77 -0
- data/vendor/eigen/Eigen/src/Core/arch/ZVector/Complex.h +397 -0
- data/vendor/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +137 -0
- data/vendor/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +945 -0
- data/vendor/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +168 -0
- data/vendor/eigen/Eigen/src/Core/functors/BinaryFunctors.h +475 -0
- data/vendor/eigen/Eigen/src/Core/functors/NullaryFunctors.h +188 -0
- data/vendor/eigen/Eigen/src/Core/functors/StlFunctors.h +136 -0
- data/vendor/eigen/Eigen/src/Core/functors/TernaryFunctors.h +25 -0
- data/vendor/eigen/Eigen/src/Core/functors/UnaryFunctors.h +792 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2156 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +492 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +311 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +145 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +122 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +619 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +136 -0
- data/vendor/eigen/Eigen/src/Core/products/Parallelizer.h +163 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +521 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +287 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +260 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +118 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointProduct.h +133 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +93 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +466 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +315 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +350 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +255 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +335 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +163 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularSolverVector.h +145 -0
- data/vendor/eigen/Eigen/src/Core/util/BlasUtil.h +398 -0
- data/vendor/eigen/Eigen/src/Core/util/Constants.h +547 -0
- data/vendor/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +83 -0
- data/vendor/eigen/Eigen/src/Core/util/ForwardDeclarations.h +302 -0
- data/vendor/eigen/Eigen/src/Core/util/MKL_support.h +130 -0
- data/vendor/eigen/Eigen/src/Core/util/Macros.h +1001 -0
- data/vendor/eigen/Eigen/src/Core/util/Memory.h +993 -0
- data/vendor/eigen/Eigen/src/Core/util/Meta.h +534 -0
- data/vendor/eigen/Eigen/src/Core/util/NonMPL2.h +3 -0
- data/vendor/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +27 -0
- data/vendor/eigen/Eigen/src/Core/util/StaticAssert.h +218 -0
- data/vendor/eigen/Eigen/src/Core/util/XprHelper.h +821 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +346 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +459 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +91 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/EigenSolver.h +622 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +418 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +226 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +374 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +158 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/RealQZ.h +654 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur.h +546 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +77 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +870 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +87 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +556 -0
- data/vendor/eigen/Eigen/src/Geometry/AlignedBox.h +392 -0
- data/vendor/eigen/Eigen/src/Geometry/AngleAxis.h +247 -0
- data/vendor/eigen/Eigen/src/Geometry/EulerAngles.h +114 -0
- data/vendor/eigen/Eigen/src/Geometry/Homogeneous.h +497 -0
- data/vendor/eigen/Eigen/src/Geometry/Hyperplane.h +282 -0
- data/vendor/eigen/Eigen/src/Geometry/OrthoMethods.h +234 -0
- data/vendor/eigen/Eigen/src/Geometry/ParametrizedLine.h +195 -0
- data/vendor/eigen/Eigen/src/Geometry/Quaternion.h +814 -0
- data/vendor/eigen/Eigen/src/Geometry/Rotation2D.h +199 -0
- data/vendor/eigen/Eigen/src/Geometry/RotationBase.h +206 -0
- data/vendor/eigen/Eigen/src/Geometry/Scaling.h +170 -0
- data/vendor/eigen/Eigen/src/Geometry/Transform.h +1542 -0
- data/vendor/eigen/Eigen/src/Geometry/Translation.h +208 -0
- data/vendor/eigen/Eigen/src/Geometry/Umeyama.h +166 -0
- data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +161 -0
- data/vendor/eigen/Eigen/src/Householder/BlockHouseholder.h +103 -0
- data/vendor/eigen/Eigen/src/Householder/Householder.h +172 -0
- data/vendor/eigen/Eigen/src/Householder/HouseholderSequence.h +470 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +226 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +228 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +246 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +400 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +462 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +394 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +216 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +115 -0
- data/vendor/eigen/Eigen/src/Jacobi/Jacobi.h +462 -0
- data/vendor/eigen/Eigen/src/LU/Determinant.h +101 -0
- data/vendor/eigen/Eigen/src/LU/FullPivLU.h +891 -0
- data/vendor/eigen/Eigen/src/LU/InverseImpl.h +415 -0
- data/vendor/eigen/Eigen/src/LU/PartialPivLU.h +611 -0
- data/vendor/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +83 -0
- data/vendor/eigen/Eigen/src/LU/arch/Inverse_SSE.h +338 -0
- data/vendor/eigen/Eigen/src/MetisSupport/MetisSupport.h +137 -0
- data/vendor/eigen/Eigen/src/OrderingMethods/Amd.h +445 -0
- data/vendor/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +1843 -0
- data/vendor/eigen/Eigen/src/OrderingMethods/Ordering.h +157 -0
- data/vendor/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +678 -0
- data/vendor/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +543 -0
- data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR.h +653 -0
- data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +97 -0
- data/vendor/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +562 -0
- data/vendor/eigen/Eigen/src/QR/FullPivHouseholderQR.h +676 -0
- data/vendor/eigen/Eigen/src/QR/HouseholderQR.h +409 -0
- data/vendor/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +68 -0
- data/vendor/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +313 -0
- data/vendor/eigen/Eigen/src/SVD/BDCSVD.h +1246 -0
- data/vendor/eigen/Eigen/src/SVD/JacobiSVD.h +804 -0
- data/vendor/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +91 -0
- data/vendor/eigen/Eigen/src/SVD/SVDBase.h +315 -0
- data/vendor/eigen/Eigen/src/SVD/UpperBidiagonalization.h +414 -0
- data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +689 -0
- data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +199 -0
- data/vendor/eigen/Eigen/src/SparseCore/AmbiVector.h +377 -0
- data/vendor/eigen/Eigen/src/SparseCore/CompressedStorage.h +258 -0
- data/vendor/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +352 -0
- data/vendor/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +67 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseAssign.h +216 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseBlock.h +603 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseColEtree.h +206 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +341 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +726 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +148 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +320 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +138 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseDot.h +98 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseFuzzy.h +29 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseMap.h +305 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseMatrix.h +1403 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +405 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparsePermutation.h +178 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseProduct.h +169 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseRedux.h +49 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseRef.h +397 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +656 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseSolverBase.h +124 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +198 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseTranspose.h +92 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseTriangularView.h +189 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseUtil.h +178 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseVector.h +478 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseView.h +253 -0
- data/vendor/eigen/Eigen/src/SparseCore/TriangularSolver.h +315 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU.h +773 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLUImpl.h +66 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +226 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +110 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +301 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +80 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +181 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +179 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +107 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +280 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +126 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +130 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +223 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +258 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +137 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +136 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +83 -0
- data/vendor/eigen/Eigen/src/SparseQR/SparseQR.h +745 -0
- data/vendor/eigen/Eigen/src/StlSupport/StdDeque.h +126 -0
- data/vendor/eigen/Eigen/src/StlSupport/StdList.h +106 -0
- data/vendor/eigen/Eigen/src/StlSupport/StdVector.h +131 -0
- data/vendor/eigen/Eigen/src/StlSupport/details.h +84 -0
- data/vendor/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +1027 -0
- data/vendor/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +506 -0
- data/vendor/eigen/Eigen/src/misc/Image.h +82 -0
- data/vendor/eigen/Eigen/src/misc/Kernel.h +79 -0
- data/vendor/eigen/Eigen/src/misc/RealSvd2x2.h +55 -0
- data/vendor/eigen/Eigen/src/misc/blas.h +440 -0
- data/vendor/eigen/Eigen/src/misc/lapack.h +152 -0
- data/vendor/eigen/Eigen/src/misc/lapacke.h +16291 -0
- data/vendor/eigen/Eigen/src/misc/lapacke_mangling.h +17 -0
- data/vendor/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +332 -0
- data/vendor/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +552 -0
- data/vendor/eigen/Eigen/src/plugins/BlockMethods.h +1058 -0
- data/vendor/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +115 -0
- data/vendor/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +163 -0
- data/vendor/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +152 -0
- data/vendor/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +85 -0
- data/vendor/eigen/README.md +3 -0
- data/vendor/eigen/bench/README.txt +55 -0
- data/vendor/eigen/bench/btl/COPYING +340 -0
- data/vendor/eigen/bench/btl/README +154 -0
- data/vendor/eigen/bench/tensors/README +21 -0
- data/vendor/eigen/blas/README.txt +6 -0
- data/vendor/eigen/demos/mandelbrot/README +10 -0
- data/vendor/eigen/demos/mix_eigen_and_c/README +9 -0
- data/vendor/eigen/demos/opengl/README +13 -0
- data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +1760 -0
- data/vendor/eigen/unsupported/README.txt +50 -0
- data/vendor/tomotopy/LICENSE +21 -0
- data/vendor/tomotopy/README.kr.rst +375 -0
- data/vendor/tomotopy/README.rst +382 -0
- data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +362 -0
- data/vendor/tomotopy/src/Labeling/FoRelevance.h +88 -0
- data/vendor/tomotopy/src/Labeling/Labeler.h +50 -0
- data/vendor/tomotopy/src/TopicModel/CT.h +37 -0
- data/vendor/tomotopy/src/TopicModel/CTModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/CTModel.hpp +293 -0
- data/vendor/tomotopy/src/TopicModel/DMR.h +51 -0
- data/vendor/tomotopy/src/TopicModel/DMRModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +374 -0
- data/vendor/tomotopy/src/TopicModel/DT.h +65 -0
- data/vendor/tomotopy/src/TopicModel/DTM.h +22 -0
- data/vendor/tomotopy/src/TopicModel/DTModel.cpp +15 -0
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +572 -0
- data/vendor/tomotopy/src/TopicModel/GDMR.h +37 -0
- data/vendor/tomotopy/src/TopicModel/GDMRModel.cpp +14 -0
- data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +485 -0
- data/vendor/tomotopy/src/TopicModel/HDP.h +74 -0
- data/vendor/tomotopy/src/TopicModel/HDPModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +592 -0
- data/vendor/tomotopy/src/TopicModel/HLDA.h +40 -0
- data/vendor/tomotopy/src/TopicModel/HLDAModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +681 -0
- data/vendor/tomotopy/src/TopicModel/HPA.h +27 -0
- data/vendor/tomotopy/src/TopicModel/HPAModel.cpp +21 -0
- data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +588 -0
- data/vendor/tomotopy/src/TopicModel/LDA.h +144 -0
- data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +442 -0
- data/vendor/tomotopy/src/TopicModel/LDAModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +1058 -0
- data/vendor/tomotopy/src/TopicModel/LLDA.h +45 -0
- data/vendor/tomotopy/src/TopicModel/LLDAModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +203 -0
- data/vendor/tomotopy/src/TopicModel/MGLDA.h +63 -0
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.cpp +17 -0
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +558 -0
- data/vendor/tomotopy/src/TopicModel/PA.h +43 -0
- data/vendor/tomotopy/src/TopicModel/PAModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/PAModel.hpp +467 -0
- data/vendor/tomotopy/src/TopicModel/PLDA.h +17 -0
- data/vendor/tomotopy/src/TopicModel/PLDAModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +214 -0
- data/vendor/tomotopy/src/TopicModel/SLDA.h +54 -0
- data/vendor/tomotopy/src/TopicModel/SLDAModel.cpp +17 -0
- data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +456 -0
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +692 -0
- data/vendor/tomotopy/src/Utils/AliasMethod.hpp +169 -0
- data/vendor/tomotopy/src/Utils/Dictionary.h +80 -0
- data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +181 -0
- data/vendor/tomotopy/src/Utils/LBFGS.h +202 -0
- data/vendor/tomotopy/src/Utils/LBFGS/LineSearchBacktracking.h +120 -0
- data/vendor/tomotopy/src/Utils/LBFGS/LineSearchBracketing.h +122 -0
- data/vendor/tomotopy/src/Utils/LBFGS/Param.h +213 -0
- data/vendor/tomotopy/src/Utils/LUT.hpp +82 -0
- data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +69 -0
- data/vendor/tomotopy/src/Utils/PolyaGamma.hpp +200 -0
- data/vendor/tomotopy/src/Utils/PolyaGammaHybrid.hpp +672 -0
- data/vendor/tomotopy/src/Utils/ThreadPool.hpp +150 -0
- data/vendor/tomotopy/src/Utils/Trie.hpp +220 -0
- data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +94 -0
- data/vendor/tomotopy/src/Utils/Utils.hpp +337 -0
- data/vendor/tomotopy/src/Utils/avx_gamma.h +46 -0
- data/vendor/tomotopy/src/Utils/avx_mathfun.h +736 -0
- data/vendor/tomotopy/src/Utils/exception.h +28 -0
- data/vendor/tomotopy/src/Utils/math.h +281 -0
- data/vendor/tomotopy/src/Utils/rtnorm.hpp +2690 -0
- data/vendor/tomotopy/src/Utils/sample.hpp +192 -0
- data/vendor/tomotopy/src/Utils/serializer.hpp +695 -0
- data/vendor/tomotopy/src/Utils/slp.hpp +131 -0
- data/vendor/tomotopy/src/Utils/sse_gamma.h +48 -0
- data/vendor/tomotopy/src/Utils/sse_mathfun.h +710 -0
- data/vendor/tomotopy/src/Utils/text.hpp +49 -0
- data/vendor/tomotopy/src/Utils/tvector.hpp +543 -0
- metadata +531 -0
data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
@@ -0,0 +1,122 @@
/*
 Copyright (c) 2011, Intel Corporation. All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
 are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.
 * Neither the name of Intel Corporation nor the names of its contributors may
   be used to endorse or promote products derived from this software without
   specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 ********************************************************************************
 * Content : Eigen bindings to BLAS F77
 * General matrix-matrix product functionality based on ?GEMM.
 ********************************************************************************
*/

#ifndef EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
#define EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H

namespace Eigen {

namespace internal {

/**********************************************************************
* This file implements general matrix-matrix multiplication using BLAS
* gemm function via partial specialization of
* general_matrix_matrix_product::run(..) method for float, double,
* std::complex<float> and std::complex<double> types
**********************************************************************/

// gemm specialization

#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASFUNC) \
template< \
  typename Index, \
  int LhsStorageOrder, bool ConjugateLhs, \
  int RhsStorageOrder, bool ConjugateRhs> \
struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor> \
{ \
  typedef gebp_traits<EIGTYPE,EIGTYPE> Traits; \
\
  static void run(Index rows, Index cols, Index depth, \
    const EIGTYPE* _lhs, Index lhsStride, \
    const EIGTYPE* _rhs, Index rhsStride, \
    EIGTYPE* res, Index resStride, \
    EIGTYPE alpha, \
    level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, \
    GemmParallelInfo<Index>* /*info = 0*/) \
  { \
    using std::conj; \
\
    char transa, transb; \
    BlasIndex m, n, k, lda, ldb, ldc; \
    const EIGTYPE *a, *b; \
    EIGTYPE beta(1); \
    MatrixX##EIGPREFIX a_tmp, b_tmp; \
\
    /* Set transpose options */ \
    transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
    transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
\
    /* Set m, n, k */ \
    m = convert_index<BlasIndex>(rows); \
    n = convert_index<BlasIndex>(cols); \
    k = convert_index<BlasIndex>(depth); \
\
    /* Set lda, ldb, ldc */ \
    lda = convert_index<BlasIndex>(lhsStride); \
    ldb = convert_index<BlasIndex>(rhsStride); \
    ldc = convert_index<BlasIndex>(resStride); \
\
    /* Set a, b, c */ \
    if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \
      Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \
      a_tmp = lhs.conjugate(); \
      a = a_tmp.data(); \
      lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
    } else a = _lhs; \
\
    if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \
      Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \
      b_tmp = rhs.conjugate(); \
      b = b_tmp.data(); \
      ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
    } else b = _rhs; \
\
    BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
  }};

#ifdef EIGEN_USE_MKL
GEMM_SPECIALIZATION(double, d, double, dgemm)
GEMM_SPECIALIZATION(float, f, float, sgemm)
GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm)
GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, cgemm)
#else
GEMM_SPECIALIZATION(double, d, double, dgemm_)
GEMM_SPECIALIZATION(float, f, float, sgemm_)
GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_)
GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_)
#endif

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -0,0 +1,619 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H

namespace Eigen {

namespace internal {

/* Optimized col-major matrix * vector product:
 * This algorithm processes 4 columns at once, which both reduces
 * the number of loads/stores of the result by a factor of 4 and reduces
 * the instruction dependency. Moreover, we know that all bands have the
 * same alignment pattern.
 *
 * Mixing type logic: C += alpha * A * B
 *  |  A  |  B  |alpha| comments
 *  |real |cplx |cplx | no vectorization
 *  |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
 *  |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
 *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
 *
 * Accesses to the matrix coefficients follow the following logic:
 *
 * - if all columns have the same alignment then
 *   - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
 *   - otherwise perform unaligned loads only (-> NoneAligned case)
 * - otherwise
 *   - if even columns have the same alignment then
 *     // odd columns are guaranteed to have the same alignment too
 *     - if even or odd columns have the same alignment as the result, then
 *       // for a register size of 2 scalars, this is guaranteed to be the case (e.g., SSE with double)
 *       - perform half aligned and half unaligned loads (-> EvenAligned case)
 *     - otherwise perform unaligned loads only (-> NoneAligned case)
 *   - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
 *     - one out of 4 consecutive columns is guaranteed to be aligned with the result vector;
 *       perform simple aligned loads for this column and aligned loads plus re-alignment for the others (-> FirstAligned case)
 *       // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
 *   - otherwise,
 *     // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
 *     // we currently fall back to the NoneAligned case
 *
 * The same reasoning applies to the transposed case.
 *
 * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
 * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads-plus-re-alignment
 * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on an 8-byte boundary are not too slow
 * compared to unaligned loads on a 4-byte boundary.
 *
 */
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
                && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  EIGEN_DONT_INLINE static void run(
    Index rows, Index cols,
    const LhsMapper& lhs,
    const RhsMapper& rhs,
    ResScalar* res, Index resIncr,
    RhsScalar alpha);
};

template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsMapper& lhs,
  const RhsMapper& rhs,
  ResScalar* res, Index resIncr,
  RhsScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(resIncr);
  eigen_internal_assert(resIncr==1);
#ifdef _EIGEN_ACCUMULATE_PACKETS
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
#endif
#define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
  pstore(&res[j], \
    padd(pload<ResPacket>(&res[j]), \
      padd( \
        padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j), ptmp0), \
             pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j), ptmp1)), \
        padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j), ptmp2), \
             pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j), ptmp3)) )))

  typedef typename LhsMapper::VectorMapper LhsScalars;

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  if(ConjugateRhs)
    alpha = numext::conj(alpha);

  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
  const Index columnsAtOnce = 4;
  const Index peels = 2;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index ResPacketAlignedMask = ResPacketSize-1;
  // const Index PeelAlignedMask = ResPacketSize*peels-1;
  const Index size = rows;

  const Index lhsStride = lhs.stride();

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type.
  Index alignedStart = internal::first_default_aligned(res,size);
  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = lhs.firstAligned(size);

  // find how many columns we have to skip to be aligned with the result (if possible)
  Index skipColumns = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
    alignmentPattern = NoneAligned;
  }
  else if(LhsPacketSize > 4)
  {
    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
    // Currently, it seems to be better to perform unaligned loads anyway
    alignmentPattern = NoneAligned;
  }
  else if (LhsPacketSize>1)
  {
    // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);

    while (skipColumns<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
      ++skipColumns;
    if (skipColumns==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any column
      alignmentPattern = NoneAligned;
      skipColumns = 0;
    }
    else
    {
      skipColumns = (std::min)(skipColumns,cols);
      // note that the skipped columns are processed later.
    }

    /* eigen_internal_assert( (alignmentPattern==NoneAligned)
                      || (skipColumns + columnsAtOnce >= cols)
                      || LhsPacketSize > size
                      || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = size;
    alignmentPattern = AllAligned;
  }

  const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
  const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;

  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
  {
    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
              ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
              ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
              ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));

    // this helps a lot generating better binary code
    const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1),
                     lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3);

    if (Vectorizable)
    {
      /* explicit vectorization */
      // process initial unaligned coeffs
      for (Index j=0; j<alignedStart; ++j)
      {
        res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
        res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
        res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
        res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if(peels>1)
            {
              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
              ResPacket T0, T1;

              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);

              for (; j<peeledSize; j+=peels*ResPacketSize)
              {
                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);

                A00 = lhs0.template load<LhsPacket, Aligned>(j);
                A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
                T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
                T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));

                T0  = pcj.pmadd(A01, ptmp1, T0);
                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
                T0  = pcj.pmadd(A02, ptmp2, T0);
                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
                T0  = pcj.pmadd(A03, ptmp3, T0);
                pstore(&res[j],T0);
                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);
                T1  = pcj.pmadd(A11, ptmp1, T1);
                T1  = pcj.pmadd(A12, ptmp2, T1);
                T1  = pcj.pmadd(A13, ptmp3, T1);
                pstore(&res[j+ResPacketSize],T1);
              }
            }
            for (; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
            break;
        }
      }
    } // end explicit vectorization

    /* process remaining coeffs (or all if there is no explicit vectorization) */
    for (Index j=alignedSize; j<size; ++j)
    {
      res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
      res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
      res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
      res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
    }
  }

  // process remaining first and last columns (at most columnsAtOnce-1)
  Index end = cols;
  Index start = columnBound;
  do
  {
    for (Index k=start; k<end; ++k)
    {
      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
      const LhsScalars lhs0 = lhs.getVectorMapper(0, k);

      if (Vectorizable)
      {
        /* explicit vectorization */
        // process first unaligned result's coeffs
        for (Index j=0; j<alignedStart; ++j)
          res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
        // process aligned result's coeffs
        if (lhs0.template aligned<LhsPacket>(alignedStart))
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
        else
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
      }

      // process remaining scalars (or all if no explicit vectorization)
      for (Index i=alignedSize; i<size; ++i)
        res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
    }
    if (skipColumns)
    {
      start = 0;
      end = skipColumns;
      skipColumns = 0;
    }
    else
      break;
  } while(Vectorizable);
#undef _EIGEN_ACCUMULATE_PACKETS
}

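// ---------------------------------------------------------------------------
// Editor's illustrative aside (not part of the vendored file): with the packet
// loads, peeling, and alignment cases stripped away, the column-blocked
// accumulation performed by run() above reduces to the scalar loop below.
// The function name and raw-pointer interface are hypothetical.
#include <cstddef>

void gemv_colmajor_sketch(std::size_t rows, std::size_t cols,
                          const double* A,   // col-major, leading dimension == rows
                          const double* x, double alpha, double* res)
{
  std::size_t i = 0;
  for (; i + 4 <= cols; i += 4)              // process 4 columns per pass
  {
    const double c0 = alpha * x[i],     c1 = alpha * x[i + 1],
                 c2 = alpha * x[i + 2], c3 = alpha * x[i + 3];
    const double *a0 = A + (i    ) * rows, *a1 = A + (i + 1) * rows,
                 *a2 = A + (i + 2) * rows, *a3 = A + (i + 3) * rows;
    for (std::size_t j = 0; j < rows; ++j)   // res[j] is loaded/stored once per 4 columns
      res[j] += a0[j] * c0 + a1[j] * c1 + a2[j] * c2 + a3[j] * c3;
  }
  for (; i < cols; ++i)                      // leftover columns, one at a time
  {
    const double c = alpha * x[i];
    const double* a = A + i * rows;
    for (std::size_t j = 0; j < rows; ++j)
      res[j] += a[j] * c;
  }
}
// ---------------------------------------------------------------------------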
/* Optimized row-major matrix * vector product:
 * This algorithm processes 4 rows at once, which both reduces
 * the number of loads/stores of the result by a factor of 4 and reduces
 * the instruction dependency. Moreover, we know that all bands have the
 * same alignment pattern.
 *
 * Mixing type logic:
 *  - alpha is always a complex (or converted to a complex)
 *  - no vectorization
 */
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
                && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  EIGEN_DONT_INLINE static void run(
    Index rows, Index cols,
    const LhsMapper& lhs,
    const RhsMapper& rhs,
    ResScalar* res, Index resIncr,
    ResScalar alpha);
};

template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsMapper& lhs,
  const RhsMapper& rhs,
  ResScalar* res, Index resIncr,
  ResScalar alpha)
{
  eigen_internal_assert(rhs.stride()==1);

#ifdef _EIGEN_ACCUMULATE_PACKETS
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
#endif

#define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
  RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); \
  ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
  ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
  ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
  ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;

  typedef typename LhsMapper::VectorMapper LhsScalars;

  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
  const Index rowsAtOnce = 4;
  const Index peels = 2;
  const Index RhsPacketAlignedMask = RhsPacketSize-1;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index depth = cols;
  const Index lhsStride = lhs.stride();

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type;
  // if that's not the case then vectorization is discarded, see below.
  Index alignedStart = rhs.firstAligned(depth);
  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = lhs.firstAligned(depth);
  const Index rhsAlignmentOffset = rhs.firstAligned(rows);

  // find how many rows we have to skip to be aligned with rhs (if possible)
  Index skipRows = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) ||
      (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) ||
      (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) )
  {
    alignedSize = 0;
    alignedStart = 0;
    alignmentPattern = NoneAligned;
  }
  else if(LhsPacketSize > 4)
  {
    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
    alignmentPattern = NoneAligned;
  }
  else if (LhsPacketSize>1)
  {
    // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);

    while (skipRows<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
      ++skipRows;
    if (skipRows==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any row
      alignmentPattern = NoneAligned;
      skipRows = 0;
    }
    else
    {
      skipRows = (std::min)(skipRows,Index(rows));
      // note that the skipped rows are processed later.
    }
    /* eigen_internal_assert( alignmentPattern==NoneAligned
                      || LhsPacketSize==1
                      || (skipRows + rowsAtOnce >= rows)
                      || LhsPacketSize > depth
                      || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = depth;
    alignmentPattern = AllAligned;
  }

  const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
  const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;

  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
  {
    // FIXME: what is the purpose of this EIGEN_ALIGN_DEFAULT ??
    EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

    // this helps the compiler generating good binary code
    const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0), lhs1 = lhs.getVectorMapper(i+offset1, 0),
                     lhs2 = lhs.getVectorMapper(i+2, 0), lhs3 = lhs.getVectorMapper(i+offset3, 0);

    if (Vectorizable)
    {
      /* explicit vectorization */
      ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
                ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));

      // process initial unaligned coeffs
      // FIXME this loop gets vectorized by the compiler !
      for (Index j=0; j<alignedStart; ++j)
      {
        RhsScalar b = rhs(j, 0);
        tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
        tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if (peels>1)
            {
              /* Here we process 4 rows with two peeled iterations to hide
               * the overhead of unaligned loads. Moreover, unaligned loads are handled
               * using special shift/move operations between the two aligned packets
               * overlapping the desired unaligned packet. This is *much* more efficient
               * than basic unaligned loads.
               */
              LhsPacket A01, A02, A03, A11, A12, A13;
              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);

              for (; j<peeledSize; j+=peels*RhsPacketSize)
              {
                RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);
                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);

                ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0);
                ptmp1 = pcj.pmadd(A01, b, ptmp1);
                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
                ptmp2 = pcj.pmadd(A02, b, ptmp2);
                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
                ptmp3 = pcj.pmadd(A03, b, ptmp3);
                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);

                b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0);
                ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0);
                ptmp1 = pcj.pmadd(A11, b, ptmp1);
                ptmp2 = pcj.pmadd(A12, b, ptmp2);
                ptmp3 = pcj.pmadd(A13, b, ptmp3);
              }
            }
            for (; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
            break;
        }
        tmp0 += predux(ptmp0);
        tmp1 += predux(ptmp1);
        tmp2 += predux(ptmp2);
        tmp3 += predux(ptmp3);
      }
    } // end explicit vectorization

    // process remaining coeffs (or all if no explicit vectorization)
    // FIXME this loop gets vectorized by the compiler !
    for (Index j=alignedSize; j<depth; ++j)
    {
      RhsScalar b = rhs(j, 0);
      tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
      tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
    }
    res[i*resIncr]           += alpha*tmp0;
    res[(i+offset1)*resIncr] += alpha*tmp1;
    res[(i+2)*resIncr]       += alpha*tmp2;
    res[(i+offset3)*resIncr] += alpha*tmp3;
  }

  // process remaining first and last rows (at most rowsAtOnce-1)
  Index end = rows;
  Index start = rowBound;
  do
  {
    for (Index i=start; i<end; ++i)
    {
      EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
      const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
      // process first unaligned result's coeffs
      // FIXME this loop gets vectorized by the compiler !
      for (Index j=0; j<alignedStart; ++j)
        tmp0 += cj.pmul(lhs0(j), rhs(j, 0));

      if (alignedSize>alignedStart)
      {
        // process aligned rhs coeffs
        if (lhs0.template aligned<LhsPacket>(alignedStart))
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
        else
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
        tmp0 += predux(ptmp0);
      }

      // process remaining scalars
      // FIXME this loop gets vectorized by the compiler !
      for (Index j=alignedSize; j<depth; ++j)
        tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
      res[i*resIncr] += alpha*tmp0;
    }
    if (skipRows)
    {
      start = 0;
      end = skipRows;
      skipRows = 0;
    }
    else
      break;
  } while(Vectorizable);

#undef _EIGEN_ACCUMULATE_PACKETS
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_VECTOR_H
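
The row-major kernel above is, at its core, four simultaneous dot products per pass, kept in independent accumulators and reduced at the end (the predux calls). A scalar sketch of the same scheme, with hypothetical names and without the SIMD and alignment handling:

// Scalar sketch of the row-major "4 rows at once" scheme:
// res[i] += alpha * dot(row i of A, x), four rows per pass.
#include <cstddef>

void gemv_rowmajor_sketch(std::size_t rows, std::size_t cols,
                          const double* A,   // row-major, leading dimension == cols
                          const double* x, double alpha, double* res)
{
  std::size_t i = 0;
  for (; i + 4 <= rows; i += 4)
  {
    double t0 = 0, t1 = 0, t2 = 0, t3 = 0;   // independent accumulators hide latency
    const double *r0 = A + (i    ) * cols, *r1 = A + (i + 1) * cols,
                 *r2 = A + (i + 2) * cols, *r3 = A + (i + 3) * cols;
    for (std::size_t j = 0; j < cols; ++j)   // x[j] is loaded once for all 4 rows
    {
      const double b = x[j];
      t0 += r0[j] * b; t1 += r1[j] * b;
      t2 += r2[j] * b; t3 += r3[j] * b;
    }
    res[i]     += alpha * t0; res[i + 1] += alpha * t1;
    res[i + 2] += alpha * t2; res[i + 3] += alpha * t3;
  }
  for (; i < rows; ++i)                      // leftover rows
  {
    double t = 0;
    const double* r = A + i * cols;
    for (std::size_t j = 0; j < cols; ++j)
      t += r[j] * x[j];
    res[i] += alpha * t;
  }
}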