tomoto 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +123 -0
- data/ext/tomoto/ext.cpp +245 -0
- data/ext/tomoto/extconf.rb +28 -0
- data/lib/tomoto.rb +12 -0
- data/lib/tomoto/ct.rb +11 -0
- data/lib/tomoto/hdp.rb +11 -0
- data/lib/tomoto/lda.rb +67 -0
- data/lib/tomoto/version.rb +3 -0
- data/vendor/EigenRand/EigenRand/Core.h +1139 -0
- data/vendor/EigenRand/EigenRand/Dists/Basic.h +111 -0
- data/vendor/EigenRand/EigenRand/Dists/Discrete.h +877 -0
- data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +108 -0
- data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +626 -0
- data/vendor/EigenRand/EigenRand/EigenRand +19 -0
- data/vendor/EigenRand/EigenRand/Macro.h +24 -0
- data/vendor/EigenRand/EigenRand/MorePacketMath.h +978 -0
- data/vendor/EigenRand/EigenRand/PacketFilter.h +286 -0
- data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +624 -0
- data/vendor/EigenRand/EigenRand/RandUtils.h +413 -0
- data/vendor/EigenRand/EigenRand/doc.h +220 -0
- data/vendor/EigenRand/LICENSE +21 -0
- data/vendor/EigenRand/README.md +288 -0
- data/vendor/eigen/COPYING.BSD +26 -0
- data/vendor/eigen/COPYING.GPL +674 -0
- data/vendor/eigen/COPYING.LGPL +502 -0
- data/vendor/eigen/COPYING.MINPACK +52 -0
- data/vendor/eigen/COPYING.MPL2 +373 -0
- data/vendor/eigen/COPYING.README +18 -0
- data/vendor/eigen/Eigen/CMakeLists.txt +19 -0
- data/vendor/eigen/Eigen/Cholesky +46 -0
- data/vendor/eigen/Eigen/CholmodSupport +48 -0
- data/vendor/eigen/Eigen/Core +537 -0
- data/vendor/eigen/Eigen/Dense +7 -0
- data/vendor/eigen/Eigen/Eigen +2 -0
- data/vendor/eigen/Eigen/Eigenvalues +61 -0
- data/vendor/eigen/Eigen/Geometry +62 -0
- data/vendor/eigen/Eigen/Householder +30 -0
- data/vendor/eigen/Eigen/IterativeLinearSolvers +48 -0
- data/vendor/eigen/Eigen/Jacobi +33 -0
- data/vendor/eigen/Eigen/LU +50 -0
- data/vendor/eigen/Eigen/MetisSupport +35 -0
- data/vendor/eigen/Eigen/OrderingMethods +73 -0
- data/vendor/eigen/Eigen/PaStiXSupport +48 -0
- data/vendor/eigen/Eigen/PardisoSupport +35 -0
- data/vendor/eigen/Eigen/QR +51 -0
- data/vendor/eigen/Eigen/QtAlignedMalloc +40 -0
- data/vendor/eigen/Eigen/SPQRSupport +34 -0
- data/vendor/eigen/Eigen/SVD +51 -0
- data/vendor/eigen/Eigen/Sparse +36 -0
- data/vendor/eigen/Eigen/SparseCholesky +45 -0
- data/vendor/eigen/Eigen/SparseCore +69 -0
- data/vendor/eigen/Eigen/SparseLU +46 -0
- data/vendor/eigen/Eigen/SparseQR +37 -0
- data/vendor/eigen/Eigen/StdDeque +27 -0
- data/vendor/eigen/Eigen/StdList +26 -0
- data/vendor/eigen/Eigen/StdVector +27 -0
- data/vendor/eigen/Eigen/SuperLUSupport +64 -0
- data/vendor/eigen/Eigen/UmfPackSupport +40 -0
- data/vendor/eigen/Eigen/src/Cholesky/LDLT.h +673 -0
- data/vendor/eigen/Eigen/src/Cholesky/LLT.h +542 -0
- data/vendor/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +99 -0
- data/vendor/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +639 -0
- data/vendor/eigen/Eigen/src/Core/Array.h +329 -0
- data/vendor/eigen/Eigen/src/Core/ArrayBase.h +226 -0
- data/vendor/eigen/Eigen/src/Core/ArrayWrapper.h +209 -0
- data/vendor/eigen/Eigen/src/Core/Assign.h +90 -0
- data/vendor/eigen/Eigen/src/Core/AssignEvaluator.h +935 -0
- data/vendor/eigen/Eigen/src/Core/Assign_MKL.h +178 -0
- data/vendor/eigen/Eigen/src/Core/BandMatrix.h +353 -0
- data/vendor/eigen/Eigen/src/Core/Block.h +452 -0
- data/vendor/eigen/Eigen/src/Core/BooleanRedux.h +164 -0
- data/vendor/eigen/Eigen/src/Core/CommaInitializer.h +160 -0
- data/vendor/eigen/Eigen/src/Core/ConditionEstimator.h +175 -0
- data/vendor/eigen/Eigen/src/Core/CoreEvaluators.h +1688 -0
- data/vendor/eigen/Eigen/src/Core/CoreIterators.h +127 -0
- data/vendor/eigen/Eigen/src/Core/CwiseBinaryOp.h +184 -0
- data/vendor/eigen/Eigen/src/Core/CwiseNullaryOp.h +866 -0
- data/vendor/eigen/Eigen/src/Core/CwiseTernaryOp.h +197 -0
- data/vendor/eigen/Eigen/src/Core/CwiseUnaryOp.h +103 -0
- data/vendor/eigen/Eigen/src/Core/CwiseUnaryView.h +128 -0
- data/vendor/eigen/Eigen/src/Core/DenseBase.h +611 -0
- data/vendor/eigen/Eigen/src/Core/DenseCoeffsBase.h +681 -0
- data/vendor/eigen/Eigen/src/Core/DenseStorage.h +570 -0
- data/vendor/eigen/Eigen/src/Core/Diagonal.h +260 -0
- data/vendor/eigen/Eigen/src/Core/DiagonalMatrix.h +343 -0
- data/vendor/eigen/Eigen/src/Core/DiagonalProduct.h +28 -0
- data/vendor/eigen/Eigen/src/Core/Dot.h +318 -0
- data/vendor/eigen/Eigen/src/Core/EigenBase.h +159 -0
- data/vendor/eigen/Eigen/src/Core/ForceAlignedAccess.h +146 -0
- data/vendor/eigen/Eigen/src/Core/Fuzzy.h +155 -0
- data/vendor/eigen/Eigen/src/Core/GeneralProduct.h +455 -0
- data/vendor/eigen/Eigen/src/Core/GenericPacketMath.h +593 -0
- data/vendor/eigen/Eigen/src/Core/GlobalFunctions.h +187 -0
- data/vendor/eigen/Eigen/src/Core/IO.h +225 -0
- data/vendor/eigen/Eigen/src/Core/Inverse.h +118 -0
- data/vendor/eigen/Eigen/src/Core/Map.h +171 -0
- data/vendor/eigen/Eigen/src/Core/MapBase.h +303 -0
- data/vendor/eigen/Eigen/src/Core/MathFunctions.h +1415 -0
- data/vendor/eigen/Eigen/src/Core/MathFunctionsImpl.h +101 -0
- data/vendor/eigen/Eigen/src/Core/Matrix.h +459 -0
- data/vendor/eigen/Eigen/src/Core/MatrixBase.h +529 -0
- data/vendor/eigen/Eigen/src/Core/NestByValue.h +110 -0
- data/vendor/eigen/Eigen/src/Core/NoAlias.h +108 -0
- data/vendor/eigen/Eigen/src/Core/NumTraits.h +248 -0
- data/vendor/eigen/Eigen/src/Core/PermutationMatrix.h +633 -0
- data/vendor/eigen/Eigen/src/Core/PlainObjectBase.h +1035 -0
- data/vendor/eigen/Eigen/src/Core/Product.h +186 -0
- data/vendor/eigen/Eigen/src/Core/ProductEvaluators.h +1112 -0
- data/vendor/eigen/Eigen/src/Core/Random.h +182 -0
- data/vendor/eigen/Eigen/src/Core/Redux.h +505 -0
- data/vendor/eigen/Eigen/src/Core/Ref.h +283 -0
- data/vendor/eigen/Eigen/src/Core/Replicate.h +142 -0
- data/vendor/eigen/Eigen/src/Core/ReturnByValue.h +117 -0
- data/vendor/eigen/Eigen/src/Core/Reverse.h +211 -0
- data/vendor/eigen/Eigen/src/Core/Select.h +162 -0
- data/vendor/eigen/Eigen/src/Core/SelfAdjointView.h +352 -0
- data/vendor/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +47 -0
- data/vendor/eigen/Eigen/src/Core/Solve.h +188 -0
- data/vendor/eigen/Eigen/src/Core/SolveTriangular.h +235 -0
- data/vendor/eigen/Eigen/src/Core/SolverBase.h +130 -0
- data/vendor/eigen/Eigen/src/Core/StableNorm.h +221 -0
- data/vendor/eigen/Eigen/src/Core/Stride.h +111 -0
- data/vendor/eigen/Eigen/src/Core/Swap.h +67 -0
- data/vendor/eigen/Eigen/src/Core/Transpose.h +403 -0
- data/vendor/eigen/Eigen/src/Core/Transpositions.h +407 -0
- data/vendor/eigen/Eigen/src/Core/TriangularMatrix.h +983 -0
- data/vendor/eigen/Eigen/src/Core/VectorBlock.h +96 -0
- data/vendor/eigen/Eigen/src/Core/VectorwiseOp.h +695 -0
- data/vendor/eigen/Eigen/src/Core/Visitor.h +273 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX/Complex.h +451 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +439 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +637 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +51 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +391 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1316 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +430 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +322 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +1061 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/Complex.h +103 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/Half.h +674 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +91 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +333 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +1124 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +212 -0
- data/vendor/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +29 -0
- data/vendor/eigen/Eigen/src/Core/arch/Default/Settings.h +49 -0
- data/vendor/eigen/Eigen/src/Core/arch/NEON/Complex.h +490 -0
- data/vendor/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +91 -0
- data/vendor/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +760 -0
- data/vendor/eigen/Eigen/src/Core/arch/SSE/Complex.h +471 -0
- data/vendor/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +562 -0
- data/vendor/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +895 -0
- data/vendor/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +77 -0
- data/vendor/eigen/Eigen/src/Core/arch/ZVector/Complex.h +397 -0
- data/vendor/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +137 -0
- data/vendor/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +945 -0
- data/vendor/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +168 -0
- data/vendor/eigen/Eigen/src/Core/functors/BinaryFunctors.h +475 -0
- data/vendor/eigen/Eigen/src/Core/functors/NullaryFunctors.h +188 -0
- data/vendor/eigen/Eigen/src/Core/functors/StlFunctors.h +136 -0
- data/vendor/eigen/Eigen/src/Core/functors/TernaryFunctors.h +25 -0
- data/vendor/eigen/Eigen/src/Core/functors/UnaryFunctors.h +792 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2156 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +492 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +311 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +145 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +122 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +619 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +136 -0
- data/vendor/eigen/Eigen/src/Core/products/Parallelizer.h +163 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +521 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +287 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +260 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +118 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointProduct.h +133 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +93 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +466 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +315 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +350 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +255 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +335 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +163 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularSolverVector.h +145 -0
- data/vendor/eigen/Eigen/src/Core/util/BlasUtil.h +398 -0
- data/vendor/eigen/Eigen/src/Core/util/Constants.h +547 -0
- data/vendor/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +83 -0
- data/vendor/eigen/Eigen/src/Core/util/ForwardDeclarations.h +302 -0
- data/vendor/eigen/Eigen/src/Core/util/MKL_support.h +130 -0
- data/vendor/eigen/Eigen/src/Core/util/Macros.h +1001 -0
- data/vendor/eigen/Eigen/src/Core/util/Memory.h +993 -0
- data/vendor/eigen/Eigen/src/Core/util/Meta.h +534 -0
- data/vendor/eigen/Eigen/src/Core/util/NonMPL2.h +3 -0
- data/vendor/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +27 -0
- data/vendor/eigen/Eigen/src/Core/util/StaticAssert.h +218 -0
- data/vendor/eigen/Eigen/src/Core/util/XprHelper.h +821 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +346 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +459 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +91 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/EigenSolver.h +622 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +418 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +226 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +374 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +158 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/RealQZ.h +654 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur.h +546 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +77 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +870 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +87 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +556 -0
- data/vendor/eigen/Eigen/src/Geometry/AlignedBox.h +392 -0
- data/vendor/eigen/Eigen/src/Geometry/AngleAxis.h +247 -0
- data/vendor/eigen/Eigen/src/Geometry/EulerAngles.h +114 -0
- data/vendor/eigen/Eigen/src/Geometry/Homogeneous.h +497 -0
- data/vendor/eigen/Eigen/src/Geometry/Hyperplane.h +282 -0
- data/vendor/eigen/Eigen/src/Geometry/OrthoMethods.h +234 -0
- data/vendor/eigen/Eigen/src/Geometry/ParametrizedLine.h +195 -0
- data/vendor/eigen/Eigen/src/Geometry/Quaternion.h +814 -0
- data/vendor/eigen/Eigen/src/Geometry/Rotation2D.h +199 -0
- data/vendor/eigen/Eigen/src/Geometry/RotationBase.h +206 -0
- data/vendor/eigen/Eigen/src/Geometry/Scaling.h +170 -0
- data/vendor/eigen/Eigen/src/Geometry/Transform.h +1542 -0
- data/vendor/eigen/Eigen/src/Geometry/Translation.h +208 -0
- data/vendor/eigen/Eigen/src/Geometry/Umeyama.h +166 -0
- data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +161 -0
- data/vendor/eigen/Eigen/src/Householder/BlockHouseholder.h +103 -0
- data/vendor/eigen/Eigen/src/Householder/Householder.h +172 -0
- data/vendor/eigen/Eigen/src/Householder/HouseholderSequence.h +470 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +226 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +228 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +246 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +400 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +462 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +394 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +216 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +115 -0
- data/vendor/eigen/Eigen/src/Jacobi/Jacobi.h +462 -0
- data/vendor/eigen/Eigen/src/LU/Determinant.h +101 -0
- data/vendor/eigen/Eigen/src/LU/FullPivLU.h +891 -0
- data/vendor/eigen/Eigen/src/LU/InverseImpl.h +415 -0
- data/vendor/eigen/Eigen/src/LU/PartialPivLU.h +611 -0
- data/vendor/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +83 -0
- data/vendor/eigen/Eigen/src/LU/arch/Inverse_SSE.h +338 -0
- data/vendor/eigen/Eigen/src/MetisSupport/MetisSupport.h +137 -0
- data/vendor/eigen/Eigen/src/OrderingMethods/Amd.h +445 -0
- data/vendor/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +1843 -0
- data/vendor/eigen/Eigen/src/OrderingMethods/Ordering.h +157 -0
- data/vendor/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +678 -0
- data/vendor/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +543 -0
- data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR.h +653 -0
- data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +97 -0
- data/vendor/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +562 -0
- data/vendor/eigen/Eigen/src/QR/FullPivHouseholderQR.h +676 -0
- data/vendor/eigen/Eigen/src/QR/HouseholderQR.h +409 -0
- data/vendor/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +68 -0
- data/vendor/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +313 -0
- data/vendor/eigen/Eigen/src/SVD/BDCSVD.h +1246 -0
- data/vendor/eigen/Eigen/src/SVD/JacobiSVD.h +804 -0
- data/vendor/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +91 -0
- data/vendor/eigen/Eigen/src/SVD/SVDBase.h +315 -0
- data/vendor/eigen/Eigen/src/SVD/UpperBidiagonalization.h +414 -0
- data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +689 -0
- data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +199 -0
- data/vendor/eigen/Eigen/src/SparseCore/AmbiVector.h +377 -0
- data/vendor/eigen/Eigen/src/SparseCore/CompressedStorage.h +258 -0
- data/vendor/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +352 -0
- data/vendor/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +67 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseAssign.h +216 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseBlock.h +603 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseColEtree.h +206 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +341 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +726 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +148 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +320 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +138 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseDot.h +98 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseFuzzy.h +29 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseMap.h +305 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseMatrix.h +1403 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +405 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparsePermutation.h +178 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseProduct.h +169 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseRedux.h +49 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseRef.h +397 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +656 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseSolverBase.h +124 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +198 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseTranspose.h +92 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseTriangularView.h +189 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseUtil.h +178 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseVector.h +478 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseView.h +253 -0
- data/vendor/eigen/Eigen/src/SparseCore/TriangularSolver.h +315 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU.h +773 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLUImpl.h +66 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +226 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +110 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +301 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +80 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +181 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +179 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +107 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +280 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +126 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +130 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +223 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +258 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +137 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +136 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +83 -0
- data/vendor/eigen/Eigen/src/SparseQR/SparseQR.h +745 -0
- data/vendor/eigen/Eigen/src/StlSupport/StdDeque.h +126 -0
- data/vendor/eigen/Eigen/src/StlSupport/StdList.h +106 -0
- data/vendor/eigen/Eigen/src/StlSupport/StdVector.h +131 -0
- data/vendor/eigen/Eigen/src/StlSupport/details.h +84 -0
- data/vendor/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +1027 -0
- data/vendor/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +506 -0
- data/vendor/eigen/Eigen/src/misc/Image.h +82 -0
- data/vendor/eigen/Eigen/src/misc/Kernel.h +79 -0
- data/vendor/eigen/Eigen/src/misc/RealSvd2x2.h +55 -0
- data/vendor/eigen/Eigen/src/misc/blas.h +440 -0
- data/vendor/eigen/Eigen/src/misc/lapack.h +152 -0
- data/vendor/eigen/Eigen/src/misc/lapacke.h +16291 -0
- data/vendor/eigen/Eigen/src/misc/lapacke_mangling.h +17 -0
- data/vendor/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +332 -0
- data/vendor/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +552 -0
- data/vendor/eigen/Eigen/src/plugins/BlockMethods.h +1058 -0
- data/vendor/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +115 -0
- data/vendor/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +163 -0
- data/vendor/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +152 -0
- data/vendor/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +85 -0
- data/vendor/eigen/README.md +3 -0
- data/vendor/eigen/bench/README.txt +55 -0
- data/vendor/eigen/bench/btl/COPYING +340 -0
- data/vendor/eigen/bench/btl/README +154 -0
- data/vendor/eigen/bench/tensors/README +21 -0
- data/vendor/eigen/blas/README.txt +6 -0
- data/vendor/eigen/demos/mandelbrot/README +10 -0
- data/vendor/eigen/demos/mix_eigen_and_c/README +9 -0
- data/vendor/eigen/demos/opengl/README +13 -0
- data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +1760 -0
- data/vendor/eigen/unsupported/README.txt +50 -0
- data/vendor/tomotopy/LICENSE +21 -0
- data/vendor/tomotopy/README.kr.rst +375 -0
- data/vendor/tomotopy/README.rst +382 -0
- data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +362 -0
- data/vendor/tomotopy/src/Labeling/FoRelevance.h +88 -0
- data/vendor/tomotopy/src/Labeling/Labeler.h +50 -0
- data/vendor/tomotopy/src/TopicModel/CT.h +37 -0
- data/vendor/tomotopy/src/TopicModel/CTModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/CTModel.hpp +293 -0
- data/vendor/tomotopy/src/TopicModel/DMR.h +51 -0
- data/vendor/tomotopy/src/TopicModel/DMRModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +374 -0
- data/vendor/tomotopy/src/TopicModel/DT.h +65 -0
- data/vendor/tomotopy/src/TopicModel/DTM.h +22 -0
- data/vendor/tomotopy/src/TopicModel/DTModel.cpp +15 -0
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +572 -0
- data/vendor/tomotopy/src/TopicModel/GDMR.h +37 -0
- data/vendor/tomotopy/src/TopicModel/GDMRModel.cpp +14 -0
- data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +485 -0
- data/vendor/tomotopy/src/TopicModel/HDP.h +74 -0
- data/vendor/tomotopy/src/TopicModel/HDPModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +592 -0
- data/vendor/tomotopy/src/TopicModel/HLDA.h +40 -0
- data/vendor/tomotopy/src/TopicModel/HLDAModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +681 -0
- data/vendor/tomotopy/src/TopicModel/HPA.h +27 -0
- data/vendor/tomotopy/src/TopicModel/HPAModel.cpp +21 -0
- data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +588 -0
- data/vendor/tomotopy/src/TopicModel/LDA.h +144 -0
- data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +442 -0
- data/vendor/tomotopy/src/TopicModel/LDAModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +1058 -0
- data/vendor/tomotopy/src/TopicModel/LLDA.h +45 -0
- data/vendor/tomotopy/src/TopicModel/LLDAModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +203 -0
- data/vendor/tomotopy/src/TopicModel/MGLDA.h +63 -0
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.cpp +17 -0
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +558 -0
- data/vendor/tomotopy/src/TopicModel/PA.h +43 -0
- data/vendor/tomotopy/src/TopicModel/PAModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/PAModel.hpp +467 -0
- data/vendor/tomotopy/src/TopicModel/PLDA.h +17 -0
- data/vendor/tomotopy/src/TopicModel/PLDAModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +214 -0
- data/vendor/tomotopy/src/TopicModel/SLDA.h +54 -0
- data/vendor/tomotopy/src/TopicModel/SLDAModel.cpp +17 -0
- data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +456 -0
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +692 -0
- data/vendor/tomotopy/src/Utils/AliasMethod.hpp +169 -0
- data/vendor/tomotopy/src/Utils/Dictionary.h +80 -0
- data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +181 -0
- data/vendor/tomotopy/src/Utils/LBFGS.h +202 -0
- data/vendor/tomotopy/src/Utils/LBFGS/LineSearchBacktracking.h +120 -0
- data/vendor/tomotopy/src/Utils/LBFGS/LineSearchBracketing.h +122 -0
- data/vendor/tomotopy/src/Utils/LBFGS/Param.h +213 -0
- data/vendor/tomotopy/src/Utils/LUT.hpp +82 -0
- data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +69 -0
- data/vendor/tomotopy/src/Utils/PolyaGamma.hpp +200 -0
- data/vendor/tomotopy/src/Utils/PolyaGammaHybrid.hpp +672 -0
- data/vendor/tomotopy/src/Utils/ThreadPool.hpp +150 -0
- data/vendor/tomotopy/src/Utils/Trie.hpp +220 -0
- data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +94 -0
- data/vendor/tomotopy/src/Utils/Utils.hpp +337 -0
- data/vendor/tomotopy/src/Utils/avx_gamma.h +46 -0
- data/vendor/tomotopy/src/Utils/avx_mathfun.h +736 -0
- data/vendor/tomotopy/src/Utils/exception.h +28 -0
- data/vendor/tomotopy/src/Utils/math.h +281 -0
- data/vendor/tomotopy/src/Utils/rtnorm.hpp +2690 -0
- data/vendor/tomotopy/src/Utils/sample.hpp +192 -0
- data/vendor/tomotopy/src/Utils/serializer.hpp +695 -0
- data/vendor/tomotopy/src/Utils/slp.hpp +131 -0
- data/vendor/tomotopy/src/Utils/sse_gamma.h +48 -0
- data/vendor/tomotopy/src/Utils/sse_mathfun.h +710 -0
- data/vendor/tomotopy/src/Utils/text.hpp +49 -0
- data/vendor/tomotopy/src/Utils/tvector.hpp +543 -0
- metadata +531 -0
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
/*
|
|
2
|
+
Copyright (c) 2011, Intel Corporation. All rights reserved.
|
|
3
|
+
|
|
4
|
+
Redistribution and use in source and binary forms, with or without modification,
|
|
5
|
+
are permitted provided that the following conditions are met:
|
|
6
|
+
|
|
7
|
+
* Redistributions of source code must retain the above copyright notice, this
|
|
8
|
+
list of conditions and the following disclaimer.
|
|
9
|
+
* Redistributions in binary form must reproduce the above copyright notice,
|
|
10
|
+
this list of conditions and the following disclaimer in the documentation
|
|
11
|
+
and/or other materials provided with the distribution.
|
|
12
|
+
* Neither the name of Intel Corporation nor the names of its contributors may
|
|
13
|
+
be used to endorse or promote products derived from this software without
|
|
14
|
+
specific prior written permission.
|
|
15
|
+
|
|
16
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
17
|
+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
18
|
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
19
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
|
20
|
+
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
21
|
+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
22
|
+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
|
23
|
+
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
24
|
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
25
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
26
|
+
|
|
27
|
+
********************************************************************************
|
|
28
|
+
* Content : Eigen bindings to BLAS F77
|
|
29
|
+
* General matrix-matrix product functionality based on ?GEMM.
|
|
30
|
+
********************************************************************************
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
|
|
34
|
+
#define EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
|
|
35
|
+
|
|
36
|
+
namespace Eigen {
|
|
37
|
+
|
|
38
|
+
namespace internal {
|
|
39
|
+
|
|
40
|
+
/**********************************************************************
|
|
41
|
+
* This file implements general matrix-matrix multiplication using BLAS
|
|
42
|
+
* gemm function via partial specialization of
|
|
43
|
+
* general_matrix_matrix_product::run(..) method for float, double,
|
|
44
|
+
* std::complex<float> and std::complex<double> types
|
|
45
|
+
**********************************************************************/
|
|
46
|
+
|
|
47
|
+
// gemm specialization
|
|
48
|
+
|
|
49
|
+
#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASFUNC) \
|
|
50
|
+
template< \
|
|
51
|
+
typename Index, \
|
|
52
|
+
int LhsStorageOrder, bool ConjugateLhs, \
|
|
53
|
+
int RhsStorageOrder, bool ConjugateRhs> \
|
|
54
|
+
struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor> \
|
|
55
|
+
{ \
|
|
56
|
+
typedef gebp_traits<EIGTYPE,EIGTYPE> Traits; \
|
|
57
|
+
\
|
|
58
|
+
static void run(Index rows, Index cols, Index depth, \
|
|
59
|
+
const EIGTYPE* _lhs, Index lhsStride, \
|
|
60
|
+
const EIGTYPE* _rhs, Index rhsStride, \
|
|
61
|
+
EIGTYPE* res, Index resStride, \
|
|
62
|
+
EIGTYPE alpha, \
|
|
63
|
+
level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, \
|
|
64
|
+
GemmParallelInfo<Index>* /*info = 0*/) \
|
|
65
|
+
{ \
|
|
66
|
+
using std::conj; \
|
|
67
|
+
\
|
|
68
|
+
char transa, transb; \
|
|
69
|
+
BlasIndex m, n, k, lda, ldb, ldc; \
|
|
70
|
+
const EIGTYPE *a, *b; \
|
|
71
|
+
EIGTYPE beta(1); \
|
|
72
|
+
MatrixX##EIGPREFIX a_tmp, b_tmp; \
|
|
73
|
+
\
|
|
74
|
+
/* Set transpose options */ \
|
|
75
|
+
transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
|
|
76
|
+
transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
|
|
77
|
+
\
|
|
78
|
+
/* Set m, n, k */ \
|
|
79
|
+
m = convert_index<BlasIndex>(rows); \
|
|
80
|
+
n = convert_index<BlasIndex>(cols); \
|
|
81
|
+
k = convert_index<BlasIndex>(depth); \
|
|
82
|
+
\
|
|
83
|
+
/* Set lda, ldb, ldc */ \
|
|
84
|
+
lda = convert_index<BlasIndex>(lhsStride); \
|
|
85
|
+
ldb = convert_index<BlasIndex>(rhsStride); \
|
|
86
|
+
ldc = convert_index<BlasIndex>(resStride); \
|
|
87
|
+
\
|
|
88
|
+
/* Set a, b, c */ \
|
|
89
|
+
if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \
|
|
90
|
+
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \
|
|
91
|
+
a_tmp = lhs.conjugate(); \
|
|
92
|
+
a = a_tmp.data(); \
|
|
93
|
+
lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
|
|
94
|
+
} else a = _lhs; \
|
|
95
|
+
\
|
|
96
|
+
if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \
|
|
97
|
+
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \
|
|
98
|
+
b_tmp = rhs.conjugate(); \
|
|
99
|
+
b = b_tmp.data(); \
|
|
100
|
+
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
|
|
101
|
+
} else b = _rhs; \
|
|
102
|
+
\
|
|
103
|
+
BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
|
104
|
+
}};
|
|
105
|
+
|
|
106
|
+
#ifdef EIGEN_USE_MKL
|
|
107
|
+
GEMM_SPECIALIZATION(double, d, double, dgemm)
|
|
108
|
+
GEMM_SPECIALIZATION(float, f, float, sgemm)
|
|
109
|
+
GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm)
|
|
110
|
+
GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, cgemm)
|
|
111
|
+
#else
|
|
112
|
+
GEMM_SPECIALIZATION(double, d, double, dgemm_)
|
|
113
|
+
GEMM_SPECIALIZATION(float, f, float, sgemm_)
|
|
114
|
+
GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_)
|
|
115
|
+
GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_)
|
|
116
|
+
#endif
|
|
117
|
+
|
|
118
|
+
} // end namespace internal
|
|
119
|
+
|
|
120
|
+
} // end namespace Eigen
|
|
121
|
+
|
|
122
|
+
#endif // EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
|
|
@@ -0,0 +1,619 @@
|
|
|
1
|
+
// This file is part of Eigen, a lightweight C++ template library
|
|
2
|
+
// for linear algebra.
|
|
3
|
+
//
|
|
4
|
+
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
|
|
5
|
+
//
|
|
6
|
+
// This Source Code Form is subject to the terms of the Mozilla
|
|
7
|
+
// Public License v. 2.0. If a copy of the MPL was not distributed
|
|
8
|
+
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
9
|
+
|
|
10
|
+
#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
|
|
11
|
+
#define EIGEN_GENERAL_MATRIX_VECTOR_H
|
|
12
|
+
|
|
13
|
+
namespace Eigen {
|
|
14
|
+
|
|
15
|
+
namespace internal {
|
|
16
|
+
|
|
17
|
+
/* Optimized col-major matrix * vector product:
|
|
18
|
+
* This algorithm processes 4 columns at once, which makes it possible to both reduce
|
|
19
|
+
* the number of load/stores of the result by a factor 4 and to reduce
|
|
20
|
+
* the instruction dependency. Moreover, we know that all bands have the
|
|
21
|
+
* same alignment pattern.
|
|
22
|
+
*
|
|
23
|
+
* Mixing type logic: C += alpha * A * B
|
|
24
|
+
* | A | B |alpha| comments
|
|
25
|
+
* |real |cplx |cplx | no vectorization
|
|
26
|
+
* |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
|
|
27
|
+
* |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
|
|
28
|
+
* |cplx |real |real | optimal case, vectorization possible via real-cplx mul
|
|
29
|
+
*
|
|
30
|
+
* Accesses to the matrix coefficients follow the following logic:
|
|
31
|
+
*
|
|
32
|
+
* - if all columns have the same alignment then
|
|
33
|
+
* - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
|
|
34
|
+
* - otherwise perform unaligned loads only (-> NoneAligned case)
|
|
35
|
+
* - otherwise
|
|
36
|
+
* - if even columns have the same alignment then
|
|
37
|
+
* // odd columns are guaranteed to have the same alignment too
|
|
38
|
+
* - if even or odd columns have the same alignment as the result, then
|
|
39
|
+
* // for a register size of 2 scalars, this is guaranteed to be the case (e.g., SSE with double)
|
|
40
|
+
* - perform half aligned and half unaligned loads (-> EvenAligned case)
|
|
41
|
+
* - otherwise perform unaligned loads only (-> NoneAligned case)
|
|
42
|
+
* - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
|
|
43
|
+
* - one over 4 consecutive columns is guaranteed to be aligned with the result vector,
|
|
44
|
+
* perform simple aligned loads for this column and aligned loads plus re-alignment for the other. (-> FirstAligned case)
|
|
45
|
+
* // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
|
|
46
|
+
* - otherwise,
|
|
47
|
+
* // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
|
|
48
|
+
* // we currently fall back to the NoneAligned case
|
|
49
|
+
*
|
|
50
|
+
* The same reasoning applies to the transposed case.
|
|
51
|
+
*
|
|
52
|
+
* The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
|
|
53
|
+
* One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
|
|
54
|
+
* strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on an 8 byte boundary are not too slow
|
|
55
|
+
* compared to unaligned loads on a 4 byte boundary.
|
|
56
|
+
*
|
|
57
|
+
*/
|
|
58
|
+
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
|
|
59
|
+
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
|
|
60
|
+
{
|
|
61
|
+
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
|
62
|
+
|
|
63
|
+
enum {
|
|
64
|
+
Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
|
|
65
|
+
&& int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
|
|
66
|
+
LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
|
|
67
|
+
RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
|
|
68
|
+
ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
|
|
69
|
+
};
|
|
70
|
+
|
|
71
|
+
typedef typename packet_traits<LhsScalar>::type _LhsPacket;
|
|
72
|
+
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
|
|
73
|
+
typedef typename packet_traits<ResScalar>::type _ResPacket;
|
|
74
|
+
|
|
75
|
+
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
|
|
76
|
+
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
|
|
77
|
+
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
|
|
78
|
+
|
|
79
|
+
EIGEN_DONT_INLINE static void run(
|
|
80
|
+
Index rows, Index cols,
|
|
81
|
+
const LhsMapper& lhs,
|
|
82
|
+
const RhsMapper& rhs,
|
|
83
|
+
ResScalar* res, Index resIncr,
|
|
84
|
+
RhsScalar alpha);
|
|
85
|
+
};
|
|
86
|
+
|
|
87
|
+
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
|
|
88
|
+
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
|
|
89
|
+
Index rows, Index cols,
|
|
90
|
+
const LhsMapper& lhs,
|
|
91
|
+
const RhsMapper& rhs,
|
|
92
|
+
ResScalar* res, Index resIncr,
|
|
93
|
+
RhsScalar alpha)
|
|
94
|
+
{
|
|
95
|
+
EIGEN_UNUSED_VARIABLE(resIncr);
|
|
96
|
+
eigen_internal_assert(resIncr==1);
|
|
97
|
+
#ifdef _EIGEN_ACCUMULATE_PACKETS
|
|
98
|
+
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
|
|
99
|
+
#endif
|
|
100
|
+
#define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
|
|
101
|
+
pstore(&res[j], \
|
|
102
|
+
padd(pload<ResPacket>(&res[j]), \
|
|
103
|
+
padd( \
|
|
104
|
+
padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j), ptmp0), \
|
|
105
|
+
pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j), ptmp1)), \
|
|
106
|
+
padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j), ptmp2), \
|
|
107
|
+
pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j), ptmp3)) )))
|
|
108
|
+
|
|
109
|
+
typedef typename LhsMapper::VectorMapper LhsScalars;
|
|
110
|
+
|
|
111
|
+
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
|
|
112
|
+
conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
|
|
113
|
+
if(ConjugateRhs)
|
|
114
|
+
alpha = numext::conj(alpha);
|
|
115
|
+
|
|
116
|
+
enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
|
|
117
|
+
const Index columnsAtOnce = 4;
|
|
118
|
+
const Index peels = 2;
|
|
119
|
+
const Index LhsPacketAlignedMask = LhsPacketSize-1;
|
|
120
|
+
const Index ResPacketAlignedMask = ResPacketSize-1;
|
|
121
|
+
// const Index PeelAlignedMask = ResPacketSize*peels-1;
|
|
122
|
+
const Index size = rows;
|
|
123
|
+
|
|
124
|
+
const Index lhsStride = lhs.stride();
|
|
125
|
+
|
|
126
|
+
// How many coeffs of the result do we have to skip to be aligned.
|
|
127
|
+
// Here we assume data are at least aligned on the base scalar type.
|
|
128
|
+
Index alignedStart = internal::first_default_aligned(res,size);
|
|
129
|
+
Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
|
|
130
|
+
const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
|
|
131
|
+
|
|
132
|
+
const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
|
|
133
|
+
Index alignmentPattern = alignmentStep==0 ? AllAligned
|
|
134
|
+
: alignmentStep==(LhsPacketSize/2) ? EvenAligned
|
|
135
|
+
: FirstAligned;
|
|
136
|
+
|
|
137
|
+
// we cannot assume the first element is aligned because of sub-matrices
|
|
138
|
+
const Index lhsAlignmentOffset = lhs.firstAligned(size);
|
|
139
|
+
|
|
140
|
+
// find how many columns do we have to skip to be aligned with the result (if possible)
|
|
141
|
+
Index skipColumns = 0;
|
|
142
|
+
// if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
|
|
143
|
+
if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) )
|
|
144
|
+
{
|
|
145
|
+
alignedSize = 0;
|
|
146
|
+
alignedStart = 0;
|
|
147
|
+
alignmentPattern = NoneAligned;
|
|
148
|
+
}
|
|
149
|
+
else if(LhsPacketSize > 4)
|
|
150
|
+
{
|
|
151
|
+
// TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
|
|
152
|
+
// Currently, it seems to be better to perform unaligned loads anyway
|
|
153
|
+
alignmentPattern = NoneAligned;
|
|
154
|
+
}
|
|
155
|
+
else if (LhsPacketSize>1)
|
|
156
|
+
{
|
|
157
|
+
// eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
|
|
158
|
+
|
|
159
|
+
while (skipColumns<LhsPacketSize &&
|
|
160
|
+
alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
|
|
161
|
+
++skipColumns;
|
|
162
|
+
if (skipColumns==LhsPacketSize)
|
|
163
|
+
{
|
|
164
|
+
// nothing can be aligned, no need to skip any column
|
|
165
|
+
alignmentPattern = NoneAligned;
|
|
166
|
+
skipColumns = 0;
|
|
167
|
+
}
|
|
168
|
+
else
|
|
169
|
+
{
|
|
170
|
+
skipColumns = (std::min)(skipColumns,cols);
|
|
171
|
+
// note that the skipped columns are processed later.
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
/* eigen_internal_assert( (alignmentPattern==NoneAligned)
|
|
175
|
+
|| (skipColumns + columnsAtOnce >= cols)
|
|
176
|
+
|| LhsPacketSize > size
|
|
177
|
+
|| (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/
|
|
178
|
+
}
|
|
179
|
+
else if(Vectorizable)
|
|
180
|
+
{
|
|
181
|
+
alignedStart = 0;
|
|
182
|
+
alignedSize = size;
|
|
183
|
+
alignmentPattern = AllAligned;
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
|
|
187
|
+
const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;
|
|
188
|
+
|
|
189
|
+
Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
|
|
190
|
+
for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
|
|
191
|
+
{
|
|
192
|
+
RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
|
|
193
|
+
ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
|
|
194
|
+
ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
|
|
195
|
+
ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));
|
|
196
|
+
|
|
197
|
+
// this helps a lot generating better binary code
|
|
198
|
+
const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1),
|
|
199
|
+
lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3);
|
|
200
|
+
|
|
201
|
+
if (Vectorizable)
|
|
202
|
+
{
|
|
203
|
+
/* explicit vectorization */
|
|
204
|
+
// process initial unaligned coeffs
|
|
205
|
+
for (Index j=0; j<alignedStart; ++j)
|
|
206
|
+
{
|
|
207
|
+
res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
|
|
208
|
+
res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
|
|
209
|
+
res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
|
|
210
|
+
res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
if (alignedSize>alignedStart)
|
|
214
|
+
{
|
|
215
|
+
switch(alignmentPattern)
|
|
216
|
+
{
|
|
217
|
+
case AllAligned:
|
|
218
|
+
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
|
|
219
|
+
_EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
|
|
220
|
+
break;
|
|
221
|
+
case EvenAligned:
|
|
222
|
+
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
|
|
223
|
+
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
|
|
224
|
+
break;
|
|
225
|
+
case FirstAligned:
|
|
226
|
+
{
|
|
227
|
+
Index j = alignedStart;
|
|
228
|
+
if(peels>1)
|
|
229
|
+
{
|
|
230
|
+
LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
|
|
231
|
+
ResPacket T0, T1;
|
|
232
|
+
|
|
233
|
+
A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
|
|
234
|
+
A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
|
|
235
|
+
A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
|
|
236
|
+
|
|
237
|
+
for (; j<peeledSize; j+=peels*ResPacketSize)
|
|
238
|
+
{
|
|
239
|
+
A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
|
|
240
|
+
A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
|
|
241
|
+
A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
|
|
242
|
+
|
|
243
|
+
A00 = lhs0.template load<LhsPacket, Aligned>(j);
|
|
244
|
+
A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
|
|
245
|
+
T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
|
|
246
|
+
T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
|
|
247
|
+
|
|
248
|
+
T0 = pcj.pmadd(A01, ptmp1, T0);
|
|
249
|
+
A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
|
|
250
|
+
T0 = pcj.pmadd(A02, ptmp2, T0);
|
|
251
|
+
A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
|
|
252
|
+
T0 = pcj.pmadd(A03, ptmp3, T0);
|
|
253
|
+
pstore(&res[j],T0);
|
|
254
|
+
A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
|
|
255
|
+
T1 = pcj.pmadd(A11, ptmp1, T1);
|
|
256
|
+
T1 = pcj.pmadd(A12, ptmp2, T1);
|
|
257
|
+
T1 = pcj.pmadd(A13, ptmp3, T1);
|
|
258
|
+
pstore(&res[j+ResPacketSize],T1);
|
|
259
|
+
}
|
|
260
|
+
}
|
|
261
|
+
for (; j<alignedSize; j+=ResPacketSize)
|
|
262
|
+
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
|
|
263
|
+
break;
|
|
264
|
+
}
|
|
265
|
+
default:
|
|
266
|
+
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
|
|
267
|
+
_EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
|
|
268
|
+
break;
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
} // end explicit vectorization
|
|
272
|
+
|
|
273
|
+
/* process remaining coeffs (or all if there is no explicit vectorization) */
|
|
274
|
+
for (Index j=alignedSize; j<size; ++j)
|
|
275
|
+
{
|
|
276
|
+
res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
|
|
277
|
+
res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
|
|
278
|
+
res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
|
|
279
|
+
res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
|
|
280
|
+
}
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
// process remaining first and last columns (at most columnsAtOnce-1)
|
|
284
|
+
Index end = cols;
|
|
285
|
+
Index start = columnBound;
|
|
286
|
+
do
|
|
287
|
+
{
|
|
288
|
+
for (Index k=start; k<end; ++k)
|
|
289
|
+
{
|
|
290
|
+
RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
|
|
291
|
+
const LhsScalars lhs0 = lhs.getVectorMapper(0, k);
|
|
292
|
+
|
|
293
|
+
if (Vectorizable)
|
|
294
|
+
{
|
|
295
|
+
/* explicit vectorization */
|
|
296
|
+
// process first unaligned result's coeffs
|
|
297
|
+
for (Index j=0; j<alignedStart; ++j)
|
|
298
|
+
res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
|
|
299
|
+
// process aligned result's coeffs
|
|
300
|
+
if (lhs0.template aligned<LhsPacket>(alignedStart))
|
|
301
|
+
for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
|
|
302
|
+
pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
|
|
303
|
+
else
|
|
304
|
+
for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
|
|
305
|
+
pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
// process remaining scalars (or all if no explicit vectorization)
|
|
309
|
+
for (Index i=alignedSize; i<size; ++i)
|
|
310
|
+
res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
|
|
311
|
+
}
|
|
312
|
+
if (skipColumns)
|
|
313
|
+
{
|
|
314
|
+
start = 0;
|
|
315
|
+
end = skipColumns;
|
|
316
|
+
skipColumns = 0;
|
|
317
|
+
}
|
|
318
|
+
else
|
|
319
|
+
break;
|
|
320
|
+
} while(Vectorizable);
|
|
321
|
+
#undef _EIGEN_ACCUMULATE_PACKETS
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
/* Optimized row-major matrix * vector product:
|
|
325
|
+
* This algorithm processes 4 rows at once, which makes it possible to both reduce
|
|
326
|
+
* the number of load/stores of the result by a factor 4 and to reduce
|
|
327
|
+
* the instruction dependency. Moreover, we know that all bands have the
|
|
328
|
+
* same alignment pattern.
|
|
329
|
+
*
|
|
330
|
+
* Mixing type logic:
|
|
331
|
+
* - alpha is always a complex (or converted to a complex)
|
|
332
|
+
* - no vectorization
|
|
333
|
+
*/
|
|
334
|
+
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
|
|
335
|
+
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
|
|
336
|
+
{
|
|
337
|
+
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
|
338
|
+
|
|
339
|
+
enum {
|
|
340
|
+
Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
|
|
341
|
+
&& int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
|
|
342
|
+
LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
|
|
343
|
+
RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
|
|
344
|
+
ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
|
|
345
|
+
};
|
|
346
|
+
|
|
347
|
+
typedef typename packet_traits<LhsScalar>::type _LhsPacket;
|
|
348
|
+
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
|
|
349
|
+
typedef typename packet_traits<ResScalar>::type _ResPacket;
|
|
350
|
+
|
|
351
|
+
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
|
|
352
|
+
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
|
|
353
|
+
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
|
|
354
|
+
|
|
355
|
+
EIGEN_DONT_INLINE static void run(
|
|
356
|
+
Index rows, Index cols,
|
|
357
|
+
const LhsMapper& lhs,
|
|
358
|
+
const RhsMapper& rhs,
|
|
359
|
+
ResScalar* res, Index resIncr,
|
|
360
|
+
ResScalar alpha);
|
|
361
|
+
};
|
|
362
|
+
|
|
363
|
+
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
|
|
364
|
+
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
|
|
365
|
+
Index rows, Index cols,
|
|
366
|
+
const LhsMapper& lhs,
|
|
367
|
+
const RhsMapper& rhs,
|
|
368
|
+
ResScalar* res, Index resIncr,
|
|
369
|
+
ResScalar alpha)
|
|
370
|
+
{
|
|
371
|
+
eigen_internal_assert(rhs.stride()==1);
|
|
372
|
+
|
|
373
|
+
#ifdef _EIGEN_ACCUMULATE_PACKETS
|
|
374
|
+
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
|
|
375
|
+
#endif
|
|
376
|
+
|
|
377
|
+
#define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
|
|
378
|
+
RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); \
|
|
379
|
+
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
|
|
380
|
+
ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
|
|
381
|
+
ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
|
|
382
|
+
ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }
|
|
383
|
+
|
|
384
|
+
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
|
|
385
|
+
conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
|
|
386
|
+
|
|
387
|
+
typedef typename LhsMapper::VectorMapper LhsScalars;
|
|
388
|
+
|
|
389
|
+
enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
|
|
390
|
+
const Index rowsAtOnce = 4;
|
|
391
|
+
const Index peels = 2;
|
|
392
|
+
const Index RhsPacketAlignedMask = RhsPacketSize-1;
|
|
393
|
+
const Index LhsPacketAlignedMask = LhsPacketSize-1;
|
|
394
|
+
const Index depth = cols;
|
|
395
|
+
const Index lhsStride = lhs.stride();
|
|
396
|
+
|
|
397
|
+
// How many coeffs of the result do we have to skip to be aligned.
|
|
398
|
+
// Here we assume data are at least aligned on the base scalar type
|
|
399
|
+
// if that's not the case then vectorization is discarded, see below.
|
|
400
|
+
Index alignedStart = rhs.firstAligned(depth);
|
|
401
|
+
Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
|
|
402
|
+
const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
|
|
403
|
+
|
|
404
|
+
const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
|
|
405
|
+
Index alignmentPattern = alignmentStep==0 ? AllAligned
|
|
406
|
+
: alignmentStep==(LhsPacketSize/2) ? EvenAligned
|
|
407
|
+
: FirstAligned;
|
|
408
|
+
|
|
409
|
+
// we cannot assume the first element is aligned because of sub-matrices
|
|
410
|
+
const Index lhsAlignmentOffset = lhs.firstAligned(depth);
|
|
411
|
+
const Index rhsAlignmentOffset = rhs.firstAligned(rows);
|
|
412
|
+
|
|
413
|
+
// find how many rows do we have to skip to be aligned with rhs (if possible)
|
|
414
|
+
Index skipRows = 0;
|
|
415
|
+
// if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
|
|
416
|
+
if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) ||
|
|
417
|
+
(lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) ||
|
|
418
|
+
(rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) )
|
|
419
|
+
{
|
|
420
|
+
alignedSize = 0;
|
|
421
|
+
alignedStart = 0;
|
|
422
|
+
alignmentPattern = NoneAligned;
|
|
423
|
+
}
|
|
424
|
+
else if(LhsPacketSize > 4)
|
|
425
|
+
{
|
|
426
|
+
// TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
|
|
427
|
+
alignmentPattern = NoneAligned;
|
|
428
|
+
}
|
|
429
|
+
else if (LhsPacketSize>1)
|
|
430
|
+
{
|
|
431
|
+
// eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
|
|
432
|
+
|
|
433
|
+
while (skipRows<LhsPacketSize &&
|
|
434
|
+
alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
|
|
435
|
+
++skipRows;
|
|
436
|
+
if (skipRows==LhsPacketSize)
|
|
437
|
+
{
|
|
438
|
+
// nothing can be aligned, no need to skip any row
|
|
439
|
+
alignmentPattern = NoneAligned;
|
|
440
|
+
skipRows = 0;
|
|
441
|
+
}
|
|
442
|
+
else
|
|
443
|
+
{
|
|
444
|
+
skipRows = (std::min)(skipRows,Index(rows));
|
|
445
|
+
// note that the skipped rows are processed later.
|
|
446
|
+
}
|
|
447
|
+
/* eigen_internal_assert( alignmentPattern==NoneAligned
|
|
448
|
+
|| LhsPacketSize==1
|
|
449
|
+
|| (skipRows + rowsAtOnce >= rows)
|
|
450
|
+
|| LhsPacketSize > depth
|
|
451
|
+
|| (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/
|
|
452
|
+
}
|
|
453
|
+
else if(Vectorizable)
|
|
454
|
+
{
|
|
455
|
+
alignedStart = 0;
|
|
456
|
+
alignedSize = depth;
|
|
457
|
+
alignmentPattern = AllAligned;
|
|
458
|
+
}
|
|
459
|
+
|
|
460
|
+
const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
|
|
461
|
+
const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;
|
|
462
|
+
|
|
463
|
+
Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
|
|
464
|
+
for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
|
|
465
|
+
{
|
|
466
|
+
// FIXME: what is the purpose of this EIGEN_ALIGN_MAX ??
|
|
467
|
+
EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
|
|
468
|
+
ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
|
|
469
|
+
|
|
470
|
+
// this helps the compiler generating good binary code
|
|
471
|
+
const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0), lhs1 = lhs.getVectorMapper(i+offset1, 0),
|
|
472
|
+
lhs2 = lhs.getVectorMapper(i+2, 0), lhs3 = lhs.getVectorMapper(i+offset3, 0);
|
|
473
|
+
|
|
474
|
+
if (Vectorizable)
|
|
475
|
+
{
|
|
476
|
+
/* explicit vectorization */
|
|
477
|
+
ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
|
|
478
|
+
ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));
|
|
479
|
+
|
|
480
|
+
// process initial unaligned coeffs
|
|
481
|
+
// FIXME this loop gets vectorized by the compiler !
|
|
482
|
+
for (Index j=0; j<alignedStart; ++j)
|
|
483
|
+
{
|
|
484
|
+
RhsScalar b = rhs(j, 0);
|
|
485
|
+
tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
|
|
486
|
+
tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
|
|
487
|
+
}
|
|
488
|
+
|
|
489
|
+
if (alignedSize>alignedStart)
|
|
490
|
+
{
|
|
491
|
+
switch(alignmentPattern)
|
|
492
|
+
{
|
|
493
|
+
case AllAligned:
|
|
494
|
+
for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
|
|
495
|
+
_EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
|
|
496
|
+
break;
|
|
497
|
+
case EvenAligned:
|
|
498
|
+
for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
|
|
499
|
+
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
|
|
500
|
+
break;
|
|
501
|
+
case FirstAligned:
|
|
502
|
+
{
|
|
503
|
+
Index j = alignedStart;
|
|
504
|
+
if (peels>1)
|
|
505
|
+
{
|
|
506
|
+
/* Here we process 4 rows with two peeled iterations to hide
|
|
507
|
+
* the overhead of unaligned loads. Moreover unaligned loads are handled
|
|
508
|
+
* using special shift/move operations between the two aligned packets
|
|
509
|
+
* overlapping the desired unaligned packet. This is *much* more efficient
|
|
510
|
+
* than basic unaligned loads.
|
|
511
|
+
*/
|
|
512
|
+
LhsPacket A01, A02, A03, A11, A12, A13;
|
|
513
|
+
A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
|
|
514
|
+
A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
|
|
515
|
+
A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
|
|
516
|
+
|
|
517
|
+
for (; j<peeledSize; j+=peels*RhsPacketSize)
|
|
518
|
+
{
|
|
519
|
+
RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);
|
|
520
|
+
A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
|
|
521
|
+
A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
|
|
522
|
+
A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
|
|
523
|
+
|
|
524
|
+
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0);
|
|
525
|
+
ptmp1 = pcj.pmadd(A01, b, ptmp1);
|
|
526
|
+
A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
|
|
527
|
+
ptmp2 = pcj.pmadd(A02, b, ptmp2);
|
|
528
|
+
A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
|
|
529
|
+
ptmp3 = pcj.pmadd(A03, b, ptmp3);
|
|
530
|
+
A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
|
|
531
|
+
|
|
532
|
+
b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0);
|
|
533
|
+
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0);
|
|
534
|
+
ptmp1 = pcj.pmadd(A11, b, ptmp1);
|
|
535
|
+
ptmp2 = pcj.pmadd(A12, b, ptmp2);
|
|
536
|
+
ptmp3 = pcj.pmadd(A13, b, ptmp3);
|
|
537
|
+
}
|
|
538
|
+
}
|
|
539
|
+
for (; j<alignedSize; j+=RhsPacketSize)
|
|
540
|
+
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
|
|
541
|
+
break;
|
|
542
|
+
}
|
|
543
|
+
default:
|
|
544
|
+
for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
|
|
545
|
+
_EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
|
|
546
|
+
break;
|
|
547
|
+
}
|
|
548
|
+
tmp0 += predux(ptmp0);
|
|
549
|
+
tmp1 += predux(ptmp1);
|
|
550
|
+
tmp2 += predux(ptmp2);
|
|
551
|
+
tmp3 += predux(ptmp3);
|
|
552
|
+
}
|
|
553
|
+
} // end explicit vectorization
|
|
554
|
+
|
|
555
|
+
// process remaining coeffs (or all if no explicit vectorization)
|
|
556
|
+
// FIXME this loop gets vectorized by the compiler !
|
|
557
|
+
for (Index j=alignedSize; j<depth; ++j)
|
|
558
|
+
{
|
|
559
|
+
RhsScalar b = rhs(j, 0);
|
|
560
|
+
tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
|
|
561
|
+
tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
|
|
562
|
+
}
|
|
563
|
+
res[i*resIncr] += alpha*tmp0;
|
|
564
|
+
res[(i+offset1)*resIncr] += alpha*tmp1;
|
|
565
|
+
res[(i+2)*resIncr] += alpha*tmp2;
|
|
566
|
+
res[(i+offset3)*resIncr] += alpha*tmp3;
|
|
567
|
+
}
|
|
568
|
+
|
|
569
|
+
// process remaining first and last rows (at most rowsAtOnce-1)
|
|
570
|
+
Index end = rows;
|
|
571
|
+
Index start = rowBound;
|
|
572
|
+
do
|
|
573
|
+
{
|
|
574
|
+
for (Index i=start; i<end; ++i)
|
|
575
|
+
{
|
|
576
|
+
EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
|
|
577
|
+
ResPacket ptmp0 = pset1<ResPacket>(tmp0);
|
|
578
|
+
const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
|
|
579
|
+
// process first unaligned result's coeffs
|
|
580
|
+
// FIXME this loop gets vectorized by the compiler !
|
|
581
|
+
for (Index j=0; j<alignedStart; ++j)
|
|
582
|
+
tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
|
|
583
|
+
|
|
584
|
+
if (alignedSize>alignedStart)
|
|
585
|
+
{
|
|
586
|
+
// process aligned rhs coeffs
|
|
587
|
+
if (lhs0.template aligned<LhsPacket>(alignedStart))
|
|
588
|
+
for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
|
|
589
|
+
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
|
|
590
|
+
else
|
|
591
|
+
for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
|
|
592
|
+
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
|
|
593
|
+
tmp0 += predux(ptmp0);
|
|
594
|
+
}
|
|
595
|
+
|
|
596
|
+
// process remaining scalars
|
|
597
|
+
// FIXME this loop gets vectorized by the compiler !
|
|
598
|
+
for (Index j=alignedSize; j<depth; ++j)
|
|
599
|
+
tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
|
|
600
|
+
res[i*resIncr] += alpha*tmp0;
|
|
601
|
+
}
|
|
602
|
+
if (skipRows)
|
|
603
|
+
{
|
|
604
|
+
start = 0;
|
|
605
|
+
end = skipRows;
|
|
606
|
+
skipRows = 0;
|
|
607
|
+
}
|
|
608
|
+
else
|
|
609
|
+
break;
|
|
610
|
+
} while(Vectorizable);
|
|
611
|
+
|
|
612
|
+
#undef _EIGEN_ACCUMULATE_PACKETS
|
|
613
|
+
}
|
|
614
|
+
|
|
615
|
+
} // end namespace internal
|
|
616
|
+
|
|
617
|
+
} // end namespace Eigen
|
|
618
|
+
|
|
619
|
+
#endif // EIGEN_GENERAL_MATRIX_VECTOR_H
|