ruby-eigen 0.0.9 → 0.0.10.pre1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE +22 -0
- data/README.md +21 -0
- data/ext/eigen/eigen3/COPYING.BSD +26 -0
- data/ext/eigen/eigen3/COPYING.MPL2 +373 -0
- data/ext/eigen/eigen3/COPYING.README +18 -0
- data/ext/eigen/eigen3/Eigen/Array +11 -0
- data/ext/eigen/eigen3/Eigen/Cholesky +32 -0
- data/ext/eigen/eigen3/Eigen/CholmodSupport +45 -0
- data/ext/eigen/eigen3/Eigen/Core +376 -0
- data/ext/eigen/eigen3/Eigen/Dense +7 -0
- data/ext/eigen/eigen3/Eigen/Eigen +2 -0
- data/ext/eigen/eigen3/Eigen/Eigen2Support +95 -0
- data/ext/eigen/eigen3/Eigen/Eigenvalues +48 -0
- data/ext/eigen/eigen3/Eigen/Geometry +63 -0
- data/ext/eigen/eigen3/Eigen/Householder +23 -0
- data/ext/eigen/eigen3/Eigen/IterativeLinearSolvers +40 -0
- data/ext/eigen/eigen3/Eigen/Jacobi +26 -0
- data/ext/eigen/eigen3/Eigen/LU +41 -0
- data/ext/eigen/eigen3/Eigen/LeastSquares +32 -0
- data/ext/eigen/eigen3/Eigen/MetisSupport +28 -0
- data/ext/eigen/eigen3/Eigen/PaStiXSupport +46 -0
- data/ext/eigen/eigen3/Eigen/PardisoSupport +30 -0
- data/ext/eigen/eigen3/Eigen/QR +45 -0
- data/ext/eigen/eigen3/Eigen/QtAlignedMalloc +34 -0
- data/ext/eigen/eigen3/Eigen/SPQRSupport +29 -0
- data/ext/eigen/eigen3/Eigen/SVD +37 -0
- data/ext/eigen/eigen3/Eigen/Sparse +27 -0
- data/ext/eigen/eigen3/Eigen/SparseCore +64 -0
- data/ext/eigen/eigen3/Eigen/SparseLU +49 -0
- data/ext/eigen/eigen3/Eigen/SparseQR +33 -0
- data/ext/eigen/eigen3/Eigen/StdDeque +27 -0
- data/ext/eigen/eigen3/Eigen/StdList +26 -0
- data/ext/eigen/eigen3/Eigen/StdVector +27 -0
- data/ext/eigen/eigen3/Eigen/SuperLUSupport +59 -0
- data/ext/eigen/eigen3/Eigen/UmfPackSupport +36 -0
- data/ext/eigen/eigen3/Eigen/src/Cholesky/LDLT.h +611 -0
- data/ext/eigen/eigen3/Eigen/src/Cholesky/LLT.h +498 -0
- data/ext/eigen/eigen3/Eigen/src/Cholesky/LLT_MKL.h +102 -0
- data/ext/eigen/eigen3/Eigen/src/CholmodSupport/CholmodSupport.h +607 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Array.h +323 -0
- data/ext/eigen/eigen3/Eigen/src/Core/ArrayBase.h +226 -0
- data/ext/eigen/eigen3/Eigen/src/Core/ArrayWrapper.h +264 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Assign.h +590 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Assign_MKL.h +224 -0
- data/ext/eigen/eigen3/Eigen/src/Core/BandMatrix.h +334 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Block.h +406 -0
- data/ext/eigen/eigen3/Eigen/src/Core/BooleanRedux.h +154 -0
- data/ext/eigen/eigen3/Eigen/src/Core/CommaInitializer.h +154 -0
- data/ext/eigen/eigen3/Eigen/src/Core/CoreIterators.h +61 -0
- data/ext/eigen/eigen3/Eigen/src/Core/CwiseBinaryOp.h +230 -0
- data/ext/eigen/eigen3/Eigen/src/Core/CwiseNullaryOp.h +864 -0
- data/ext/eigen/eigen3/Eigen/src/Core/CwiseUnaryOp.h +126 -0
- data/ext/eigen/eigen3/Eigen/src/Core/CwiseUnaryView.h +139 -0
- data/ext/eigen/eigen3/Eigen/src/Core/DenseBase.h +521 -0
- data/ext/eigen/eigen3/Eigen/src/Core/DenseCoeffsBase.h +754 -0
- data/ext/eigen/eigen3/Eigen/src/Core/DenseStorage.h +434 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Diagonal.h +237 -0
- data/ext/eigen/eigen3/Eigen/src/Core/DiagonalMatrix.h +313 -0
- data/ext/eigen/eigen3/Eigen/src/Core/DiagonalProduct.h +131 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Dot.h +263 -0
- data/ext/eigen/eigen3/Eigen/src/Core/EigenBase.h +131 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Flagged.h +140 -0
- data/ext/eigen/eigen3/Eigen/src/Core/ForceAlignedAccess.h +146 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Functors.h +1026 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Fuzzy.h +150 -0
- data/ext/eigen/eigen3/Eigen/src/Core/GeneralProduct.h +635 -0
- data/ext/eigen/eigen3/Eigen/src/Core/GenericPacketMath.h +350 -0
- data/ext/eigen/eigen3/Eigen/src/Core/GlobalFunctions.h +92 -0
- data/ext/eigen/eigen3/Eigen/src/Core/IO.h +250 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Map.h +192 -0
- data/ext/eigen/eigen3/Eigen/src/Core/MapBase.h +247 -0
- data/ext/eigen/eigen3/Eigen/src/Core/MathFunctions.h +768 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Matrix.h +420 -0
- data/ext/eigen/eigen3/Eigen/src/Core/MatrixBase.h +563 -0
- data/ext/eigen/eigen3/Eigen/src/Core/NestByValue.h +111 -0
- data/ext/eigen/eigen3/Eigen/src/Core/NoAlias.h +134 -0
- data/ext/eigen/eigen3/Eigen/src/Core/NumTraits.h +150 -0
- data/ext/eigen/eigen3/Eigen/src/Core/PermutationMatrix.h +721 -0
- data/ext/eigen/eigen3/Eigen/src/Core/PlainObjectBase.h +822 -0
- data/ext/eigen/eigen3/Eigen/src/Core/ProductBase.h +290 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Random.h +152 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Redux.h +409 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Ref.h +278 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Replicate.h +177 -0
- data/ext/eigen/eigen3/Eigen/src/Core/ReturnByValue.h +99 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Reverse.h +224 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Select.h +162 -0
- data/ext/eigen/eigen3/Eigen/src/Core/SelfAdjointView.h +314 -0
- data/ext/eigen/eigen3/Eigen/src/Core/SelfCwiseBinaryOp.h +191 -0
- data/ext/eigen/eigen3/Eigen/src/Core/SolveTriangular.h +260 -0
- data/ext/eigen/eigen3/Eigen/src/Core/StableNorm.h +203 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Stride.h +108 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Swap.h +126 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Transpose.h +419 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Transpositions.h +436 -0
- data/ext/eigen/eigen3/Eigen/src/Core/TriangularMatrix.h +839 -0
- data/ext/eigen/eigen3/Eigen/src/Core/VectorBlock.h +95 -0
- data/ext/eigen/eigen3/Eigen/src/Core/VectorwiseOp.h +642 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Visitor.h +237 -0
- data/ext/eigen/eigen3/Eigen/src/Core/arch/AltiVec/Complex.h +217 -0
- data/ext/eigen/eigen3/Eigen/src/Core/arch/AltiVec/PacketMath.h +501 -0
- data/ext/eigen/eigen3/Eigen/src/Core/arch/Default/Settings.h +49 -0
- data/ext/eigen/eigen3/Eigen/src/Core/arch/NEON/Complex.h +253 -0
- data/ext/eigen/eigen3/Eigen/src/Core/arch/NEON/PacketMath.h +420 -0
- data/ext/eigen/eigen3/Eigen/src/Core/arch/SSE/Complex.h +442 -0
- data/ext/eigen/eigen3/Eigen/src/Core/arch/SSE/MathFunctions.h +475 -0
- data/ext/eigen/eigen3/Eigen/src/Core/arch/SSE/PacketMath.h +649 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/CoeffBasedProduct.h +476 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1341 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix.h +427 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +278 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h +146 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h +118 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/GeneralMatrixVector.h +566 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/GeneralMatrixVector_MKL.h +131 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/Parallelizer.h +162 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +436 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h +295 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h +281 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h +114 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/SelfadjointProduct.h +123 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h +93 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h +427 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h +309 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/TriangularMatrixVector.h +348 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/TriangularMatrixVector_MKL.h +247 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h +332 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h +155 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/TriangularSolverVector.h +139 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/BlasUtil.h +264 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/Constants.h +451 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/DisableStupidWarnings.h +40 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/ForwardDeclarations.h +302 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/MKL_support.h +158 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/Macros.h +451 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/Memory.h +977 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/Meta.h +243 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/NonMPL2.h +3 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/ReenableStupidWarnings.h +14 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/StaticAssert.h +208 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/XprHelper.h +469 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Block.h +126 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Cwise.h +192 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/CwiseOperators.h +298 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Geometry/AlignedBox.h +159 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Geometry/All.h +115 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Geometry/AngleAxis.h +214 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Geometry/Hyperplane.h +254 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Geometry/ParametrizedLine.h +141 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Geometry/Quaternion.h +495 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Geometry/Rotation2D.h +145 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Geometry/RotationBase.h +123 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Geometry/Scaling.h +167 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Geometry/Transform.h +786 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Geometry/Translation.h +184 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/LU.h +120 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Lazy.h +71 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/LeastSquares.h +169 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Macros.h +20 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/MathFunctions.h +57 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Memory.h +45 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Meta.h +75 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Minor.h +117 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/QR.h +67 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/SVD.h +637 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/TriangularSolver.h +42 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/VectorBlock.h +94 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/ComplexEigenSolver.h +341 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/ComplexSchur.h +456 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/ComplexSchur_MKL.h +94 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/EigenSolver.h +607 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +350 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +227 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/HessenbergDecomposition.h +373 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +160 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/RealQZ.h +624 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/RealSchur.h +525 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/RealSchur_MKL.h +83 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +801 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h +92 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/Tridiagonalization.h +557 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/AlignedBox.h +392 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/AngleAxis.h +233 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/EulerAngles.h +104 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/Homogeneous.h +307 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/Hyperplane.h +280 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/OrthoMethods.h +218 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/ParametrizedLine.h +195 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/Quaternion.h +776 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/Rotation2D.h +160 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/RotationBase.h +206 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/Scaling.h +166 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/Transform.h +1455 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/Translation.h +206 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/Umeyama.h +177 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/arch/Geometry_SSE.h +115 -0
- data/ext/eigen/eigen3/Eigen/src/Householder/BlockHouseholder.h +68 -0
- data/ext/eigen/eigen3/Eigen/src/Householder/Householder.h +171 -0
- data/ext/eigen/eigen3/Eigen/src/Householder/HouseholderSequence.h +441 -0
- data/ext/eigen/eigen3/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +149 -0
- data/ext/eigen/eigen3/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +263 -0
- data/ext/eigen/eigen3/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +256 -0
- data/ext/eigen/eigen3/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +282 -0
- data/ext/eigen/eigen3/Eigen/src/Jacobi/Jacobi.h +433 -0
- data/ext/eigen/eigen3/Eigen/src/LU/Determinant.h +101 -0
- data/ext/eigen/eigen3/Eigen/src/LU/FullPivLU.h +751 -0
- data/ext/eigen/eigen3/Eigen/src/LU/Inverse.h +400 -0
- data/ext/eigen/eigen3/Eigen/src/LU/PartialPivLU.h +509 -0
- data/ext/eigen/eigen3/Eigen/src/LU/PartialPivLU_MKL.h +85 -0
- data/ext/eigen/eigen3/Eigen/src/LU/arch/Inverse_SSE.h +329 -0
- data/ext/eigen/eigen3/Eigen/src/MetisSupport/MetisSupport.h +137 -0
- data/ext/eigen/eigen3/Eigen/src/OrderingMethods/Amd.h +444 -0
- data/ext/eigen/eigen3/Eigen/src/OrderingMethods/Eigen_Colamd.h +1850 -0
- data/ext/eigen/eigen3/Eigen/src/PaStiXSupport/PaStiXSupport.h +721 -0
- data/ext/eigen/eigen3/Eigen/src/PardisoSupport/PardisoSupport.h +592 -0
- data/ext/eigen/eigen3/Eigen/src/QR/ColPivHouseholderQR.h +580 -0
- data/ext/eigen/eigen3/Eigen/src/QR/ColPivHouseholderQR_MKL.h +99 -0
- data/ext/eigen/eigen3/Eigen/src/QR/FullPivHouseholderQR.h +622 -0
- data/ext/eigen/eigen3/Eigen/src/QR/HouseholderQR.h +388 -0
- data/ext/eigen/eigen3/Eigen/src/QR/HouseholderQR_MKL.h +71 -0
- data/ext/eigen/eigen3/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +338 -0
- data/ext/eigen/eigen3/Eigen/src/SVD/JacobiSVD.h +976 -0
- data/ext/eigen/eigen3/Eigen/src/SVD/JacobiSVD_MKL.h +92 -0
- data/ext/eigen/eigen3/Eigen/src/SVD/UpperBidiagonalization.h +148 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCholesky/SimplicialCholesky.h +671 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +199 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/AmbiVector.h +373 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/CompressedStorage.h +233 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +245 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/MappedSparseMatrix.h +181 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseBlock.h +537 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseColEtree.h +206 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +325 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +163 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseDenseProduct.h +311 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseDiagonalProduct.h +196 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseDot.h +101 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseFuzzy.h +26 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseMatrix.h +1262 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseMatrixBase.h +461 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparsePermutation.h +148 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseProduct.h +188 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseRedux.h +45 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseSelfAdjointView.h +507 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +150 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseTranspose.h +63 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseTriangularView.h +179 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseUtil.h +172 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseVector.h +448 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseView.h +99 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/TriangularSolver.h +334 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU.h +806 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLUImpl.h +66 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_Memory.h +227 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_Structs.h +111 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +298 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_Utils.h +80 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_column_bmod.h +180 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_column_dfs.h +177 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +106 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +279 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +127 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +130 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_panel_bmod.h +223 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_panel_dfs.h +258 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_pivotL.h +137 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_pruneL.h +135 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_relax_snode.h +83 -0
- data/ext/eigen/eigen3/Eigen/src/SparseQR/SparseQR.h +714 -0
- data/ext/eigen/eigen3/Eigen/src/StlSupport/StdDeque.h +134 -0
- data/ext/eigen/eigen3/Eigen/src/StlSupport/StdList.h +114 -0
- data/ext/eigen/eigen3/Eigen/src/StlSupport/StdVector.h +126 -0
- data/ext/eigen/eigen3/Eigen/src/StlSupport/details.h +84 -0
- data/ext/eigen/eigen3/Eigen/src/SuperLUSupport/SuperLUSupport.h +1026 -0
- data/ext/eigen/eigen3/Eigen/src/UmfPackSupport/UmfPackSupport.h +474 -0
- data/ext/eigen/eigen3/Eigen/src/misc/Image.h +84 -0
- data/ext/eigen/eigen3/Eigen/src/misc/Kernel.h +81 -0
- data/ext/eigen/eigen3/Eigen/src/misc/Solve.h +76 -0
- data/ext/eigen/eigen3/Eigen/src/misc/SparseSolve.h +128 -0
- data/ext/eigen/eigen3/Eigen/src/misc/blas.h +658 -0
- data/ext/eigen/eigen3/Eigen/src/plugins/ArrayCwiseBinaryOps.h +253 -0
- data/ext/eigen/eigen3/Eigen/src/plugins/ArrayCwiseUnaryOps.h +187 -0
- data/ext/eigen/eigen3/Eigen/src/plugins/BlockMethods.h +935 -0
- data/ext/eigen/eigen3/Eigen/src/plugins/CommonCwiseBinaryOps.h +46 -0
- data/ext/eigen/eigen3/Eigen/src/plugins/CommonCwiseUnaryOps.h +172 -0
- data/ext/eigen/eigen3/Eigen/src/plugins/MatrixCwiseBinaryOps.h +143 -0
- data/ext/eigen/eigen3/Eigen/src/plugins/MatrixCwiseUnaryOps.h +52 -0
- data/ext/eigen/eigen3/signature_of_eigen3_matrix_library +1 -0
- data/ext/eigen/eigen_wrap.cxx +19420 -10396
- data/ext/eigen/extconf.rb +37 -2
- data/lib/eigen.rb +146 -3
- metadata +294 -7
@@ -0,0 +1,146 @@
|
|
1
|
+
/*
|
2
|
+
Copyright (c) 2011, Intel Corporation. All rights reserved.
|
3
|
+
|
4
|
+
Redistribution and use in source and binary forms, with or without modification,
|
5
|
+
are permitted provided that the following conditions are met:
|
6
|
+
|
7
|
+
* Redistributions of source code must retain the above copyright notice, this
|
8
|
+
list of conditions and the following disclaimer.
|
9
|
+
* Redistributions in binary form must reproduce the above copyright notice,
|
10
|
+
this list of conditions and the following disclaimer in the documentation
|
11
|
+
and/or other materials provided with the distribution.
|
12
|
+
* Neither the name of Intel Corporation nor the names of its contributors may
|
13
|
+
be used to endorse or promote products derived from this software without
|
14
|
+
specific prior written permission.
|
15
|
+
|
16
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
17
|
+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
18
|
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
19
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
20
|
+
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
21
|
+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
22
|
+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
23
|
+
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24
|
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
25
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
|
27
|
+
********************************************************************************
|
28
|
+
* Content : Eigen bindings to Intel(R) MKL
|
29
|
+
* Level 3 BLAS SYRK/HERK implementation.
|
30
|
+
********************************************************************************
|
31
|
+
*/
|
32
|
+
|
33
|
+
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
|
34
|
+
#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
|
35
|
+
|
36
|
+
namespace Eigen {
|
37
|
+
|
38
|
+
namespace internal {
|
39
|
+
|
40
|
+
template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo>
|
41
|
+
struct general_matrix_matrix_rankupdate :
|
42
|
+
general_matrix_matrix_triangular_product<
|
43
|
+
Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,UpLo,BuiltIn> {};
|
44
|
+
|
45
|
+
|
46
|
+
// try to go to BLAS specialization
|
47
|
+
#define EIGEN_MKL_RANKUPDATE_SPECIALIZE(Scalar) \
|
48
|
+
template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \
|
49
|
+
int RhsStorageOrder, bool ConjugateRhs, int UpLo> \
|
50
|
+
struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \
|
51
|
+
Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Specialized> { \
|
52
|
+
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
|
53
|
+
const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha) \
|
54
|
+
{ \
|
55
|
+
if (lhs==rhs) { \
|
56
|
+
general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
|
57
|
+
::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \
|
58
|
+
} else { \
|
59
|
+
general_matrix_matrix_triangular_product<Index, \
|
60
|
+
Scalar, LhsStorageOrder, ConjugateLhs, \
|
61
|
+
Scalar, RhsStorageOrder, ConjugateRhs, \
|
62
|
+
ColMajor, UpLo, BuiltIn> \
|
63
|
+
::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \
|
64
|
+
} \
|
65
|
+
} \
|
66
|
+
};
|
67
|
+
|
68
|
+
EIGEN_MKL_RANKUPDATE_SPECIALIZE(double)
|
69
|
+
//EIGEN_MKL_RANKUPDATE_SPECIALIZE(dcomplex)
|
70
|
+
EIGEN_MKL_RANKUPDATE_SPECIALIZE(float)
|
71
|
+
//EIGEN_MKL_RANKUPDATE_SPECIALIZE(scomplex)
|
72
|
+
|
73
|
+
// SYRK for float/double
|
74
|
+
#define EIGEN_MKL_RANKUPDATE_R(EIGTYPE, MKLTYPE, MKLFUNC) \
|
75
|
+
template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \
|
76
|
+
struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
|
77
|
+
enum { \
|
78
|
+
IsLower = (UpLo&Lower) == Lower, \
|
79
|
+
LowUp = IsLower ? Lower : Upper, \
|
80
|
+
conjA = ((AStorageOrder==ColMajor) && ConjugateA) ? 1 : 0 \
|
81
|
+
}; \
|
82
|
+
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
|
83
|
+
const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \
|
84
|
+
{ \
|
85
|
+
/* typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;*/ \
|
86
|
+
\
|
87
|
+
MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \
|
88
|
+
char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'T':'N'; \
|
89
|
+
MKLTYPE alpha_, beta_; \
|
90
|
+
\
|
91
|
+
/* Set alpha_ & beta_ */ \
|
92
|
+
assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
|
93
|
+
assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
|
94
|
+
MKLFUNC(&uplo, &trans, &n, &k, &alpha_, lhs, &lda, &beta_, res, &ldc); \
|
95
|
+
} \
|
96
|
+
};
|
97
|
+
|
98
|
+
// HERK for complex data
|
99
|
+
#define EIGEN_MKL_RANKUPDATE_C(EIGTYPE, MKLTYPE, RTYPE, MKLFUNC) \
|
100
|
+
template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \
|
101
|
+
struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
|
102
|
+
enum { \
|
103
|
+
IsLower = (UpLo&Lower) == Lower, \
|
104
|
+
LowUp = IsLower ? Lower : Upper, \
|
105
|
+
conjA = (((AStorageOrder==ColMajor) && ConjugateA) || ((AStorageOrder==RowMajor) && !ConjugateA)) ? 1 : 0 \
|
106
|
+
}; \
|
107
|
+
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
|
108
|
+
const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \
|
109
|
+
{ \
|
110
|
+
typedef Matrix<EIGTYPE, Dynamic, Dynamic, AStorageOrder> MatrixType; \
|
111
|
+
\
|
112
|
+
MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \
|
113
|
+
char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'C':'N'; \
|
114
|
+
RTYPE alpha_, beta_; \
|
115
|
+
const EIGTYPE* a_ptr; \
|
116
|
+
\
|
117
|
+
/* Set alpha_ & beta_ */ \
|
118
|
+
/* assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); */\
|
119
|
+
/* assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1));*/ \
|
120
|
+
alpha_ = alpha.real(); \
|
121
|
+
beta_ = 1.0; \
|
122
|
+
/* Copy with conjugation in some cases*/ \
|
123
|
+
MatrixType a; \
|
124
|
+
if (conjA) { \
|
125
|
+
Map<const MatrixType, 0, OuterStride<> > mapA(lhs,n,k,OuterStride<>(lhsStride)); \
|
126
|
+
a = mapA.conjugate(); \
|
127
|
+
lda = a.outerStride(); \
|
128
|
+
a_ptr = a.data(); \
|
129
|
+
} else a_ptr=lhs; \
|
130
|
+
MKLFUNC(&uplo, &trans, &n, &k, &alpha_, (MKLTYPE*)a_ptr, &lda, &beta_, (MKLTYPE*)res, &ldc); \
|
131
|
+
} \
|
132
|
+
};
|
133
|
+
|
134
|
+
|
135
|
+
EIGEN_MKL_RANKUPDATE_R(double, double, dsyrk)
|
136
|
+
EIGEN_MKL_RANKUPDATE_R(float, float, ssyrk)
|
137
|
+
|
138
|
+
//EIGEN_MKL_RANKUPDATE_C(dcomplex, MKL_Complex16, double, zherk)
|
139
|
+
//EIGEN_MKL_RANKUPDATE_C(scomplex, MKL_Complex8, double, cherk)
|
140
|
+
|
141
|
+
|
142
|
+
} // end namespace internal
|
143
|
+
|
144
|
+
} // end namespace Eigen
|
145
|
+
|
146
|
+
#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
|
@@ -0,0 +1,118 @@
|
|
1
|
+
/*
|
2
|
+
Copyright (c) 2011, Intel Corporation. All rights reserved.
|
3
|
+
|
4
|
+
Redistribution and use in source and binary forms, with or without modification,
|
5
|
+
are permitted provided that the following conditions are met:
|
6
|
+
|
7
|
+
* Redistributions of source code must retain the above copyright notice, this
|
8
|
+
list of conditions and the following disclaimer.
|
9
|
+
* Redistributions in binary form must reproduce the above copyright notice,
|
10
|
+
this list of conditions and the following disclaimer in the documentation
|
11
|
+
and/or other materials provided with the distribution.
|
12
|
+
* Neither the name of Intel Corporation nor the names of its contributors may
|
13
|
+
be used to endorse or promote products derived from this software without
|
14
|
+
specific prior written permission.
|
15
|
+
|
16
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
17
|
+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
18
|
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
19
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
20
|
+
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
21
|
+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
22
|
+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
23
|
+
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24
|
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
25
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
|
27
|
+
********************************************************************************
|
28
|
+
* Content : Eigen bindings to Intel(R) MKL
|
29
|
+
* General matrix-matrix product functionality based on ?GEMM.
|
30
|
+
********************************************************************************
|
31
|
+
*/
|
32
|
+
|
33
|
+
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
|
34
|
+
#define EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
|
35
|
+
|
36
|
+
namespace Eigen {
|
37
|
+
|
38
|
+
namespace internal {
|
39
|
+
|
40
|
+
/**********************************************************************
|
41
|
+
* This file implements general matrix-matrix multiplication using BLAS
|
42
|
+
* gemm function via partial specialization of
|
43
|
+
* general_matrix_matrix_product::run(..) method for float, double,
|
44
|
+
* std::complex<float> and std::complex<double> types
|
45
|
+
**********************************************************************/
|
46
|
+
|
47
|
+
// gemm specialization
|
48
|
+
|
49
|
+
#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, MKLTYPE, MKLPREFIX) \
|
50
|
+
template< \
|
51
|
+
typename Index, \
|
52
|
+
int LhsStorageOrder, bool ConjugateLhs, \
|
53
|
+
int RhsStorageOrder, bool ConjugateRhs> \
|
54
|
+
struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor> \
|
55
|
+
{ \
|
56
|
+
static void run(Index rows, Index cols, Index depth, \
|
57
|
+
const EIGTYPE* _lhs, Index lhsStride, \
|
58
|
+
const EIGTYPE* _rhs, Index rhsStride, \
|
59
|
+
EIGTYPE* res, Index resStride, \
|
60
|
+
EIGTYPE alpha, \
|
61
|
+
level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, \
|
62
|
+
GemmParallelInfo<Index>* /*info = 0*/) \
|
63
|
+
{ \
|
64
|
+
using std::conj; \
|
65
|
+
\
|
66
|
+
char transa, transb; \
|
67
|
+
MKL_INT m, n, k, lda, ldb, ldc; \
|
68
|
+
const EIGTYPE *a, *b; \
|
69
|
+
MKLTYPE alpha_, beta_; \
|
70
|
+
MatrixX##EIGPREFIX a_tmp, b_tmp; \
|
71
|
+
EIGTYPE myone(1);\
|
72
|
+
\
|
73
|
+
/* Set transpose options */ \
|
74
|
+
transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
|
75
|
+
transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
|
76
|
+
\
|
77
|
+
/* Set m, n, k */ \
|
78
|
+
m = (MKL_INT)rows; \
|
79
|
+
n = (MKL_INT)cols; \
|
80
|
+
k = (MKL_INT)depth; \
|
81
|
+
\
|
82
|
+
/* Set alpha_ & beta_ */ \
|
83
|
+
assign_scalar_eig2mkl(alpha_, alpha); \
|
84
|
+
assign_scalar_eig2mkl(beta_, myone); \
|
85
|
+
\
|
86
|
+
/* Set lda, ldb, ldc */ \
|
87
|
+
lda = (MKL_INT)lhsStride; \
|
88
|
+
ldb = (MKL_INT)rhsStride; \
|
89
|
+
ldc = (MKL_INT)resStride; \
|
90
|
+
\
|
91
|
+
/* Set a, b, c */ \
|
92
|
+
if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \
|
93
|
+
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \
|
94
|
+
a_tmp = lhs.conjugate(); \
|
95
|
+
a = a_tmp.data(); \
|
96
|
+
lda = a_tmp.outerStride(); \
|
97
|
+
} else a = _lhs; \
|
98
|
+
\
|
99
|
+
if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \
|
100
|
+
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \
|
101
|
+
b_tmp = rhs.conjugate(); \
|
102
|
+
b = b_tmp.data(); \
|
103
|
+
ldb = b_tmp.outerStride(); \
|
104
|
+
} else b = _rhs; \
|
105
|
+
\
|
106
|
+
MKLPREFIX##gemm(&transa, &transb, &m, &n, &k, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
|
107
|
+
}};
|
108
|
+
|
109
|
+
GEMM_SPECIALIZATION(double, d, double, d)
|
110
|
+
GEMM_SPECIALIZATION(float, f, float, s)
|
111
|
+
GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, z)
|
112
|
+
GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, c)
|
113
|
+
|
114
|
+
} // end namespase internal
|
115
|
+
|
116
|
+
} // end namespace Eigen
|
117
|
+
|
118
|
+
#endif // EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
|
@@ -0,0 +1,566 @@
|
|
1
|
+
// This file is part of Eigen, a lightweight C++ template library
|
2
|
+
// for linear algebra.
|
3
|
+
//
|
4
|
+
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
|
5
|
+
//
|
6
|
+
// This Source Code Form is subject to the terms of the Mozilla
|
7
|
+
// Public License v. 2.0. If a copy of the MPL was not distributed
|
8
|
+
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
9
|
+
|
10
|
+
#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
|
11
|
+
#define EIGEN_GENERAL_MATRIX_VECTOR_H
|
12
|
+
|
13
|
+
namespace Eigen {
|
14
|
+
|
15
|
+
namespace internal {
|
16
|
+
|
17
|
+
/* Optimized col-major matrix * vector product:
|
18
|
+
* This algorithm processes 4 columns at once, which allows us both to reduce
|
19
|
+
* the number of load/stores of the result by a factor 4 and to reduce
|
20
|
+
* the instruction dependency. Moreover, we know that all bands have the
|
21
|
+
* same alignment pattern.
|
22
|
+
*
|
23
|
+
* Mixing type logic: C += alpha * A * B
|
24
|
+
* | A | B |alpha| comments
|
25
|
+
* |real |cplx |cplx | no vectorization
|
26
|
+
* |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
|
27
|
+
* |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
|
28
|
+
* |cplx |real |real | optimal case, vectorization possible via real-cplx mul
|
29
|
+
*/
|
30
|
+
/* Col-major matrix * vector kernel: declaration of the traits and entry point.
 * The mixing-type rules for this specialization are described in the comment
 * block just above. */
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
  // Scalar type of LhsScalar * RhsScalar (handles mixed real/complex products).
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

enum {
  // Explicit vectorization requires both scalar types to be packetizable and
  // their SIMD packets to hold the same number of elements.
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
};

typedef typename packet_traits<LhsScalar>::type _LhsPacket;
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
typedef typename packet_traits<ResScalar>::type _ResPacket;

// When vectorization is disabled, the "packet" types collapse to plain
// scalars so the same kernel source compiles for both paths.
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

// Accumulates res += alpha * lhs * rhs for a col-major lhs.
// The implementation requires resIncr==1 (contiguous result); see the
// definition below.
EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr, RhsScalar alpha);
};
|
57
|
+
|
58
|
+
/* Col-major kernel: res += alpha * lhs * rhs.
 * Processes four columns per iteration; the result vector must be
 * contiguous (resIncr==1, asserted below). The bulk of the code below
 * selects between aligned/unaligned packet loads depending on how the
 * column pointers line up with the packet size. */
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr, RhsScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(resIncr)
  eigen_internal_assert(resIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif
  // Accumulates one result packet:
  //   res[j..] += lhs0*ptmp0 + lhs1*ptmp1 + lhs2*ptmp2 + lhs3*ptmp3
  // A0/A13/A2 select the load flavor per column via token pasting:
  // 'd' -> pload (aligned), 'du' -> ploadu (unaligned). A13 is shared by
  // columns 1 and 3 because they have the same misalignment.
  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
    pstore(&res[j], \
      padd(pload<ResPacket>(&res[j]), \
        padd( \
          padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \
               pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
          padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \
               pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  // Fold the rhs conjugation into alpha so the inner kernel only has to
  // conjugate the lhs side (via cj/pcj).
  if(ConjugateRhs)
    alpha = numext::conj(alpha);

  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
  const Index columnsAtOnce = 4;
  const Index peels = 2;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index ResPacketAlignedMask = ResPacketSize-1;
//  const Index PeelAlignedMask = ResPacketSize*peels-1;
  const Index size = rows;

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type.
  Index alignedStart = internal::first_aligned(res,size);
  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  // Misalignment (in scalars) introduced by advancing one column.
  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);

  // find how many columns do we have to skip to be aligned with the result (if possible)
  Index skipColumns = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);

    while (skipColumns<LhsPacketSize &&
          alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
      ++skipColumns;
    if (skipColumns==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any column
      alignmentPattern = NoneAligned;
      skipColumns = 0;
    }
    else
    {
      skipColumns = (std::min)(skipColumns,cols);
      // note that the skipped columns are processed later.
    }

    eigen_internal_assert(  (alignmentPattern==NoneAligned)
                          || (skipColumns + columnsAtOnce >= cols)
                          || LhsPacketSize > size
                          || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    // Packet size 1: everything is trivially aligned.
    alignedStart = 0;
    alignedSize = size;
    alignmentPattern = AllAligned;
  }

  // NOTE(review): 'FirstAligned' here is the enum constant (always non-zero),
  // so both conditions reduce to alignmentStep==1; presumably
  // 'alignmentPattern==FirstAligned' was intended -- verify against upstream Eigen.
  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
  {
    // Broadcast alpha*rhs[k] once per handled column.
    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
              ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
              ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
              ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);

    // this helps a lot generating better binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      // process initial unaligned coeffs
      for (Index j=0; j<alignedStart; ++j)
      {
        res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
        res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
        res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
        res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if(peels>1)  // compile-time constant (peels==2): branch resolved at compile time
            {
              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
              ResPacket T0, T1;

              // Prime one aligned packet per misaligned column; unaligned data
              // is then recovered with palign<k> from two aligned loads.
              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (; j<peeledSize; j+=peels*ResPacketSize)
              {
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);

                A00 = pload<LhsPacket>(&lhs0[j]);
                A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
                T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
                T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));

                // Loads are interleaved with the multiply-adds to hide latency.
                T0 = pcj.pmadd(A01, ptmp1, T0);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
                T0 = pcj.pmadd(A02, ptmp2, T0);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
                T0 = pcj.pmadd(A03, ptmp3, T0);
                pstore(&res[j],T0);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
                T1 = pcj.pmadd(A11, ptmp1, T1);
                T1 = pcj.pmadd(A12, ptmp2, T1);
                T1 = pcj.pmadd(A13, ptmp3, T1);
                pstore(&res[j+ResPacketSize],T1);
              }
            }
            for (; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
      }
    } // end explicit vectorization

    /* process remaining coeffs (or all if there is no explicit vectorization) */
    for (Index j=alignedSize; j<size; ++j)
    {
      res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
      res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
      res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
      res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
    }
  }

  // process remaining first and last columns (at most columnsAtOnce-1)
  Index end = cols;
  Index start = columnBound;
  do
  {
    for (Index k=start; k<end; ++k)
    {
      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
      const LhsScalar* lhs0 = lhs + k*lhsStride;

      if (Vectorizable)
      {
        /* explicit vectorization */
        // process first unaligned result's coeffs
        for (Index j=0; j<alignedStart; ++j)
          res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
        // process aligned result's coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
        else
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
      }

      // process remaining scalars (or all if no explicit vectorization)
      for (Index i=alignedSize; i<size; ++i)
        res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
    }
    // Second pass: the columns skipped at the start for alignment purposes.
    if (skipColumns)
    {
      start = 0;
      end = skipColumns;
      skipColumns = 0;
    }
    else
      break;
  } while(Vectorizable);
  #undef _EIGEN_ACCUMULATE_PACKETS
}
|
282
|
+
|
283
|
+
/* Optimized row-major matrix * vector product:
|
284
|
+
* This algorithm processes 4 rows at once, which allows us both to reduce
|
285
|
+
* the number of load/stores of the result by a factor 4 and to reduce
|
286
|
+
* the instruction dependency. Moreover, we know that all bands have the
|
287
|
+
* same alignment pattern.
|
288
|
+
*
|
289
|
+
* Mixing type logic:
|
290
|
+
* - alpha is always a complex (or converted to a complex)
|
291
|
+
* - no vectorization
|
292
|
+
*/
|
293
|
+
/* Row-major matrix * vector kernel: declaration of the traits and entry point.
 * See the comment block just above for the mixing-type rules of this
 * specialization. */
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
  // Scalar type of LhsScalar * RhsScalar (handles mixed real/complex products).
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

enum {
  // Explicit vectorization requires both scalar types to be packetizable and
  // their SIMD packets to hold the same number of elements.
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
};

typedef typename packet_traits<LhsScalar>::type _LhsPacket;
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
typedef typename packet_traits<ResScalar>::type _ResPacket;

// When vectorization is disabled, the "packet" types collapse to plain
// scalars so the same kernel source compiles for both paths.
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

// Accumulates res += alpha * lhs * rhs for a row-major lhs.
// Note: unlike the col-major kernel, alpha has the result type here (per the
// comment above, it is always complex or converted to one), and it is the
// rhs that must be contiguous (rhsIncr==1, asserted in the definition).
EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr,
  ResScalar alpha);
};
|
321
|
+
|
322
|
+
/* Row-major kernel: res += alpha * lhs * rhs.
 * Processes four rows per iteration as dot products against the (contiguous)
 * rhs vector; rhsIncr==1 is asserted below. Partial sums are kept in packet
 * accumulators and horizontally reduced with predux at the end of each row
 * group. */
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr,
  ResScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(rhsIncr);
  eigen_internal_assert(rhsIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif

  // Accumulates one rhs packet into the four row accumulators.
  // A0/A13/A2 select the load flavor per row via token pasting:
  // 'd' -> pload (aligned), 'du' -> ploadu (unaligned). A13 is shared by
  // rows 1 and 3 because they have the same misalignment.
  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
    RhsPacket b = pload<RhsPacket>(&rhs[j]); \
    ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
    ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
    ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
    ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;

  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
  const Index rowsAtOnce = 4;
  const Index peels = 2;
  const Index RhsPacketAlignedMask = RhsPacketSize-1;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
//  const Index PeelAlignedMask = RhsPacketSize*peels-1;
  const Index depth = cols;

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type
  // if that's not the case then vectorization is discarded, see below.
  Index alignedStart = internal::first_aligned(rhs, depth);
  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  // Misalignment (in scalars) introduced by advancing one row.
  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);

  // find how many rows do we have to skip to be aligned with rhs (if possible)
  Index skipRows = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);

    while (skipRows<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
      ++skipRows;
    if (skipRows==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any row
      alignmentPattern = NoneAligned;
      skipRows = 0;
    }
    else
    {
      skipRows = (std::min)(skipRows,Index(rows));
      // note that the skipped rows are processed later.
    }
    eigen_internal_assert(  alignmentPattern==NoneAligned
                          || LhsPacketSize==1
                          || (skipRows + rowsAtOnce >= rows)
                          || LhsPacketSize > depth
                          || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    // Packet size 1: everything is trivially aligned.
    alignedStart = 0;
    alignedSize = depth;
    alignmentPattern = AllAligned;
  }

  // NOTE(review): 'FirstAligned' here is the enum constant (always non-zero),
  // so both conditions reduce to alignmentStep==1; presumably
  // 'alignmentPattern==FirstAligned' was intended -- verify against upstream Eigen.
  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
  {
    // Scalar accumulators for the four rows of this iteration.
    EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

    // this helps the compiler generating good binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
                ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));

      // process initial unaligned coeffs
      // FIXME this loop get vectorized by the compiler !
      for (Index j=0; j<alignedStart; ++j)
      {
        RhsScalar b = rhs[j];
        tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
        tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if (peels>1)  // compile-time constant (peels==2): branch resolved at compile time
            {
              /* Here we process 4 rows with two peeled iterations to hide
               * the overhead of unaligned loads. Moreover unaligned loads are handled
               * using special shift/move operations between the two aligned packets
               * overlapping the desired unaligned packet. This is *much* more efficient
               * than basic unaligned loads.
               */
              LhsPacket A01, A02, A03, A11, A12, A13;
              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (; j<peeledSize; j+=peels*RhsPacketSize)
              {
                RhsPacket b = pload<RhsPacket>(&rhs[j]);
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);

                // Loads are interleaved with the multiply-adds to hide latency.
                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
                ptmp1 = pcj.pmadd(A01, b, ptmp1);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
                ptmp2 = pcj.pmadd(A02, b, ptmp2);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
                ptmp3 = pcj.pmadd(A03, b, ptmp3);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);

                b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
                ptmp1 = pcj.pmadd(A11, b, ptmp1);
                ptmp2 = pcj.pmadd(A12, b, ptmp2);
                ptmp3 = pcj.pmadd(A13, b, ptmp3);
              }
            }
            for (; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
        // Horizontal reduction of the packet accumulators into the scalar sums.
        tmp0 += predux(ptmp0);
        tmp1 += predux(ptmp1);
        tmp2 += predux(ptmp2);
        tmp3 += predux(ptmp3);
      }
    } // end explicit vectorization

    // process remaining coeffs (or all if no explicit vectorization)
    // FIXME this loop get vectorized by the compiler !
    for (Index j=alignedSize; j<depth; ++j)
    {
      RhsScalar b = rhs[j];
      tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
      tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
    }
    res[i*resIncr] += alpha*tmp0;
    res[(i+offset1)*resIncr] += alpha*tmp1;
    res[(i+2)*resIncr] += alpha*tmp2;
    res[(i+offset3)*resIncr] += alpha*tmp3;
  }

  // process remaining first and last rows (at most rowsAtOnce-1)
  Index end = rows;
  Index start = rowBound;
  do
  {
    for (Index i=start; i<end; ++i)
    {
      EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
      const LhsScalar* lhs0 = lhs + i*lhsStride;
      // process first unaligned result's coeffs
      // FIXME this loop get vectorized by the compiler !
      for (Index j=0; j<alignedStart; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);

      if (alignedSize>alignedStart)
      {
        // process aligned rhs coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        else
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        tmp0 += predux(ptmp0);
      }

      // process remaining scalars
      // FIXME this loop get vectorized by the compiler !
      for (Index j=alignedSize; j<depth; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);
      res[i*resIncr] += alpha*tmp0;
    }
    // Second pass: the rows skipped at the start for alignment purposes.
    if (skipRows)
    {
      start = 0;
      end = skipRows;
      skipRows = 0;
    }
    else
      break;
  } while(Vectorizable);

  #undef _EIGEN_ACCUMULATE_PACKETS
}
|
561
|
+
|
562
|
+
} // end namespace internal
|
563
|
+
|
564
|
+
} // end namespace Eigen
|
565
|
+
|
566
|
+
#endif // EIGEN_GENERAL_MATRIX_VECTOR_H
|