ruby-eigen 0.0.9 → 0.0.10.pre1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +22 -0
- data/README.md +21 -0
- data/ext/eigen/eigen3/COPYING.BSD +26 -0
- data/ext/eigen/eigen3/COPYING.MPL2 +373 -0
- data/ext/eigen/eigen3/COPYING.README +18 -0
- data/ext/eigen/eigen3/Eigen/Array +11 -0
- data/ext/eigen/eigen3/Eigen/Cholesky +32 -0
- data/ext/eigen/eigen3/Eigen/CholmodSupport +45 -0
- data/ext/eigen/eigen3/Eigen/Core +376 -0
- data/ext/eigen/eigen3/Eigen/Dense +7 -0
- data/ext/eigen/eigen3/Eigen/Eigen +2 -0
- data/ext/eigen/eigen3/Eigen/Eigen2Support +95 -0
- data/ext/eigen/eigen3/Eigen/Eigenvalues +48 -0
- data/ext/eigen/eigen3/Eigen/Geometry +63 -0
- data/ext/eigen/eigen3/Eigen/Householder +23 -0
- data/ext/eigen/eigen3/Eigen/IterativeLinearSolvers +40 -0
- data/ext/eigen/eigen3/Eigen/Jacobi +26 -0
- data/ext/eigen/eigen3/Eigen/LU +41 -0
- data/ext/eigen/eigen3/Eigen/LeastSquares +32 -0
- data/ext/eigen/eigen3/Eigen/MetisSupport +28 -0
- data/ext/eigen/eigen3/Eigen/PaStiXSupport +46 -0
- data/ext/eigen/eigen3/Eigen/PardisoSupport +30 -0
- data/ext/eigen/eigen3/Eigen/QR +45 -0
- data/ext/eigen/eigen3/Eigen/QtAlignedMalloc +34 -0
- data/ext/eigen/eigen3/Eigen/SPQRSupport +29 -0
- data/ext/eigen/eigen3/Eigen/SVD +37 -0
- data/ext/eigen/eigen3/Eigen/Sparse +27 -0
- data/ext/eigen/eigen3/Eigen/SparseCore +64 -0
- data/ext/eigen/eigen3/Eigen/SparseLU +49 -0
- data/ext/eigen/eigen3/Eigen/SparseQR +33 -0
- data/ext/eigen/eigen3/Eigen/StdDeque +27 -0
- data/ext/eigen/eigen3/Eigen/StdList +26 -0
- data/ext/eigen/eigen3/Eigen/StdVector +27 -0
- data/ext/eigen/eigen3/Eigen/SuperLUSupport +59 -0
- data/ext/eigen/eigen3/Eigen/UmfPackSupport +36 -0
- data/ext/eigen/eigen3/Eigen/src/Cholesky/LDLT.h +611 -0
- data/ext/eigen/eigen3/Eigen/src/Cholesky/LLT.h +498 -0
- data/ext/eigen/eigen3/Eigen/src/Cholesky/LLT_MKL.h +102 -0
- data/ext/eigen/eigen3/Eigen/src/CholmodSupport/CholmodSupport.h +607 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Array.h +323 -0
- data/ext/eigen/eigen3/Eigen/src/Core/ArrayBase.h +226 -0
- data/ext/eigen/eigen3/Eigen/src/Core/ArrayWrapper.h +264 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Assign.h +590 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Assign_MKL.h +224 -0
- data/ext/eigen/eigen3/Eigen/src/Core/BandMatrix.h +334 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Block.h +406 -0
- data/ext/eigen/eigen3/Eigen/src/Core/BooleanRedux.h +154 -0
- data/ext/eigen/eigen3/Eigen/src/Core/CommaInitializer.h +154 -0
- data/ext/eigen/eigen3/Eigen/src/Core/CoreIterators.h +61 -0
- data/ext/eigen/eigen3/Eigen/src/Core/CwiseBinaryOp.h +230 -0
- data/ext/eigen/eigen3/Eigen/src/Core/CwiseNullaryOp.h +864 -0
- data/ext/eigen/eigen3/Eigen/src/Core/CwiseUnaryOp.h +126 -0
- data/ext/eigen/eigen3/Eigen/src/Core/CwiseUnaryView.h +139 -0
- data/ext/eigen/eigen3/Eigen/src/Core/DenseBase.h +521 -0
- data/ext/eigen/eigen3/Eigen/src/Core/DenseCoeffsBase.h +754 -0
- data/ext/eigen/eigen3/Eigen/src/Core/DenseStorage.h +434 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Diagonal.h +237 -0
- data/ext/eigen/eigen3/Eigen/src/Core/DiagonalMatrix.h +313 -0
- data/ext/eigen/eigen3/Eigen/src/Core/DiagonalProduct.h +131 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Dot.h +263 -0
- data/ext/eigen/eigen3/Eigen/src/Core/EigenBase.h +131 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Flagged.h +140 -0
- data/ext/eigen/eigen3/Eigen/src/Core/ForceAlignedAccess.h +146 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Functors.h +1026 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Fuzzy.h +150 -0
- data/ext/eigen/eigen3/Eigen/src/Core/GeneralProduct.h +635 -0
- data/ext/eigen/eigen3/Eigen/src/Core/GenericPacketMath.h +350 -0
- data/ext/eigen/eigen3/Eigen/src/Core/GlobalFunctions.h +92 -0
- data/ext/eigen/eigen3/Eigen/src/Core/IO.h +250 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Map.h +192 -0
- data/ext/eigen/eigen3/Eigen/src/Core/MapBase.h +247 -0
- data/ext/eigen/eigen3/Eigen/src/Core/MathFunctions.h +768 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Matrix.h +420 -0
- data/ext/eigen/eigen3/Eigen/src/Core/MatrixBase.h +563 -0
- data/ext/eigen/eigen3/Eigen/src/Core/NestByValue.h +111 -0
- data/ext/eigen/eigen3/Eigen/src/Core/NoAlias.h +134 -0
- data/ext/eigen/eigen3/Eigen/src/Core/NumTraits.h +150 -0
- data/ext/eigen/eigen3/Eigen/src/Core/PermutationMatrix.h +721 -0
- data/ext/eigen/eigen3/Eigen/src/Core/PlainObjectBase.h +822 -0
- data/ext/eigen/eigen3/Eigen/src/Core/ProductBase.h +290 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Random.h +152 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Redux.h +409 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Ref.h +278 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Replicate.h +177 -0
- data/ext/eigen/eigen3/Eigen/src/Core/ReturnByValue.h +99 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Reverse.h +224 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Select.h +162 -0
- data/ext/eigen/eigen3/Eigen/src/Core/SelfAdjointView.h +314 -0
- data/ext/eigen/eigen3/Eigen/src/Core/SelfCwiseBinaryOp.h +191 -0
- data/ext/eigen/eigen3/Eigen/src/Core/SolveTriangular.h +260 -0
- data/ext/eigen/eigen3/Eigen/src/Core/StableNorm.h +203 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Stride.h +108 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Swap.h +126 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Transpose.h +419 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Transpositions.h +436 -0
- data/ext/eigen/eigen3/Eigen/src/Core/TriangularMatrix.h +839 -0
- data/ext/eigen/eigen3/Eigen/src/Core/VectorBlock.h +95 -0
- data/ext/eigen/eigen3/Eigen/src/Core/VectorwiseOp.h +642 -0
- data/ext/eigen/eigen3/Eigen/src/Core/Visitor.h +237 -0
- data/ext/eigen/eigen3/Eigen/src/Core/arch/AltiVec/Complex.h +217 -0
- data/ext/eigen/eigen3/Eigen/src/Core/arch/AltiVec/PacketMath.h +501 -0
- data/ext/eigen/eigen3/Eigen/src/Core/arch/Default/Settings.h +49 -0
- data/ext/eigen/eigen3/Eigen/src/Core/arch/NEON/Complex.h +253 -0
- data/ext/eigen/eigen3/Eigen/src/Core/arch/NEON/PacketMath.h +420 -0
- data/ext/eigen/eigen3/Eigen/src/Core/arch/SSE/Complex.h +442 -0
- data/ext/eigen/eigen3/Eigen/src/Core/arch/SSE/MathFunctions.h +475 -0
- data/ext/eigen/eigen3/Eigen/src/Core/arch/SSE/PacketMath.h +649 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/CoeffBasedProduct.h +476 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1341 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix.h +427 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +278 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h +146 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h +118 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/GeneralMatrixVector.h +566 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/GeneralMatrixVector_MKL.h +131 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/Parallelizer.h +162 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +436 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h +295 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h +281 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h +114 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/SelfadjointProduct.h +123 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h +93 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h +427 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h +309 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/TriangularMatrixVector.h +348 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/TriangularMatrixVector_MKL.h +247 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h +332 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h +155 -0
- data/ext/eigen/eigen3/Eigen/src/Core/products/TriangularSolverVector.h +139 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/BlasUtil.h +264 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/Constants.h +451 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/DisableStupidWarnings.h +40 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/ForwardDeclarations.h +302 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/MKL_support.h +158 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/Macros.h +451 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/Memory.h +977 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/Meta.h +243 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/NonMPL2.h +3 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/ReenableStupidWarnings.h +14 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/StaticAssert.h +208 -0
- data/ext/eigen/eigen3/Eigen/src/Core/util/XprHelper.h +469 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Block.h +126 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Cwise.h +192 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/CwiseOperators.h +298 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Geometry/AlignedBox.h +159 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Geometry/All.h +115 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Geometry/AngleAxis.h +214 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Geometry/Hyperplane.h +254 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Geometry/ParametrizedLine.h +141 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Geometry/Quaternion.h +495 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Geometry/Rotation2D.h +145 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Geometry/RotationBase.h +123 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Geometry/Scaling.h +167 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Geometry/Transform.h +786 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Geometry/Translation.h +184 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/LU.h +120 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Lazy.h +71 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/LeastSquares.h +169 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Macros.h +20 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/MathFunctions.h +57 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Memory.h +45 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Meta.h +75 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/Minor.h +117 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/QR.h +67 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/SVD.h +637 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/TriangularSolver.h +42 -0
- data/ext/eigen/eigen3/Eigen/src/Eigen2Support/VectorBlock.h +94 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/ComplexEigenSolver.h +341 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/ComplexSchur.h +456 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/ComplexSchur_MKL.h +94 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/EigenSolver.h +607 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +350 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +227 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/HessenbergDecomposition.h +373 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +160 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/RealQZ.h +624 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/RealSchur.h +525 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/RealSchur_MKL.h +83 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +801 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h +92 -0
- data/ext/eigen/eigen3/Eigen/src/Eigenvalues/Tridiagonalization.h +557 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/AlignedBox.h +392 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/AngleAxis.h +233 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/EulerAngles.h +104 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/Homogeneous.h +307 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/Hyperplane.h +280 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/OrthoMethods.h +218 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/ParametrizedLine.h +195 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/Quaternion.h +776 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/Rotation2D.h +160 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/RotationBase.h +206 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/Scaling.h +166 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/Transform.h +1455 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/Translation.h +206 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/Umeyama.h +177 -0
- data/ext/eigen/eigen3/Eigen/src/Geometry/arch/Geometry_SSE.h +115 -0
- data/ext/eigen/eigen3/Eigen/src/Householder/BlockHouseholder.h +68 -0
- data/ext/eigen/eigen3/Eigen/src/Householder/Householder.h +171 -0
- data/ext/eigen/eigen3/Eigen/src/Householder/HouseholderSequence.h +441 -0
- data/ext/eigen/eigen3/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +149 -0
- data/ext/eigen/eigen3/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +263 -0
- data/ext/eigen/eigen3/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +256 -0
- data/ext/eigen/eigen3/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +282 -0
- data/ext/eigen/eigen3/Eigen/src/Jacobi/Jacobi.h +433 -0
- data/ext/eigen/eigen3/Eigen/src/LU/Determinant.h +101 -0
- data/ext/eigen/eigen3/Eigen/src/LU/FullPivLU.h +751 -0
- data/ext/eigen/eigen3/Eigen/src/LU/Inverse.h +400 -0
- data/ext/eigen/eigen3/Eigen/src/LU/PartialPivLU.h +509 -0
- data/ext/eigen/eigen3/Eigen/src/LU/PartialPivLU_MKL.h +85 -0
- data/ext/eigen/eigen3/Eigen/src/LU/arch/Inverse_SSE.h +329 -0
- data/ext/eigen/eigen3/Eigen/src/MetisSupport/MetisSupport.h +137 -0
- data/ext/eigen/eigen3/Eigen/src/OrderingMethods/Amd.h +444 -0
- data/ext/eigen/eigen3/Eigen/src/OrderingMethods/Eigen_Colamd.h +1850 -0
- data/ext/eigen/eigen3/Eigen/src/PaStiXSupport/PaStiXSupport.h +721 -0
- data/ext/eigen/eigen3/Eigen/src/PardisoSupport/PardisoSupport.h +592 -0
- data/ext/eigen/eigen3/Eigen/src/QR/ColPivHouseholderQR.h +580 -0
- data/ext/eigen/eigen3/Eigen/src/QR/ColPivHouseholderQR_MKL.h +99 -0
- data/ext/eigen/eigen3/Eigen/src/QR/FullPivHouseholderQR.h +622 -0
- data/ext/eigen/eigen3/Eigen/src/QR/HouseholderQR.h +388 -0
- data/ext/eigen/eigen3/Eigen/src/QR/HouseholderQR_MKL.h +71 -0
- data/ext/eigen/eigen3/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +338 -0
- data/ext/eigen/eigen3/Eigen/src/SVD/JacobiSVD.h +976 -0
- data/ext/eigen/eigen3/Eigen/src/SVD/JacobiSVD_MKL.h +92 -0
- data/ext/eigen/eigen3/Eigen/src/SVD/UpperBidiagonalization.h +148 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCholesky/SimplicialCholesky.h +671 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +199 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/AmbiVector.h +373 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/CompressedStorage.h +233 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +245 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/MappedSparseMatrix.h +181 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseBlock.h +537 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseColEtree.h +206 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +325 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +163 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseDenseProduct.h +311 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseDiagonalProduct.h +196 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseDot.h +101 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseFuzzy.h +26 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseMatrix.h +1262 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseMatrixBase.h +461 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparsePermutation.h +148 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseProduct.h +188 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseRedux.h +45 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseSelfAdjointView.h +507 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +150 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseTranspose.h +63 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseTriangularView.h +179 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseUtil.h +172 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseVector.h +448 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/SparseView.h +99 -0
- data/ext/eigen/eigen3/Eigen/src/SparseCore/TriangularSolver.h +334 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU.h +806 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLUImpl.h +66 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_Memory.h +227 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_Structs.h +111 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +298 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_Utils.h +80 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_column_bmod.h +180 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_column_dfs.h +177 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +106 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +279 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +127 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +130 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_panel_bmod.h +223 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_panel_dfs.h +258 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_pivotL.h +137 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_pruneL.h +135 -0
- data/ext/eigen/eigen3/Eigen/src/SparseLU/SparseLU_relax_snode.h +83 -0
- data/ext/eigen/eigen3/Eigen/src/SparseQR/SparseQR.h +714 -0
- data/ext/eigen/eigen3/Eigen/src/StlSupport/StdDeque.h +134 -0
- data/ext/eigen/eigen3/Eigen/src/StlSupport/StdList.h +114 -0
- data/ext/eigen/eigen3/Eigen/src/StlSupport/StdVector.h +126 -0
- data/ext/eigen/eigen3/Eigen/src/StlSupport/details.h +84 -0
- data/ext/eigen/eigen3/Eigen/src/SuperLUSupport/SuperLUSupport.h +1026 -0
- data/ext/eigen/eigen3/Eigen/src/UmfPackSupport/UmfPackSupport.h +474 -0
- data/ext/eigen/eigen3/Eigen/src/misc/Image.h +84 -0
- data/ext/eigen/eigen3/Eigen/src/misc/Kernel.h +81 -0
- data/ext/eigen/eigen3/Eigen/src/misc/Solve.h +76 -0
- data/ext/eigen/eigen3/Eigen/src/misc/SparseSolve.h +128 -0
- data/ext/eigen/eigen3/Eigen/src/misc/blas.h +658 -0
- data/ext/eigen/eigen3/Eigen/src/plugins/ArrayCwiseBinaryOps.h +253 -0
- data/ext/eigen/eigen3/Eigen/src/plugins/ArrayCwiseUnaryOps.h +187 -0
- data/ext/eigen/eigen3/Eigen/src/plugins/BlockMethods.h +935 -0
- data/ext/eigen/eigen3/Eigen/src/plugins/CommonCwiseBinaryOps.h +46 -0
- data/ext/eigen/eigen3/Eigen/src/plugins/CommonCwiseUnaryOps.h +172 -0
- data/ext/eigen/eigen3/Eigen/src/plugins/MatrixCwiseBinaryOps.h +143 -0
- data/ext/eigen/eigen3/Eigen/src/plugins/MatrixCwiseUnaryOps.h +52 -0
- data/ext/eigen/eigen3/signature_of_eigen3_matrix_library +1 -0
- data/ext/eigen/eigen_wrap.cxx +19420 -10396
- data/ext/eigen/extconf.rb +37 -2
- data/lib/eigen.rb +146 -3
- metadata +294 -7
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
/*
|
|
2
|
+
Copyright (c) 2011, Intel Corporation. All rights reserved.
|
|
3
|
+
|
|
4
|
+
Redistribution and use in source and binary forms, with or without modification,
|
|
5
|
+
are permitted provided that the following conditions are met:
|
|
6
|
+
|
|
7
|
+
* Redistributions of source code must retain the above copyright notice, this
|
|
8
|
+
list of conditions and the following disclaimer.
|
|
9
|
+
* Redistributions in binary form must reproduce the above copyright notice,
|
|
10
|
+
this list of conditions and the following disclaimer in the documentation
|
|
11
|
+
and/or other materials provided with the distribution.
|
|
12
|
+
* Neither the name of Intel Corporation nor the names of its contributors may
|
|
13
|
+
be used to endorse or promote products derived from this software without
|
|
14
|
+
specific prior written permission.
|
|
15
|
+
|
|
16
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
17
|
+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
18
|
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
19
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
|
20
|
+
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
21
|
+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
22
|
+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
|
23
|
+
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
24
|
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
25
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
26
|
+
|
|
27
|
+
********************************************************************************
|
|
28
|
+
* Content : Eigen bindings to Intel(R) MKL
|
|
29
|
+
* Level 3 BLAS SYRK/HERK implementation.
|
|
30
|
+
********************************************************************************
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
|
|
34
|
+
#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
|
|
35
|
+
|
|
36
|
+
namespace Eigen {
|
|
37
|
+
|
|
38
|
+
namespace internal {
|
|
39
|
+
|
|
40
|
+
template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo>
|
|
41
|
+
struct general_matrix_matrix_rankupdate :
|
|
42
|
+
general_matrix_matrix_triangular_product<
|
|
43
|
+
Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,UpLo,BuiltIn> {};
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
// try to go to BLAS specialization
|
|
47
|
+
#define EIGEN_MKL_RANKUPDATE_SPECIALIZE(Scalar) \
|
|
48
|
+
template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \
|
|
49
|
+
int RhsStorageOrder, bool ConjugateRhs, int UpLo> \
|
|
50
|
+
struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \
|
|
51
|
+
Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Specialized> { \
|
|
52
|
+
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
|
|
53
|
+
const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha) \
|
|
54
|
+
{ \
|
|
55
|
+
if (lhs==rhs) { \
|
|
56
|
+
general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
|
|
57
|
+
::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \
|
|
58
|
+
} else { \
|
|
59
|
+
general_matrix_matrix_triangular_product<Index, \
|
|
60
|
+
Scalar, LhsStorageOrder, ConjugateLhs, \
|
|
61
|
+
Scalar, RhsStorageOrder, ConjugateRhs, \
|
|
62
|
+
ColMajor, UpLo, BuiltIn> \
|
|
63
|
+
::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \
|
|
64
|
+
} \
|
|
65
|
+
} \
|
|
66
|
+
};
|
|
67
|
+
|
|
68
|
+
EIGEN_MKL_RANKUPDATE_SPECIALIZE(double)
|
|
69
|
+
//EIGEN_MKL_RANKUPDATE_SPECIALIZE(dcomplex)
|
|
70
|
+
EIGEN_MKL_RANKUPDATE_SPECIALIZE(float)
|
|
71
|
+
//EIGEN_MKL_RANKUPDATE_SPECIALIZE(scomplex)
|
|
72
|
+
|
|
73
|
+
// SYRK for float/double
|
|
74
|
+
#define EIGEN_MKL_RANKUPDATE_R(EIGTYPE, MKLTYPE, MKLFUNC) \
|
|
75
|
+
template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \
|
|
76
|
+
struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
|
|
77
|
+
enum { \
|
|
78
|
+
IsLower = (UpLo&Lower) == Lower, \
|
|
79
|
+
LowUp = IsLower ? Lower : Upper, \
|
|
80
|
+
conjA = ((AStorageOrder==ColMajor) && ConjugateA) ? 1 : 0 \
|
|
81
|
+
}; \
|
|
82
|
+
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
|
|
83
|
+
const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \
|
|
84
|
+
{ \
|
|
85
|
+
/* typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;*/ \
|
|
86
|
+
\
|
|
87
|
+
MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \
|
|
88
|
+
char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'T':'N'; \
|
|
89
|
+
MKLTYPE alpha_, beta_; \
|
|
90
|
+
\
|
|
91
|
+
/* Set alpha_ & beta_ */ \
|
|
92
|
+
assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
|
|
93
|
+
assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
|
|
94
|
+
MKLFUNC(&uplo, &trans, &n, &k, &alpha_, lhs, &lda, &beta_, res, &ldc); \
|
|
95
|
+
} \
|
|
96
|
+
};
|
|
97
|
+
|
|
98
|
+
// HERK for complex data
|
|
99
|
+
#define EIGEN_MKL_RANKUPDATE_C(EIGTYPE, MKLTYPE, RTYPE, MKLFUNC) \
|
|
100
|
+
template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \
|
|
101
|
+
struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
|
|
102
|
+
enum { \
|
|
103
|
+
IsLower = (UpLo&Lower) == Lower, \
|
|
104
|
+
LowUp = IsLower ? Lower : Upper, \
|
|
105
|
+
conjA = (((AStorageOrder==ColMajor) && ConjugateA) || ((AStorageOrder==RowMajor) && !ConjugateA)) ? 1 : 0 \
|
|
106
|
+
}; \
|
|
107
|
+
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
|
|
108
|
+
const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \
|
|
109
|
+
{ \
|
|
110
|
+
typedef Matrix<EIGTYPE, Dynamic, Dynamic, AStorageOrder> MatrixType; \
|
|
111
|
+
\
|
|
112
|
+
MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \
|
|
113
|
+
char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'C':'N'; \
|
|
114
|
+
RTYPE alpha_, beta_; \
|
|
115
|
+
const EIGTYPE* a_ptr; \
|
|
116
|
+
\
|
|
117
|
+
/* Set alpha_ & beta_ */ \
|
|
118
|
+
/* assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); */\
|
|
119
|
+
/* assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1));*/ \
|
|
120
|
+
alpha_ = alpha.real(); \
|
|
121
|
+
beta_ = 1.0; \
|
|
122
|
+
/* Copy with conjugation in some cases*/ \
|
|
123
|
+
MatrixType a; \
|
|
124
|
+
if (conjA) { \
|
|
125
|
+
Map<const MatrixType, 0, OuterStride<> > mapA(lhs,n,k,OuterStride<>(lhsStride)); \
|
|
126
|
+
a = mapA.conjugate(); \
|
|
127
|
+
lda = a.outerStride(); \
|
|
128
|
+
a_ptr = a.data(); \
|
|
129
|
+
} else a_ptr=lhs; \
|
|
130
|
+
MKLFUNC(&uplo, &trans, &n, &k, &alpha_, (MKLTYPE*)a_ptr, &lda, &beta_, (MKLTYPE*)res, &ldc); \
|
|
131
|
+
} \
|
|
132
|
+
};
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
EIGEN_MKL_RANKUPDATE_R(double, double, dsyrk)
|
|
136
|
+
EIGEN_MKL_RANKUPDATE_R(float, float, ssyrk)
|
|
137
|
+
|
|
138
|
+
//EIGEN_MKL_RANKUPDATE_C(dcomplex, MKL_Complex16, double, zherk)
|
|
139
|
+
//EIGEN_MKL_RANKUPDATE_C(scomplex, MKL_Complex8, double, cherk)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
} // end namespace internal
|
|
143
|
+
|
|
144
|
+
} // end namespace Eigen
|
|
145
|
+
|
|
146
|
+
#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
/*
|
|
2
|
+
Copyright (c) 2011, Intel Corporation. All rights reserved.
|
|
3
|
+
|
|
4
|
+
Redistribution and use in source and binary forms, with or without modification,
|
|
5
|
+
are permitted provided that the following conditions are met:
|
|
6
|
+
|
|
7
|
+
* Redistributions of source code must retain the above copyright notice, this
|
|
8
|
+
list of conditions and the following disclaimer.
|
|
9
|
+
* Redistributions in binary form must reproduce the above copyright notice,
|
|
10
|
+
this list of conditions and the following disclaimer in the documentation
|
|
11
|
+
and/or other materials provided with the distribution.
|
|
12
|
+
* Neither the name of Intel Corporation nor the names of its contributors may
|
|
13
|
+
be used to endorse or promote products derived from this software without
|
|
14
|
+
specific prior written permission.
|
|
15
|
+
|
|
16
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
17
|
+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
18
|
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
19
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
|
20
|
+
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
21
|
+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
22
|
+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
|
23
|
+
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
24
|
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
25
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
26
|
+
|
|
27
|
+
********************************************************************************
|
|
28
|
+
* Content : Eigen bindings to Intel(R) MKL
|
|
29
|
+
* General matrix-matrix product functionality based on ?GEMM.
|
|
30
|
+
********************************************************************************
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
|
|
34
|
+
#define EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
|
|
35
|
+
|
|
36
|
+
namespace Eigen {
|
|
37
|
+
|
|
38
|
+
namespace internal {
|
|
39
|
+
|
|
40
|
+
/**********************************************************************
|
|
41
|
+
* This file implements general matrix-matrix multiplication using BLAS
|
|
42
|
+
* gemm function via partial specialization of
|
|
43
|
+
* general_matrix_matrix_product::run(..) method for float, double,
|
|
44
|
+
* std::complex<float> and std::complex<double> types
|
|
45
|
+
**********************************************************************/
|
|
46
|
+
|
|
47
|
+
// gemm specialization
|
|
48
|
+
|
|
49
|
+
#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, MKLTYPE, MKLPREFIX) \
|
|
50
|
+
template< \
|
|
51
|
+
typename Index, \
|
|
52
|
+
int LhsStorageOrder, bool ConjugateLhs, \
|
|
53
|
+
int RhsStorageOrder, bool ConjugateRhs> \
|
|
54
|
+
struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor> \
|
|
55
|
+
{ \
|
|
56
|
+
static void run(Index rows, Index cols, Index depth, \
|
|
57
|
+
const EIGTYPE* _lhs, Index lhsStride, \
|
|
58
|
+
const EIGTYPE* _rhs, Index rhsStride, \
|
|
59
|
+
EIGTYPE* res, Index resStride, \
|
|
60
|
+
EIGTYPE alpha, \
|
|
61
|
+
level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, \
|
|
62
|
+
GemmParallelInfo<Index>* /*info = 0*/) \
|
|
63
|
+
{ \
|
|
64
|
+
using std::conj; \
|
|
65
|
+
\
|
|
66
|
+
char transa, transb; \
|
|
67
|
+
MKL_INT m, n, k, lda, ldb, ldc; \
|
|
68
|
+
const EIGTYPE *a, *b; \
|
|
69
|
+
MKLTYPE alpha_, beta_; \
|
|
70
|
+
MatrixX##EIGPREFIX a_tmp, b_tmp; \
|
|
71
|
+
EIGTYPE myone(1);\
|
|
72
|
+
\
|
|
73
|
+
/* Set transpose options */ \
|
|
74
|
+
transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
|
|
75
|
+
transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
|
|
76
|
+
\
|
|
77
|
+
/* Set m, n, k */ \
|
|
78
|
+
m = (MKL_INT)rows; \
|
|
79
|
+
n = (MKL_INT)cols; \
|
|
80
|
+
k = (MKL_INT)depth; \
|
|
81
|
+
\
|
|
82
|
+
/* Set alpha_ & beta_ */ \
|
|
83
|
+
assign_scalar_eig2mkl(alpha_, alpha); \
|
|
84
|
+
assign_scalar_eig2mkl(beta_, myone); \
|
|
85
|
+
\
|
|
86
|
+
/* Set lda, ldb, ldc */ \
|
|
87
|
+
lda = (MKL_INT)lhsStride; \
|
|
88
|
+
ldb = (MKL_INT)rhsStride; \
|
|
89
|
+
ldc = (MKL_INT)resStride; \
|
|
90
|
+
\
|
|
91
|
+
/* Set a, b, c */ \
|
|
92
|
+
if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \
|
|
93
|
+
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \
|
|
94
|
+
a_tmp = lhs.conjugate(); \
|
|
95
|
+
a = a_tmp.data(); \
|
|
96
|
+
lda = a_tmp.outerStride(); \
|
|
97
|
+
} else a = _lhs; \
|
|
98
|
+
\
|
|
99
|
+
if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \
|
|
100
|
+
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \
|
|
101
|
+
b_tmp = rhs.conjugate(); \
|
|
102
|
+
b = b_tmp.data(); \
|
|
103
|
+
ldb = b_tmp.outerStride(); \
|
|
104
|
+
} else b = _rhs; \
|
|
105
|
+
\
|
|
106
|
+
MKLPREFIX##gemm(&transa, &transb, &m, &n, &k, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
|
|
107
|
+
}};
|
|
108
|
+
|
|
109
|
+
GEMM_SPECIALIZATION(double, d, double, d)
|
|
110
|
+
GEMM_SPECIALIZATION(float, f, float, s)
|
|
111
|
+
GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, z)
|
|
112
|
+
GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, c)
|
|
113
|
+
|
|
114
|
+
} // end namespase internal
|
|
115
|
+
|
|
116
|
+
} // end namespace Eigen
|
|
117
|
+
|
|
118
|
+
#endif // EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
|
|
@@ -0,0 +1,566 @@
|
|
|
1
|
+
// This file is part of Eigen, a lightweight C++ template library
|
|
2
|
+
// for linear algebra.
|
|
3
|
+
//
|
|
4
|
+
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
|
|
5
|
+
//
|
|
6
|
+
// This Source Code Form is subject to the terms of the Mozilla
|
|
7
|
+
// Public License v. 2.0. If a copy of the MPL was not distributed
|
|
8
|
+
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
9
|
+
|
|
10
|
+
#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
|
|
11
|
+
#define EIGEN_GENERAL_MATRIX_VECTOR_H
|
|
12
|
+
|
|
13
|
+
namespace Eigen {
|
|
14
|
+
|
|
15
|
+
namespace internal {
|
|
16
|
+
|
|
17
|
+
/* Optimized col-major matrix * vector product:
|
|
18
|
+
 * This algorithm processes 4 columns at once, which allows us to both reduce
|
|
19
|
+
* the number of load/stores of the result by a factor 4 and to reduce
|
|
20
|
+
* the instruction dependency. Moreover, we know that all bands have the
|
|
21
|
+
* same alignment pattern.
|
|
22
|
+
*
|
|
23
|
+
* Mixing type logic: C += alpha * A * B
|
|
24
|
+
* | A | B |alpha| comments
|
|
25
|
+
* |real |cplx |cplx | no vectorization
|
|
26
|
+
* |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
|
|
27
|
+
 * |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
|
|
28
|
+
* |cplx |real |real | optimal case, vectorization possible via real-cplx mul
|
|
29
|
+
*/
|
|
30
|
+
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
  // Scalar type of the result: the promotion of LhsScalar * RhsScalar
  // (e.g. real * complex -> complex), as defined by scalar_product_traits.
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

enum {
  // SIMD is usable only when both operand types are vectorizable AND their
  // packets hold the same number of scalars, so that lhs/rhs lanes line up.
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
  // Packet widths collapse to 1 (pure scalar path) when not vectorizable.
  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
};

typedef typename packet_traits<LhsScalar>::type _LhsPacket;
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
typedef typename packet_traits<ResScalar>::type _ResPacket;

// When vectorization is disabled the "packet" types degrade to the plain
// scalar types, so the same kernel source compiles for both paths.
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

// Computes res += alpha * lhs * rhs for a column-major lhs.
// The result must be a dense (unit-increment) vector: the out-of-line
// definition asserts resIncr==1. rhs may have an arbitrary increment.
EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr, RhsScalar alpha);
};
|
|
57
|
+
|
|
58
|
+
// Col-major kernel: res += alpha * lhs * rhs.
// Walks the matrix four columns at a time; each column contributes
// alpha*rhs[i] * column_i to the whole result vector. The bulk of the code
// deals with memory alignment so that aligned packet loads/stores can be
// used on res and on as many lhs columns as possible.
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr, RhsScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(resIncr)
  eigen_internal_assert(resIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif
  // Accumulates one result packet: res[j..] += sum over the 4 current columns
  // of lhsX[j..]*ptmpX. A0/A13/A2 select the load flavor per column via
  // token pasting: 'd' -> pload (aligned), 'du' -> ploadu (unaligned).
  // A13 is shared by columns 1 and 3 because they have the same alignment.
  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
    pstore(&res[j], \
      padd(pload<ResPacket>(&res[j]), \
        padd( \
          padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \
               pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
          padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \
               pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))

  // cj/pcj apply the requested conjugations inside scalar/packet multiplies.
  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  // rhs conjugation is folded into alpha once, since rhs coefficients are
  // only ever used multiplied by alpha below.
  if(ConjugateRhs)
    alpha = numext::conj(alpha);

  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
  const Index columnsAtOnce = 4;   // unroll factor over columns
  const Index peels = 2;           // extra unroll over rows in the FirstAligned case
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index ResPacketAlignedMask = ResPacketSize-1;
//  const Index PeelAlignedMask = ResPacketSize*peels-1;
  const Index size = rows;

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type.
  Index alignedStart = internal::first_aligned(res,size);
  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
  // Last index usable by the 2x-peeled loop (it reads one packet ahead).
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  // Alignment offset drift from one column to the next (0 when the stride is
  // a multiple of the packet size, i.e. all columns share the same alignment).
  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                       : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                       : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);

  // find how many columns do we have to skip to be aligned with the result (if possible)
  Index skipColumns = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
  {
    // Not even scalar-aligned: give up on the vectorized middle section.
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);

    // Search for the first column whose alignment offset matches res's.
    while (skipColumns<LhsPacketSize &&
          alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
      ++skipColumns;
    if (skipColumns==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any column
      alignmentPattern = NoneAligned;
      skipColumns = 0;
    }
    else
    {
      skipColumns = (std::min)(skipColumns,cols);
      // note that the skipped columns are processed later.
    }

    eigen_internal_assert(  (alignmentPattern==NoneAligned)
                          || (skipColumns + columnsAtOnce >= cols)
                          || LhsPacketSize > size
                          || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    // Packet size 1: everything is trivially "aligned".
    alignedStart = 0;
    alignedSize = size;
    alignmentPattern = AllAligned;
  }

  // Column visiting order within a group of 4; swapped when alignmentStep==1
  // so that offsets match the fixed palign<1>/palign<3> shifts used below.
  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
  {
    // Broadcast alpha*rhs[k] for each of the 4 columns into packets.
    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
              ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
              ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
              ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);

    // this helps a lot generating better binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      // process initial unaligned coeffs
      for (Index j=0; j<alignedStart; ++j)
      {
        res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
        res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
        res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
        res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            // Every column aligned: pure aligned loads.
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            // Columns 0 and 2 aligned, 1 and 3 not.
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
          {
            // Only column 0 aligned. The peeled loop below synthesizes the
            // unaligned loads of columns 1..3 from aligned loads + palign
            // shifts, processing two result packets per iteration.
            Index j = alignedStart;
            if(peels>1)
            {
              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
              ResPacket T0, T1;

              // Prime the aligned packets preceding the current position
              // for each shifted column (offsets -1, -2, -3).
              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (; j<peeledSize; j+=peels*ResPacketSize)
              {
                // Load the next aligned packet and shift-combine it with the
                // previous one to obtain the unaligned packet at j.
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);

                A00 = pload<LhsPacket>(&lhs0[j]);
                A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
                T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
                T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));

                // Interleave loads and multiply-adds to hide load latency.
                T0  = pcj.pmadd(A01, ptmp1, T0);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
                T0  = pcj.pmadd(A02, ptmp2, T0);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
                T0  = pcj.pmadd(A03, ptmp3, T0);
                pstore(&res[j],T0);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);
                T1  = pcj.pmadd(A11, ptmp1, T1);
                T1  = pcj.pmadd(A12, ptmp2, T1);
                T1  = pcj.pmadd(A13, ptmp3, T1);
                pstore(&res[j+ResPacketSize],T1);
              }
            }
            // Tail of the aligned region, one packet at a time.
            for (; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          }
          default:
            // NoneAligned: unaligned loads for every column.
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
      }
    } // end explicit vectorization

    /* process remaining coeffs (or all if there is no explicit vectorization) */
    for (Index j=alignedSize; j<size; ++j)
    {
      res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
      res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
      res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
      res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
    }
  }

  // process remaining first and last columns (at most columnsAtOnce-1)
  // The do/while runs once for the trailing columns [columnBound,cols), then
  // (if columns were skipped for alignment) once more for [0,skipColumns).
  Index end = cols;
  Index start = columnBound;
  do
  {
    for (Index k=start; k<end; ++k)
    {
      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
      const LhsScalar* lhs0 = lhs + k*lhsStride;

      if (Vectorizable)
      {
        /* explicit vectorization */
        // process first unaligned result's coeffs
        for (Index j=0; j<alignedStart; ++j)
          res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
        // process aligned result's coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
        else
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
      }

      // process remaining scalars (or all if no explicit vectorization)
      for (Index i=alignedSize; i<size; ++i)
        res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
    }
    if (skipColumns)
    {
      start = 0;
      end = skipColumns;
      skipColumns = 0;
    }
    else
      break;
  } while(Vectorizable);
  #undef _EIGEN_ACCUMULATE_PACKETS
}
|
|
282
|
+
|
|
283
|
+
/* Optimized row-major matrix * vector product:
|
|
284
|
+
 * This algorithm processes 4 rows at once, which allows us to both reduce
|
|
285
|
+
* the number of load/stores of the result by a factor 4 and to reduce
|
|
286
|
+
* the instruction dependency. Moreover, we know that all bands have the
|
|
287
|
+
* same alignment pattern.
|
|
288
|
+
*
|
|
289
|
+
* Mixing type logic:
|
|
290
|
+
* - alpha is always a complex (or converted to a complex)
|
|
291
|
+
* - no vectorization
|
|
292
|
+
*/
|
|
293
|
+
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
// Scalar type of the result: the promotion of LhsScalar * RhsScalar
// (e.g. real * complex -> complex), as defined by scalar_product_traits.
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

enum {
  // SIMD is usable only when both operand types are vectorizable AND their
  // packets hold the same number of scalars, so that lhs/rhs lanes line up.
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
  // Packet widths collapse to 1 (pure scalar path) when not vectorizable.
  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
};

typedef typename packet_traits<LhsScalar>::type _LhsPacket;
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
typedef typename packet_traits<ResScalar>::type _ResPacket;

// When vectorization is disabled the "packet" types degrade to the plain
// scalar types, so the same kernel source compiles for both paths.
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

// Computes res += alpha * lhs * rhs for a row-major lhs.
// Here rhs must be dense (the out-of-line definition asserts rhsIncr==1)
// while res may have an arbitrary increment. Note alpha is ResScalar here,
// unlike the col-major variant where it is RhsScalar.
EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr,
  ResScalar alpha);
};
|
|
321
|
+
|
|
322
|
+
// Row-major kernel: res += alpha * lhs * rhs.
// Walks the matrix four rows at a time; each row is a dot product with rhs,
// accumulated in packet registers and horizontally reduced (predux) at the
// end. The alignment machinery mirrors the col-major kernel, but here it is
// rhs (not res) that drives the aligned region.
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr,
  ResScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(rhsIncr);
  eigen_internal_assert(rhsIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif

  // Accumulates one rhs packet into the 4 per-row accumulators ptmp0..3.
  // A0/A13/A2 select the load flavor per row via token pasting:
  // 'd' -> pload (aligned), 'du' -> ploadu (unaligned). A13 is shared by
  // rows 1 and 3 because they have the same alignment.
  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
    RhsPacket b = pload<RhsPacket>(&rhs[j]); \
    ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
    ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
    ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
    ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }

  // cj/pcj apply the requested conjugations inside scalar/packet multiplies.
  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;

  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
  const Index rowsAtOnce = 4;   // unroll factor over rows
  const Index peels = 2;        // extra unroll over columns in the FirstAligned case
  const Index RhsPacketAlignedMask = RhsPacketSize-1;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
//  const Index PeelAlignedMask = RhsPacketSize*peels-1;
  const Index depth = cols;     // length of each dot product

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type
  // if that's not the case then vectorization is discarded, see below.
  Index alignedStart = internal::first_aligned(rhs, depth);
  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
  // Last index usable by the 2x-peeled loop (it reads one packet ahead).
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  // Alignment offset drift from one row to the next (0 when the stride is a
  // multiple of the packet size, i.e. all rows share the same alignment).
  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);

  // find how many rows do we have to skip to be aligned with rhs (if possible)
  Index skipRows = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
  {
    // Mixed scalar sizes or not even scalar-aligned: give up on the
    // vectorized middle section.
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0  || depth<LhsPacketSize);

    // Search for the first row whose alignment offset matches rhs's.
    while (skipRows<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
      ++skipRows;
    if (skipRows==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any column
      alignmentPattern = NoneAligned;
      skipRows = 0;
    }
    else
    {
      skipRows = (std::min)(skipRows,Index(rows));
      // note that the skipped rows are processed later.
    }
    eigen_internal_assert(  alignmentPattern==NoneAligned
                          || LhsPacketSize==1
                          || (skipRows + rowsAtOnce >= rows)
                          || LhsPacketSize > depth
                          || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    // Packet size 1: everything is trivially "aligned".
    alignedStart = 0;
    alignedSize = depth;
    alignmentPattern = AllAligned;
  }

  // Row visiting order within a group of 4; swapped when alignmentStep==1
  // so that offsets match the fixed palign<1>/palign<3> shifts used below.
  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
  {
    // Scalar accumulators for the 4 dot products of this row group.
    EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

    // this helps the compiler generating good binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
                ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));

      // process initial unaligned coeffs
      // FIXME this loop gets vectorized by the compiler !
      for (Index j=0; j<alignedStart; ++j)
      {
        RhsScalar b = rhs[j];
        tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
        tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            // Every row aligned: pure aligned loads.
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            // Rows 0 and 2 aligned, 1 and 3 not.
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if (peels>1)
            {
              /* Here we process 4 rows with two peeled iterations to hide
               * the overhead of unaligned loads. Moreover unaligned loads are handled
               * using special shift/move operations between the two aligned packets
               * overlapping the desired unaligned packet. This is *much* more efficient
               * than basic unaligned loads.
               */
              LhsPacket A01, A02, A03, A11, A12, A13;
              // Prime the aligned packets preceding the current position
              // for each shifted row (offsets -1, -2, -3).
              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (; j<peeledSize; j+=peels*RhsPacketSize)
              {
                RhsPacket b = pload<RhsPacket>(&rhs[j]);
                // Load the next aligned packet and shift-combine it with the
                // previous one to obtain the unaligned packet at j.
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);

                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
                ptmp1 = pcj.pmadd(A01, b, ptmp1);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
                ptmp2 = pcj.pmadd(A02, b, ptmp2);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
                ptmp3 = pcj.pmadd(A03, b, ptmp3);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);

                // Second peeled step: next rhs packet against the packets
                // shifted out above.
                b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
                ptmp1 = pcj.pmadd(A11, b, ptmp1);
                ptmp2 = pcj.pmadd(A12, b, ptmp2);
                ptmp3 = pcj.pmadd(A13, b, ptmp3);
              }
            }
            // Tail of the aligned region, one packet at a time.
            for (; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          }
          default:
            // NoneAligned: unaligned loads for every row.
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
        // Horizontal reduction of the packet accumulators into the scalars.
        tmp0 += predux(ptmp0);
        tmp1 += predux(ptmp1);
        tmp2 += predux(ptmp2);
        tmp3 += predux(ptmp3);
      }
    } // end explicit vectorization

    // process remaining coeffs (or all if no explicit vectorization)
    // FIXME this loop gets vectorized by the compiler !
    for (Index j=alignedSize; j<depth; ++j)
    {
      RhsScalar b = rhs[j];
      tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
      tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
    }
    // Scale by alpha and scatter into res (which may be strided).
    res[i*resIncr]            += alpha*tmp0;
    res[(i+offset1)*resIncr]  += alpha*tmp1;
    res[(i+2)*resIncr]        += alpha*tmp2;
    res[(i+offset3)*resIncr]  += alpha*tmp3;
  }

  // process remaining first and last rows (at most columnsAtOnce-1)
  // The do/while runs once for the trailing rows [rowBound,rows), then (if
  // rows were skipped for alignment) once more for [0,skipRows).
  Index end = rows;
  Index start = rowBound;
  do
  {
    for (Index i=start; i<end; ++i)
    {
      EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
      const LhsScalar* lhs0 = lhs + i*lhsStride;
      // process first unaligned result's coeffs
      // FIXME this loop gets vectorized by the compiler !
      for (Index j=0; j<alignedStart; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);

      if (alignedSize>alignedStart)
      {
        // process aligned rhs coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        else
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        tmp0 += predux(ptmp0);
      }

      // process remaining scalars
      // FIXME this loop gets vectorized by the compiler !
      for (Index j=alignedSize; j<depth; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);
      res[i*resIncr] += alpha*tmp0;
    }
    if (skipRows)
    {
      start = 0;
      end = skipRows;
      skipRows = 0;
    }
    else
      break;
  } while(Vectorizable);

  #undef _EIGEN_ACCUMULATE_PACKETS
}
|
|
561
|
+
|
|
562
|
+
} // end namespace internal
|
|
563
|
+
|
|
564
|
+
} // end namespace Eigen
|
|
565
|
+
|
|
566
|
+
#endif // EIGEN_GENERAL_MATRIX_VECTOR_H
|