tomoto 0.1.0
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +123 -0
- data/ext/tomoto/ext.cpp +245 -0
- data/ext/tomoto/extconf.rb +28 -0
- data/lib/tomoto.rb +12 -0
- data/lib/tomoto/ct.rb +11 -0
- data/lib/tomoto/hdp.rb +11 -0
- data/lib/tomoto/lda.rb +67 -0
- data/lib/tomoto/version.rb +3 -0
- data/vendor/EigenRand/EigenRand/Core.h +1139 -0
- data/vendor/EigenRand/EigenRand/Dists/Basic.h +111 -0
- data/vendor/EigenRand/EigenRand/Dists/Discrete.h +877 -0
- data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +108 -0
- data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +626 -0
- data/vendor/EigenRand/EigenRand/EigenRand +19 -0
- data/vendor/EigenRand/EigenRand/Macro.h +24 -0
- data/vendor/EigenRand/EigenRand/MorePacketMath.h +978 -0
- data/vendor/EigenRand/EigenRand/PacketFilter.h +286 -0
- data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +624 -0
- data/vendor/EigenRand/EigenRand/RandUtils.h +413 -0
- data/vendor/EigenRand/EigenRand/doc.h +220 -0
- data/vendor/EigenRand/LICENSE +21 -0
- data/vendor/EigenRand/README.md +288 -0
- data/vendor/eigen/COPYING.BSD +26 -0
- data/vendor/eigen/COPYING.GPL +674 -0
- data/vendor/eigen/COPYING.LGPL +502 -0
- data/vendor/eigen/COPYING.MINPACK +52 -0
- data/vendor/eigen/COPYING.MPL2 +373 -0
- data/vendor/eigen/COPYING.README +18 -0
- data/vendor/eigen/Eigen/CMakeLists.txt +19 -0
- data/vendor/eigen/Eigen/Cholesky +46 -0
- data/vendor/eigen/Eigen/CholmodSupport +48 -0
- data/vendor/eigen/Eigen/Core +537 -0
- data/vendor/eigen/Eigen/Dense +7 -0
- data/vendor/eigen/Eigen/Eigen +2 -0
- data/vendor/eigen/Eigen/Eigenvalues +61 -0
- data/vendor/eigen/Eigen/Geometry +62 -0
- data/vendor/eigen/Eigen/Householder +30 -0
- data/vendor/eigen/Eigen/IterativeLinearSolvers +48 -0
- data/vendor/eigen/Eigen/Jacobi +33 -0
- data/vendor/eigen/Eigen/LU +50 -0
- data/vendor/eigen/Eigen/MetisSupport +35 -0
- data/vendor/eigen/Eigen/OrderingMethods +73 -0
- data/vendor/eigen/Eigen/PaStiXSupport +48 -0
- data/vendor/eigen/Eigen/PardisoSupport +35 -0
- data/vendor/eigen/Eigen/QR +51 -0
- data/vendor/eigen/Eigen/QtAlignedMalloc +40 -0
- data/vendor/eigen/Eigen/SPQRSupport +34 -0
- data/vendor/eigen/Eigen/SVD +51 -0
- data/vendor/eigen/Eigen/Sparse +36 -0
- data/vendor/eigen/Eigen/SparseCholesky +45 -0
- data/vendor/eigen/Eigen/SparseCore +69 -0
- data/vendor/eigen/Eigen/SparseLU +46 -0
- data/vendor/eigen/Eigen/SparseQR +37 -0
- data/vendor/eigen/Eigen/StdDeque +27 -0
- data/vendor/eigen/Eigen/StdList +26 -0
- data/vendor/eigen/Eigen/StdVector +27 -0
- data/vendor/eigen/Eigen/SuperLUSupport +64 -0
- data/vendor/eigen/Eigen/UmfPackSupport +40 -0
- data/vendor/eigen/Eigen/src/Cholesky/LDLT.h +673 -0
- data/vendor/eigen/Eigen/src/Cholesky/LLT.h +542 -0
- data/vendor/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +99 -0
- data/vendor/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +639 -0
- data/vendor/eigen/Eigen/src/Core/Array.h +329 -0
- data/vendor/eigen/Eigen/src/Core/ArrayBase.h +226 -0
- data/vendor/eigen/Eigen/src/Core/ArrayWrapper.h +209 -0
- data/vendor/eigen/Eigen/src/Core/Assign.h +90 -0
- data/vendor/eigen/Eigen/src/Core/AssignEvaluator.h +935 -0
- data/vendor/eigen/Eigen/src/Core/Assign_MKL.h +178 -0
- data/vendor/eigen/Eigen/src/Core/BandMatrix.h +353 -0
- data/vendor/eigen/Eigen/src/Core/Block.h +452 -0
- data/vendor/eigen/Eigen/src/Core/BooleanRedux.h +164 -0
- data/vendor/eigen/Eigen/src/Core/CommaInitializer.h +160 -0
- data/vendor/eigen/Eigen/src/Core/ConditionEstimator.h +175 -0
- data/vendor/eigen/Eigen/src/Core/CoreEvaluators.h +1688 -0
- data/vendor/eigen/Eigen/src/Core/CoreIterators.h +127 -0
- data/vendor/eigen/Eigen/src/Core/CwiseBinaryOp.h +184 -0
- data/vendor/eigen/Eigen/src/Core/CwiseNullaryOp.h +866 -0
- data/vendor/eigen/Eigen/src/Core/CwiseTernaryOp.h +197 -0
- data/vendor/eigen/Eigen/src/Core/CwiseUnaryOp.h +103 -0
- data/vendor/eigen/Eigen/src/Core/CwiseUnaryView.h +128 -0
- data/vendor/eigen/Eigen/src/Core/DenseBase.h +611 -0
- data/vendor/eigen/Eigen/src/Core/DenseCoeffsBase.h +681 -0
- data/vendor/eigen/Eigen/src/Core/DenseStorage.h +570 -0
- data/vendor/eigen/Eigen/src/Core/Diagonal.h +260 -0
- data/vendor/eigen/Eigen/src/Core/DiagonalMatrix.h +343 -0
- data/vendor/eigen/Eigen/src/Core/DiagonalProduct.h +28 -0
- data/vendor/eigen/Eigen/src/Core/Dot.h +318 -0
- data/vendor/eigen/Eigen/src/Core/EigenBase.h +159 -0
- data/vendor/eigen/Eigen/src/Core/ForceAlignedAccess.h +146 -0
- data/vendor/eigen/Eigen/src/Core/Fuzzy.h +155 -0
- data/vendor/eigen/Eigen/src/Core/GeneralProduct.h +455 -0
- data/vendor/eigen/Eigen/src/Core/GenericPacketMath.h +593 -0
- data/vendor/eigen/Eigen/src/Core/GlobalFunctions.h +187 -0
- data/vendor/eigen/Eigen/src/Core/IO.h +225 -0
- data/vendor/eigen/Eigen/src/Core/Inverse.h +118 -0
- data/vendor/eigen/Eigen/src/Core/Map.h +171 -0
- data/vendor/eigen/Eigen/src/Core/MapBase.h +303 -0
- data/vendor/eigen/Eigen/src/Core/MathFunctions.h +1415 -0
- data/vendor/eigen/Eigen/src/Core/MathFunctionsImpl.h +101 -0
- data/vendor/eigen/Eigen/src/Core/Matrix.h +459 -0
- data/vendor/eigen/Eigen/src/Core/MatrixBase.h +529 -0
- data/vendor/eigen/Eigen/src/Core/NestByValue.h +110 -0
- data/vendor/eigen/Eigen/src/Core/NoAlias.h +108 -0
- data/vendor/eigen/Eigen/src/Core/NumTraits.h +248 -0
- data/vendor/eigen/Eigen/src/Core/PermutationMatrix.h +633 -0
- data/vendor/eigen/Eigen/src/Core/PlainObjectBase.h +1035 -0
- data/vendor/eigen/Eigen/src/Core/Product.h +186 -0
- data/vendor/eigen/Eigen/src/Core/ProductEvaluators.h +1112 -0
- data/vendor/eigen/Eigen/src/Core/Random.h +182 -0
- data/vendor/eigen/Eigen/src/Core/Redux.h +505 -0
- data/vendor/eigen/Eigen/src/Core/Ref.h +283 -0
- data/vendor/eigen/Eigen/src/Core/Replicate.h +142 -0
- data/vendor/eigen/Eigen/src/Core/ReturnByValue.h +117 -0
- data/vendor/eigen/Eigen/src/Core/Reverse.h +211 -0
- data/vendor/eigen/Eigen/src/Core/Select.h +162 -0
- data/vendor/eigen/Eigen/src/Core/SelfAdjointView.h +352 -0
- data/vendor/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +47 -0
- data/vendor/eigen/Eigen/src/Core/Solve.h +188 -0
- data/vendor/eigen/Eigen/src/Core/SolveTriangular.h +235 -0
- data/vendor/eigen/Eigen/src/Core/SolverBase.h +130 -0
- data/vendor/eigen/Eigen/src/Core/StableNorm.h +221 -0
- data/vendor/eigen/Eigen/src/Core/Stride.h +111 -0
- data/vendor/eigen/Eigen/src/Core/Swap.h +67 -0
- data/vendor/eigen/Eigen/src/Core/Transpose.h +403 -0
- data/vendor/eigen/Eigen/src/Core/Transpositions.h +407 -0
- data/vendor/eigen/Eigen/src/Core/TriangularMatrix.h +983 -0
- data/vendor/eigen/Eigen/src/Core/VectorBlock.h +96 -0
- data/vendor/eigen/Eigen/src/Core/VectorwiseOp.h +695 -0
- data/vendor/eigen/Eigen/src/Core/Visitor.h +273 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX/Complex.h +451 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +439 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +637 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +51 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +391 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1316 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +430 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +322 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +1061 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/Complex.h +103 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/Half.h +674 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +91 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +333 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +1124 -0
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +212 -0
- data/vendor/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +29 -0
- data/vendor/eigen/Eigen/src/Core/arch/Default/Settings.h +49 -0
- data/vendor/eigen/Eigen/src/Core/arch/NEON/Complex.h +490 -0
- data/vendor/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +91 -0
- data/vendor/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +760 -0
- data/vendor/eigen/Eigen/src/Core/arch/SSE/Complex.h +471 -0
- data/vendor/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +562 -0
- data/vendor/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +895 -0
- data/vendor/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +77 -0
- data/vendor/eigen/Eigen/src/Core/arch/ZVector/Complex.h +397 -0
- data/vendor/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +137 -0
- data/vendor/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +945 -0
- data/vendor/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +168 -0
- data/vendor/eigen/Eigen/src/Core/functors/BinaryFunctors.h +475 -0
- data/vendor/eigen/Eigen/src/Core/functors/NullaryFunctors.h +188 -0
- data/vendor/eigen/Eigen/src/Core/functors/StlFunctors.h +136 -0
- data/vendor/eigen/Eigen/src/Core/functors/TernaryFunctors.h +25 -0
- data/vendor/eigen/Eigen/src/Core/functors/UnaryFunctors.h +792 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2156 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +492 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +311 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +145 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +122 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +619 -0
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +136 -0
- data/vendor/eigen/Eigen/src/Core/products/Parallelizer.h +163 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +521 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +287 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +260 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +118 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointProduct.h +133 -0
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +93 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +466 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +315 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +350 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +255 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +335 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +163 -0
- data/vendor/eigen/Eigen/src/Core/products/TriangularSolverVector.h +145 -0
- data/vendor/eigen/Eigen/src/Core/util/BlasUtil.h +398 -0
- data/vendor/eigen/Eigen/src/Core/util/Constants.h +547 -0
- data/vendor/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +83 -0
- data/vendor/eigen/Eigen/src/Core/util/ForwardDeclarations.h +302 -0
- data/vendor/eigen/Eigen/src/Core/util/MKL_support.h +130 -0
- data/vendor/eigen/Eigen/src/Core/util/Macros.h +1001 -0
- data/vendor/eigen/Eigen/src/Core/util/Memory.h +993 -0
- data/vendor/eigen/Eigen/src/Core/util/Meta.h +534 -0
- data/vendor/eigen/Eigen/src/Core/util/NonMPL2.h +3 -0
- data/vendor/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +27 -0
- data/vendor/eigen/Eigen/src/Core/util/StaticAssert.h +218 -0
- data/vendor/eigen/Eigen/src/Core/util/XprHelper.h +821 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +346 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +459 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +91 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/EigenSolver.h +622 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +418 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +226 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +374 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +158 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/RealQZ.h +654 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur.h +546 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +77 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +870 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +87 -0
- data/vendor/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +556 -0
- data/vendor/eigen/Eigen/src/Geometry/AlignedBox.h +392 -0
- data/vendor/eigen/Eigen/src/Geometry/AngleAxis.h +247 -0
- data/vendor/eigen/Eigen/src/Geometry/EulerAngles.h +114 -0
- data/vendor/eigen/Eigen/src/Geometry/Homogeneous.h +497 -0
- data/vendor/eigen/Eigen/src/Geometry/Hyperplane.h +282 -0
- data/vendor/eigen/Eigen/src/Geometry/OrthoMethods.h +234 -0
- data/vendor/eigen/Eigen/src/Geometry/ParametrizedLine.h +195 -0
- data/vendor/eigen/Eigen/src/Geometry/Quaternion.h +814 -0
- data/vendor/eigen/Eigen/src/Geometry/Rotation2D.h +199 -0
- data/vendor/eigen/Eigen/src/Geometry/RotationBase.h +206 -0
- data/vendor/eigen/Eigen/src/Geometry/Scaling.h +170 -0
- data/vendor/eigen/Eigen/src/Geometry/Transform.h +1542 -0
- data/vendor/eigen/Eigen/src/Geometry/Translation.h +208 -0
- data/vendor/eigen/Eigen/src/Geometry/Umeyama.h +166 -0
- data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +161 -0
- data/vendor/eigen/Eigen/src/Householder/BlockHouseholder.h +103 -0
- data/vendor/eigen/Eigen/src/Householder/Householder.h +172 -0
- data/vendor/eigen/Eigen/src/Householder/HouseholderSequence.h +470 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +226 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +228 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +246 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +400 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +462 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +394 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +216 -0
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +115 -0
- data/vendor/eigen/Eigen/src/Jacobi/Jacobi.h +462 -0
- data/vendor/eigen/Eigen/src/LU/Determinant.h +101 -0
- data/vendor/eigen/Eigen/src/LU/FullPivLU.h +891 -0
- data/vendor/eigen/Eigen/src/LU/InverseImpl.h +415 -0
- data/vendor/eigen/Eigen/src/LU/PartialPivLU.h +611 -0
- data/vendor/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +83 -0
- data/vendor/eigen/Eigen/src/LU/arch/Inverse_SSE.h +338 -0
- data/vendor/eigen/Eigen/src/MetisSupport/MetisSupport.h +137 -0
- data/vendor/eigen/Eigen/src/OrderingMethods/Amd.h +445 -0
- data/vendor/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +1843 -0
- data/vendor/eigen/Eigen/src/OrderingMethods/Ordering.h +157 -0
- data/vendor/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +678 -0
- data/vendor/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +543 -0
- data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR.h +653 -0
- data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +97 -0
- data/vendor/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +562 -0
- data/vendor/eigen/Eigen/src/QR/FullPivHouseholderQR.h +676 -0
- data/vendor/eigen/Eigen/src/QR/HouseholderQR.h +409 -0
- data/vendor/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +68 -0
- data/vendor/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +313 -0
- data/vendor/eigen/Eigen/src/SVD/BDCSVD.h +1246 -0
- data/vendor/eigen/Eigen/src/SVD/JacobiSVD.h +804 -0
- data/vendor/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +91 -0
- data/vendor/eigen/Eigen/src/SVD/SVDBase.h +315 -0
- data/vendor/eigen/Eigen/src/SVD/UpperBidiagonalization.h +414 -0
- data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +689 -0
- data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +199 -0
- data/vendor/eigen/Eigen/src/SparseCore/AmbiVector.h +377 -0
- data/vendor/eigen/Eigen/src/SparseCore/CompressedStorage.h +258 -0
- data/vendor/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +352 -0
- data/vendor/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +67 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseAssign.h +216 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseBlock.h +603 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseColEtree.h +206 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +341 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +726 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +148 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +320 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +138 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseDot.h +98 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseFuzzy.h +29 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseMap.h +305 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseMatrix.h +1403 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +405 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparsePermutation.h +178 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseProduct.h +169 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseRedux.h +49 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseRef.h +397 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +656 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseSolverBase.h +124 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +198 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseTranspose.h +92 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseTriangularView.h +189 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseUtil.h +178 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseVector.h +478 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseView.h +253 -0
- data/vendor/eigen/Eigen/src/SparseCore/TriangularSolver.h +315 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU.h +773 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLUImpl.h +66 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +226 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +110 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +301 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +80 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +181 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +179 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +107 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +280 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +126 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +130 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +223 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +258 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +137 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +136 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +83 -0
- data/vendor/eigen/Eigen/src/SparseQR/SparseQR.h +745 -0
- data/vendor/eigen/Eigen/src/StlSupport/StdDeque.h +126 -0
- data/vendor/eigen/Eigen/src/StlSupport/StdList.h +106 -0
- data/vendor/eigen/Eigen/src/StlSupport/StdVector.h +131 -0
- data/vendor/eigen/Eigen/src/StlSupport/details.h +84 -0
- data/vendor/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +1027 -0
- data/vendor/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +506 -0
- data/vendor/eigen/Eigen/src/misc/Image.h +82 -0
- data/vendor/eigen/Eigen/src/misc/Kernel.h +79 -0
- data/vendor/eigen/Eigen/src/misc/RealSvd2x2.h +55 -0
- data/vendor/eigen/Eigen/src/misc/blas.h +440 -0
- data/vendor/eigen/Eigen/src/misc/lapack.h +152 -0
- data/vendor/eigen/Eigen/src/misc/lapacke.h +16291 -0
- data/vendor/eigen/Eigen/src/misc/lapacke_mangling.h +17 -0
- data/vendor/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +332 -0
- data/vendor/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +552 -0
- data/vendor/eigen/Eigen/src/plugins/BlockMethods.h +1058 -0
- data/vendor/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +115 -0
- data/vendor/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +163 -0
- data/vendor/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +152 -0
- data/vendor/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +85 -0
- data/vendor/eigen/README.md +3 -0
- data/vendor/eigen/bench/README.txt +55 -0
- data/vendor/eigen/bench/btl/COPYING +340 -0
- data/vendor/eigen/bench/btl/README +154 -0
- data/vendor/eigen/bench/tensors/README +21 -0
- data/vendor/eigen/blas/README.txt +6 -0
- data/vendor/eigen/demos/mandelbrot/README +10 -0
- data/vendor/eigen/demos/mix_eigen_and_c/README +9 -0
- data/vendor/eigen/demos/opengl/README +13 -0
- data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +1760 -0
- data/vendor/eigen/unsupported/README.txt +50 -0
- data/vendor/tomotopy/LICENSE +21 -0
- data/vendor/tomotopy/README.kr.rst +375 -0
- data/vendor/tomotopy/README.rst +382 -0
- data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +362 -0
- data/vendor/tomotopy/src/Labeling/FoRelevance.h +88 -0
- data/vendor/tomotopy/src/Labeling/Labeler.h +50 -0
- data/vendor/tomotopy/src/TopicModel/CT.h +37 -0
- data/vendor/tomotopy/src/TopicModel/CTModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/CTModel.hpp +293 -0
- data/vendor/tomotopy/src/TopicModel/DMR.h +51 -0
- data/vendor/tomotopy/src/TopicModel/DMRModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +374 -0
- data/vendor/tomotopy/src/TopicModel/DT.h +65 -0
- data/vendor/tomotopy/src/TopicModel/DTM.h +22 -0
- data/vendor/tomotopy/src/TopicModel/DTModel.cpp +15 -0
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +572 -0
- data/vendor/tomotopy/src/TopicModel/GDMR.h +37 -0
- data/vendor/tomotopy/src/TopicModel/GDMRModel.cpp +14 -0
- data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +485 -0
- data/vendor/tomotopy/src/TopicModel/HDP.h +74 -0
- data/vendor/tomotopy/src/TopicModel/HDPModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +592 -0
- data/vendor/tomotopy/src/TopicModel/HLDA.h +40 -0
- data/vendor/tomotopy/src/TopicModel/HLDAModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +681 -0
- data/vendor/tomotopy/src/TopicModel/HPA.h +27 -0
- data/vendor/tomotopy/src/TopicModel/HPAModel.cpp +21 -0
- data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +588 -0
- data/vendor/tomotopy/src/TopicModel/LDA.h +144 -0
- data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +442 -0
- data/vendor/tomotopy/src/TopicModel/LDAModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +1058 -0
- data/vendor/tomotopy/src/TopicModel/LLDA.h +45 -0
- data/vendor/tomotopy/src/TopicModel/LLDAModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +203 -0
- data/vendor/tomotopy/src/TopicModel/MGLDA.h +63 -0
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.cpp +17 -0
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +558 -0
- data/vendor/tomotopy/src/TopicModel/PA.h +43 -0
- data/vendor/tomotopy/src/TopicModel/PAModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/PAModel.hpp +467 -0
- data/vendor/tomotopy/src/TopicModel/PLDA.h +17 -0
- data/vendor/tomotopy/src/TopicModel/PLDAModel.cpp +13 -0
- data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +214 -0
- data/vendor/tomotopy/src/TopicModel/SLDA.h +54 -0
- data/vendor/tomotopy/src/TopicModel/SLDAModel.cpp +17 -0
- data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +456 -0
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +692 -0
- data/vendor/tomotopy/src/Utils/AliasMethod.hpp +169 -0
- data/vendor/tomotopy/src/Utils/Dictionary.h +80 -0
- data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +181 -0
- data/vendor/tomotopy/src/Utils/LBFGS.h +202 -0
- data/vendor/tomotopy/src/Utils/LBFGS/LineSearchBacktracking.h +120 -0
- data/vendor/tomotopy/src/Utils/LBFGS/LineSearchBracketing.h +122 -0
- data/vendor/tomotopy/src/Utils/LBFGS/Param.h +213 -0
- data/vendor/tomotopy/src/Utils/LUT.hpp +82 -0
- data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +69 -0
- data/vendor/tomotopy/src/Utils/PolyaGamma.hpp +200 -0
- data/vendor/tomotopy/src/Utils/PolyaGammaHybrid.hpp +672 -0
- data/vendor/tomotopy/src/Utils/ThreadPool.hpp +150 -0
- data/vendor/tomotopy/src/Utils/Trie.hpp +220 -0
- data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +94 -0
- data/vendor/tomotopy/src/Utils/Utils.hpp +337 -0
- data/vendor/tomotopy/src/Utils/avx_gamma.h +46 -0
- data/vendor/tomotopy/src/Utils/avx_mathfun.h +736 -0
- data/vendor/tomotopy/src/Utils/exception.h +28 -0
- data/vendor/tomotopy/src/Utils/math.h +281 -0
- data/vendor/tomotopy/src/Utils/rtnorm.hpp +2690 -0
- data/vendor/tomotopy/src/Utils/sample.hpp +192 -0
- data/vendor/tomotopy/src/Utils/serializer.hpp +695 -0
- data/vendor/tomotopy/src/Utils/slp.hpp +131 -0
- data/vendor/tomotopy/src/Utils/sse_gamma.h +48 -0
- data/vendor/tomotopy/src/Utils/sse_mathfun.h +710 -0
- data/vendor/tomotopy/src/Utils/text.hpp +49 -0
- data/vendor/tomotopy/src/Utils/tvector.hpp +543 -0
- metadata +531 -0
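Nearly all of the volume in the manifest above is vendored C++: a full copy of Eigen (plus EigenRand) and the tomotopy topic-modeling sources that the thin Ruby layer in data/lib and data/ext binds to. The largest single file, data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h (+2156 lines), opens the diff below; its job is to choose GEMM blocking sizes (kc, nc, mc) so that the packed panels of the operands stay resident in the L1/L2/L3 caches. As a rough, self-contained sketch of that idea (this is not Eigen's actual API; the cache sizes and register-block factors here are assumed placeholders):

#include <algorithm>
#include <cstddef>
#include <cstdio>

// Illustrative sketch only (not Eigen's API): round a cache-derived block
// size down to a multiple of the register-level blocking factor, mirroring
// the kc/nc heuristic used in GeneralBlockPanelKernel.h.
static std::ptrdiff_t round_down(std::ptrdiff_t value, std::ptrdiff_t multiple)
{
  return std::max<std::ptrdiff_t>(multiple, value - value % multiple);
}

int main()
{
  const std::ptrdiff_t l1 = 32 * 1024;   // assumed L1 data cache, in bytes
  const std::ptrdiff_t l2 = 256 * 1024;  // assumed L2 cache, in bytes
  const std::ptrdiff_t mr = 12, nr = 4;  // assumed register-level block sizes
  const std::ptrdiff_t s  = sizeof(double);

  // kc: panel depth, chosen so an mr x kc lhs panel plus a kc x nr rhs panel
  // stay within L1 while the kernel sweeps over them.
  const std::ptrdiff_t kc = round_down(l1 / ((mr + nr) * s), 8);
  // nc: rhs columns per sweep, chosen so a kc x nc packed rhs block fits in
  // roughly half of L2 (the other half is left for the lhs and the result).
  const std::ptrdiff_t nc = round_down(l2 / (2 * kc * s), nr);

  std::printf("kc = %td, nc = %td\n", kc, nc);
  return 0;
}

The real heuristic in the file below additionally splits the work per thread, caps kc at 320 once prefetching no longer helps, and shrinks the last block so that the number of sweeps over the result stays constant.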
data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -0,0 +1,2156 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H


namespace Eigen {

namespace internal {

template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
class gebp_traits;


/** \internal \returns b if a<=0, and returns a otherwise. */
inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
{
  return a<=0 ? b : a;
}

#if EIGEN_ARCH_i386_OR_x86_64
const std::ptrdiff_t defaultL1CacheSize = 32*1024;
const std::ptrdiff_t defaultL2CacheSize = 256*1024;
const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
#else
const std::ptrdiff_t defaultL1CacheSize = 16*1024;
const std::ptrdiff_t defaultL2CacheSize = 512*1024;
const std::ptrdiff_t defaultL3CacheSize = 512*1024;
#endif

/** \internal */
struct CacheSizes {
  CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
    int l1CacheSize, l2CacheSize, l3CacheSize;
    queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
    m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
    m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
    m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
  }

  std::ptrdiff_t m_l1;
  std::ptrdiff_t m_l2;
  std::ptrdiff_t m_l3;
};


/** \internal */
inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
{
  static CacheSizes m_cacheSizes;

  if(action==SetAction)
  {
    // set the cpu cache size and cache all block sizes from a global cache size in byte
    eigen_internal_assert(l1!=0 && l2!=0);
    m_cacheSizes.m_l1 = *l1;
    m_cacheSizes.m_l2 = *l2;
    m_cacheSizes.m_l3 = *l3;
  }
  else if(action==GetAction)
  {
    eigen_internal_assert(l1!=0 && l2!=0);
    *l1 = m_cacheSizes.m_l1;
    *l2 = m_cacheSizes.m_l2;
    *l3 = m_cacheSizes.m_l3;
  }
  else
  {
    eigen_internal_assert(false);
  }
}

/* Helper for computeProductBlockingSizes.
 *
 * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
 * this function computes the blocking size parameters along the respective dimensions
 * for matrix products and related algorithms. The blocking sizes depends on various
 * parameters:
 * - the L1 and L2 cache sizes,
 * - the register level blocking sizes defined by gebp_traits,
 * - the number of scalars that fit into a packet (when vectorization is enabled).
 *
 * \sa setCpuCacheSizes */

template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  typedef gebp_traits<LhsScalar,RhsScalar> Traits;

  // Explanations:
  // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
  // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed
  // per mr x kc horizontal small panels where mr is the blocking size along the m dimension
  // at the register level. This small horizontal panel has to stay within L1 cache.
  std::ptrdiff_t l1, l2, l3;
  manage_caching_sizes(GetAction, &l1, &l2, &l3);

  if (num_threads > 1) {
    typedef typename Traits::ResScalar ResScalar;
    enum {
      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
      ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
      kr = 8,
      mr = Traits::mr,
      nr = Traits::nr
    };
    // Increasing k gives us more time to prefetch the content of the "C"
    // registers. However once the latency is hidden there is no point in
    // increasing the value of k, so we'll cap it at 320 (value determined
    // experimentally).
    const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
    if (k_cache < k) {
      k = k_cache - (k_cache % kr);
      eigen_internal_assert(k > 0);
    }

    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
    const Index n_per_thread = numext::div_ceil(n, num_threads);
    if (n_cache <= n_per_thread) {
      // Don't exceed the capacity of the l2 cache.
      eigen_internal_assert(n_cache >= static_cast<Index>(nr));
      n = n_cache - (n_cache % nr);
      eigen_internal_assert(n > 0);
    } else {
      n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
    }

    if (l3 > l2) {
      // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
      const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
      const Index m_per_thread = numext::div_ceil(m, num_threads);
      if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
        m = m_cache - (m_cache % mr);
        eigen_internal_assert(m > 0);
      } else {
        m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
      }
    }
  }
  else {
    // In unit tests we do not want to use extra large matrices,
    // so we reduce the cache size to check the blocking strategy is not flawed
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    l1 = 9*1024;
    l2 = 32*1024;
    l3 = 512*1024;
#endif

    // Early return for small problems because the computation below are time consuming for small problems.
    // Perhaps it would make more sense to consider k*n*m??
    // Note that for very tiny problem, this function should be bypassed anyway
    // because we use the coefficient-based implementation for them.
    if((numext::maxi)(k,(numext::maxi)(m,n))<48)
      return;

    typedef typename Traits::ResScalar ResScalar;
    enum {
      k_peeling = 8,
      k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
      k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
    };

    // ---- 1st level of blocking on L1, yields kc ----

    // Blocking on the third dimension (i.e., k) is chosen so that an horizontal panel
    // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fits within L1 cache.
    // We also include a register-level block of the result (mx x nr).
    // (In an ideal world only the lhs panel would stay in L1)
    // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
    const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
    const Index old_k = k;
    if(k>max_kc)
    {
      // We are really blocking on the third dimension:
      // -> reduce blocking size to make sure the last block is as large as possible
      //    while keeping the same number of sweeps over the result.
      k = (k%max_kc)==0 ? max_kc
                        : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));

      eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
    }

    // ---- 2nd level of blocking on max(L2,L3), yields nc ----

    // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
    //      actual_l2 = max(l2, l3/nb_core_sharing_l3)
    // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it)
    // For instance, it corresponds to 6MB of L3 shared among 4 cores.
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    const Index actual_l2 = l3;
#else
    const Index actual_l2 = 1572864; // == 1.5 MB
#endif

    // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2.
    // The second half is implicitly reserved to access the result and lhs coefficients.
    // When k<max_kc, then nc can arbitrarily growth. In practice, it seems to be fruitful
    // to limit this growth: we bound nc to growth by a factor x1.5.
    // However, if the entire lhs block fit within L1, then we are not going to block on the rows at all,
    // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
    Index max_nc;
    const Index lhs_bytes = m * k * sizeof(LhsScalar);
    const Index remaining_l1 = l1- k_sub - lhs_bytes;
    if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
    {
      // L1 blocking
      max_nc = remaining_l1 / (k*sizeof(RhsScalar));
    }
    else
    {
      // L2 blocking
      max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
    }
    // WARNING Below, we assume that Traits::nr is a power of two.
    Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
    if(n>nc)
    {
      // We are really blocking over the columns:
      // -> reduce blocking size to make sure the last block is as large as possible
      //    while keeping the same number of sweeps over the packed lhs.
      // Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
      n = (n%nc)==0 ? nc
                    : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
    }
    else if(old_k==k)
    {
      // So far, no blocking at all, i.e., kc==k, and nc==n.
      // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
      // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
      Index problem_size = k*n*sizeof(LhsScalar);
      Index actual_lm = actual_l2;
      Index max_mc = m;
      if(problem_size<=1024)
      {
        // problem is small enough to keep in L1
        // Let's choose m such that lhs's block fit in 1/3 of L1
        actual_lm = l1;
      }
      else if(l3!=0 && problem_size<=32768)
      {
        // we have both L2 and L3, and problem is small enough to be kept in L2
        // Let's choose m such that lhs's block fit in 1/3 of L2
        actual_lm = l2;
        max_mc = (numext::mini<Index>)(576,max_mc);
      }
      Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
      if (mc > Traits::mr) mc -= mc % Traits::mr;
      else if (mc==0) return;
      m = (m%mc)==0 ? mc
                    : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
    }
  }
}

template <typename Index>
inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
{
#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
    k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
    m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
    n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
    return true;
  }
#else
  EIGEN_UNUSED_VARIABLE(k)
  EIGEN_UNUSED_VARIABLE(m)
  EIGEN_UNUSED_VARIABLE(n)
#endif
  return false;
}

/** \brief Computes the blocking parameters for a m x k times k x n matrix product
 *
 * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
 * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
 * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
 *
 * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
 * this function computes the blocking size parameters along the respective dimensions
 * for matrix products and related algorithms.
 *
 * The blocking size parameters may be evaluated:
 *   - either by a heuristic based on cache sizes;
 *   - or using fixed prescribed values (for testing purposes).
 *
 * \sa setCpuCacheSizes */

template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  if (!useSpecificBlockingSizes(k, m, n)) {
    evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
  }
}

template<typename LhsScalar, typename RhsScalar, typename Index>
inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
}

#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
  #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C);
#else

  // FIXME (a bit overkill maybe ?)

  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
    {
      c = cj.pmadd(a,b,c);
    }
  };

  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
    {
      t = b; t = cj.pmul(a,t); c = padd(c,t);
    }
  };

  template<typename CJ, typename A, typename B, typename C, typename T>
  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
  {
    gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
  }

  #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T);
//  #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T);
#endif

/* Vectorization logic
 *  real*real: unpack rhs to constant packets, ...
 *
 *  cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
 *          storing each res packet into two packets (2x2),
 *          at the end combine them: swap the second and addsub them
 *  cf*cf : same but with 2x4 blocks
 *  cplx*real : unpack rhs to constant packets, ...
 *  real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
 */
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits
{
public:
  typedef _LhsScalar LhsScalar;
  typedef _RhsScalar RhsScalar;
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,

    // register block size along the N direction must be 1 or 4
    nr = 4,

    // register block size along the M direction (currently, this one cannot be modified)
    default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
    // we assume 16 registers
    // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
    // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
    mr = Vectorizable ? 3*LhsPacketSize : default_mr,
#else
    mr = default_mr,
#endif

    LhsProgress = LhsPacketSize,
    RhsProgress = 1
  };

  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
  typedef typename packet_traits<ResScalar>::type  _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
  {
    pbroadcast4(b, b0, b1, b2, b3);
  }

//  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
//  {
//    pbroadcast2(b, b0, b1);
//  }

  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
  {
    dest = pset1<RhsPacketType>(*b);
  }

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = ploadquad<RhsPacket>(b);
  }

  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
  {
    dest = pload<LhsPacketType>(a);
  }

  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
  {
    dest = ploadu<LhsPacketType>(a);
  }

  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
  {
    conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
    // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
    // let gcc allocate the register in which to store the result of the pmul
    // (in the case where there is no FMA) gcc fails to figure out how to avoid
    // spilling register.
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c = cj.pmadd(a,b,c);
#else
    tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
#endif
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = pmadd(c,alpha,r);
  }

  template<typename ResPacketHalf>
  EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
  {
    r = pmadd(c,alpha,r);
  }

};

template<typename RealScalar, bool _ConjLhs>
class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
{
public:
  typedef std::complex<RealScalar> LhsScalar;
  typedef RealScalar RhsScalar;
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = false,
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    nr = 4,
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
    // we assume 16 registers
    mr = 3*LhsPacketSize,
#else
    mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
#endif

    LhsProgress = LhsPacketSize,
    RhsProgress = 1
  };

  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
  typedef typename packet_traits<ResScalar>::type  _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pset1<RhsPacket>(*b);
  }

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pset1<RhsPacket>(*b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploadu<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
  {
    pbroadcast4(b, b0, b1, b2, b3);
  }

//  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
//  {
//    pbroadcast2(b, b0, b1);
//  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c.v = pmadd(a.v,b,c.v);
#else
    tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
#endif
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
  {
    c += a * b;
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(c,alpha,r);
  }

protected:
  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
};

template<typename Packet>
struct DoublePacket
{
  Packet first;
  Packet second;
};

template<typename Packet>
DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
{
  DoublePacket<Packet> res;
  res.first  = padd(a.first, b.first);
  res.second = padd(a.second,b.second);
  return res;
}

template<typename Packet>
const DoublePacket<Packet>& predux_downto4(const DoublePacket<Packet> &a)
{
  return a;
}

template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; };
// template<typename Packet>
// DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
// {
//   DoublePacket<Packet> res;
//   res.first  = padd(a.first, b.first);
//   res.second = padd(a.second,b.second);
//   return res;
// }

template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
{
public:
  typedef std::complex<RealScalar>  Scalar;
  typedef std::complex<RealScalar>  LhsScalar;
  typedef std::complex<RealScalar>  RhsScalar;
  typedef std::complex<RealScalar>  ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<RealScalar>::Vectorizable
                && packet_traits<Scalar>::Vectorizable,
    RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1,
    ResPacketSize  = Vectorizable ? packet_traits<ResScalar>::size : 1,
    LhsPacketSize  = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize  = Vectorizable ? packet_traits<RhsScalar>::size : 1,

    // FIXME: should depend on NumberOfRegisters
    nr = 4,
    mr = ResPacketSize,

    LhsProgress = ResPacketSize,
    RhsProgress = 1
  };

  typedef typename packet_traits<RealScalar>::type RealPacket;
  typedef typename packet_traits<Scalar>::type     ScalarPacket;
  typedef DoublePacket<RealPacket> DoublePacketType;

  typedef typename conditional<Vectorizable,RealPacket,      Scalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,ScalarPacket,    Scalar>::type ResPacket;
  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;

  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }

  EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
  {
    p.first  = pset1<RealPacket>(RealScalar(0));
    p.second = pset1<RealPacket>(RealScalar(0));
  }

  // Scalar path
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
  {
    dest = pset1<ResPacket>(*b);
  }

  // Vectorized path
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
  {
    dest.first  = pset1<RealPacket>(real(*b));
    dest.second = pset1<RealPacket>(imag(*b));
  }

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
  {
    loadRhs(b,dest);
  }
  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
  {
    eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4);
    loadRhs(b,dest);
  }

  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
  {
    // FIXME not sure that's the best way to implement it!
    loadRhs(b+0, b0);
    loadRhs(b+1, b1);
    loadRhs(b+2, b2);
    loadRhs(b+3, b3);
  }

  // Vectorized path
  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
  {
    // FIXME not sure that's the best way to implement it!
    loadRhs(b+0, b0);
    loadRhs(b+1, b1);
  }

  // Scalar path
  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
  {
    // FIXME not sure that's the best way to implement it!
    loadRhs(b+0, b0);
    loadRhs(b+1, b1);
  }

  // nothing special here
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
  }

  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const
  {
    c.first  = padd(pmul(a,b.first), c.first);
    c.second = padd(pmul(a,b.second),c.second);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
  {
    c = cj.pmadd(a,b,c);
  }

  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }

  EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
  {
    // assemble c
    ResPacket tmp;
    if((!ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(pconj(ResPacket(c.second)));
      tmp = padd(ResPacket(c.first),tmp);
    }
    else if((!ConjLhs)&&(ConjRhs))
    {
      tmp = pconj(pcplxflip(ResPacket(c.second)));
      tmp = padd(ResPacket(c.first),tmp);
    }
    else if((ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(ResPacket(c.second));
      tmp = padd(pconj(ResPacket(c.first)),tmp);
    }
    else if((ConjLhs)&&(ConjRhs))
    {
      tmp = pcplxflip(ResPacket(c.second));
      tmp = psub(pconj(ResPacket(c.first)),tmp);
    }

    r = pmadd(tmp,alpha,r);
  }

protected:
  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
};

template<typename RealScalar, bool _ConjRhs>
class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
{
public:
  typedef std::complex<RealScalar>  Scalar;
  typedef RealScalar  LhsScalar;
  typedef Scalar      RhsScalar;
  typedef Scalar      ResScalar;

  enum {
    ConjLhs = false,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<RealScalar>::Vectorizable
                && packet_traits<Scalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    // FIXME: should depend on NumberOfRegisters
    nr = 4,
    mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,

    LhsProgress = ResPacketSize,
    RhsProgress = 1
  };

  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
  typedef typename packet_traits<ResScalar>::type  _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pset1<RhsPacket>(*b);
  }

  void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
  {
    pbroadcast4(b, b0, b1, b2, b3);
  }

//  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
//  {
//    // FIXME not sure that's the best way to implement it!
//    b0 = pload1<RhsPacket>(b+0);
//    b1 = pload1<RhsPacket>(b+1);
//  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploaddup<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4);
    loadRhs(b,dest);
  }

  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploaddup<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
|
830
|
+
c.v = pmadd(a,b.v,c.v);
|
831
|
+
#else
|
832
|
+
tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
|
833
|
+
#endif
|
834
|
+
|
835
|
+
}
|
836
|
+
|
837
|
+
EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
|
838
|
+
{
|
839
|
+
c += a * b;
|
840
|
+
}
|
841
|
+
|
842
|
+
EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
|
843
|
+
{
|
844
|
+
r = cj.pmadd(alpha,c,r);
|
845
|
+
}
|
846
|
+
|
847
|
+
protected:
|
848
|
+
conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
|
849
|
+
};
|
850
|
+
|
851
|
+
/* optimized GEneral packed Block * packed Panel product kernel
 *
 * Mixing type logic: C += A * B
 *  |  A  |  B  | comments
 *  | real| cplx| no vectorization yet, would require to pack A with duplication
 *  | cplx| real| easy vectorization
 */
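/* Reference semantics of the kernel below (an illustrative sketch, not part of
 * the upstream source): ignoring the packed storage order, conjugation and the
 * SIMD packet types, gebp_kernel::operator() computes
 *
 *   for (Index i = 0; i < rows; ++i)
 *     for (Index j = 0; j < cols; ++j) {
 *       ResScalar acc(0);
 *       for (Index k = 0; k < depth; ++k)
 *         acc += A(i, k) * B(k, j);   // A, B read from the packed blocks blockA/blockB
 *       res(i, j) += alpha * acc;
 *     }
 *
 * Everything below is about performing this update with register blocking,
 * loop peeling and SIMD packets rather than a scalar triple loop.
 */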
|
858
|
+
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
|
859
|
+
struct gebp_kernel
|
860
|
+
{
|
861
|
+
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
|
862
|
+
typedef typename Traits::ResScalar ResScalar;
|
863
|
+
typedef typename Traits::LhsPacket LhsPacket;
|
864
|
+
typedef typename Traits::RhsPacket RhsPacket;
|
865
|
+
typedef typename Traits::ResPacket ResPacket;
|
866
|
+
typedef typename Traits::AccPacket AccPacket;
|
867
|
+
|
868
|
+
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
|
869
|
+
typedef typename SwappedTraits::ResScalar SResScalar;
|
870
|
+
typedef typename SwappedTraits::LhsPacket SLhsPacket;
|
871
|
+
typedef typename SwappedTraits::RhsPacket SRhsPacket;
|
872
|
+
typedef typename SwappedTraits::ResPacket SResPacket;
|
873
|
+
typedef typename SwappedTraits::AccPacket SAccPacket;
|
874
|
+
|
875
|
+
typedef typename DataMapper::LinearMapper LinearMapper;
|
876
|
+
|
877
|
+
enum {
|
878
|
+
Vectorizable = Traits::Vectorizable,
|
879
|
+
LhsProgress = Traits::LhsProgress,
|
880
|
+
RhsProgress = Traits::RhsProgress,
|
881
|
+
ResPacketSize = Traits::ResPacketSize
|
882
|
+
};
|
883
|
+
|
884
|
+
EIGEN_DONT_INLINE
|
885
|
+
void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
|
886
|
+
Index rows, Index depth, Index cols, ResScalar alpha,
|
887
|
+
Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
|
888
|
+
};
|
889
|
+
|
890
|
+
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
|
891
|
+
EIGEN_DONT_INLINE
|
892
|
+
void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
|
893
|
+
::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
|
894
|
+
Index rows, Index depth, Index cols, ResScalar alpha,
|
895
|
+
Index strideA, Index strideB, Index offsetA, Index offsetB)
|
896
|
+
{
|
897
|
+
Traits traits;
|
898
|
+
SwappedTraits straits;
|
899
|
+
|
900
|
+
if(strideA==-1) strideA = depth;
|
901
|
+
if(strideB==-1) strideB = depth;
|
902
|
+
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
|
903
|
+
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
const Index peeled_kc = depth & ~(pk-1);
const Index prefetch_res_offset = 32/sizeof(ResScalar);
// const Index depth2 = depth & ~1;
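// Worked example for the peeling bounds above (illustrative numbers only,
// assuming LhsProgress==8, mr==24, nr==4, rows==100, depth==50, cols==10):
//   peeled_mc3   = (100/24)*24 = 96        -> rows [0,96) use the 3*LhsProgress path
//   peeled_mc2   = 96 + ((100-96)/16)*16 = 96
//   peeled_mc1   = (100/8)*8 = 96          -> rows [96,100) fall through to the scalar path
//   packet_cols4 = (10/4)*4 = 8            -> columns 8 and 9 are handled one at a time
//   peeled_kc    = 50 & ~7 = 48            -> k advances in chunks of pk==8 up to 48,
//                                             the last two depth steps use the tail loops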
|
911
|
+
|
912
|
+
//---------- Process 3 * LhsProgress rows at once ----------
|
913
|
+
// This corresponds to 3*LhsProgress x nr register blocks.
|
914
|
+
// Usually, this makes sense only with FMA
|
915
|
+
if(mr>=3*Traits::LhsProgress)
|
916
|
+
{
|
917
|
+
// Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
// and on each largest micro vertical panel of the rhs (depth * nr).
// Blocking sizes, i.e., 'depth', have been computed so that the micro horizontal panel of the lhs fits in L1.
// However, if depth is too small, we can extend the number of rows of these horizontal panels.
// This actual number of rows is computed as follows:
const Index l1 = defaultL1CacheSize; // in bytes; TODO: l1 should be passed to this function.
// The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
// suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
// or because we are testing specific blocking sizes.
const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
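// Worked example (illustrative only; the sizes below are assumptions, not values
// taken from this build): with float, LhsProgress==8 (so 3*LhsProgress==24),
// mr==24, nr==4, depth==256 and l1==32768 bytes:
//   32768 - 4*24*4 - 256*4*4 = 28288          (L1 budget left for lhs panels)
//   256*4*24 = 24576                          (bytes per 24-row lhs panel)
//   28288 / 24576 = 1 (integer division), so actual_panel_rows = 24*max(1,1) = 24,
// i.e. with such a deep block only one 3*LhsProgress-row panel fits in L1 at a
// time; a smaller depth would let several panels be swept per pass.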
|
927
|
+
for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
|
928
|
+
{
|
929
|
+
const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
|
930
|
+
for(Index j2=0; j2<packet_cols4; j2+=nr)
|
931
|
+
{
|
932
|
+
for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
|
933
|
+
{
|
934
|
+
|
935
|
+
// We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
|
936
|
+
// stored into 3 x nr registers.
|
937
|
+
|
938
|
+
const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
|
939
|
+
prefetch(&blA[0]);
|
940
|
+
|
941
|
+
// gets res block as register
|
942
|
+
AccPacket C0, C1, C2, C3,
|
943
|
+
C4, C5, C6, C7,
|
944
|
+
C8, C9, C10, C11;
|
945
|
+
traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
|
946
|
+
traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
|
947
|
+
traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
|
948
|
+
|
949
|
+
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
|
950
|
+
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
|
951
|
+
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
|
952
|
+
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
|
953
|
+
|
954
|
+
r0.prefetch(0);
|
955
|
+
r1.prefetch(0);
|
956
|
+
r2.prefetch(0);
|
957
|
+
r3.prefetch(0);
|
958
|
+
|
959
|
+
// performs "inner" products
|
960
|
+
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
|
961
|
+
prefetch(&blB[0]);
|
962
|
+
LhsPacket A0, A1;
|
963
|
+
|
964
|
+
for(Index k=0; k<peeled_kc; k+=pk)
|
965
|
+
{
|
966
|
+
EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
|
967
|
+
RhsPacket B_0, T0;
|
968
|
+
LhsPacket A2;
|
969
|
+
|
970
|
+
#define EIGEN_GEBP_ONESTEP(K) \
|
971
|
+
do { \
|
972
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
|
973
|
+
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
|
974
|
+
internal::prefetch(blA+(3*K+16)*LhsProgress); \
|
975
|
+
if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
|
976
|
+
traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
|
977
|
+
traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
|
978
|
+
traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
|
979
|
+
traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
|
980
|
+
traits.madd(A0, B_0, C0, T0); \
|
981
|
+
traits.madd(A1, B_0, C4, T0); \
|
982
|
+
traits.madd(A2, B_0, C8, B_0); \
|
983
|
+
traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
|
984
|
+
traits.madd(A0, B_0, C1, T0); \
|
985
|
+
traits.madd(A1, B_0, C5, T0); \
|
986
|
+
traits.madd(A2, B_0, C9, B_0); \
|
987
|
+
traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
|
988
|
+
traits.madd(A0, B_0, C2, T0); \
|
989
|
+
traits.madd(A1, B_0, C6, T0); \
|
990
|
+
traits.madd(A2, B_0, C10, B_0); \
|
991
|
+
traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
|
992
|
+
traits.madd(A0, B_0, C3 , T0); \
|
993
|
+
traits.madd(A1, B_0, C7, T0); \
|
994
|
+
traits.madd(A2, B_0, C11, B_0); \
|
995
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
|
996
|
+
} while(false)
|
997
|
+
|
998
|
+
internal::prefetch(blB);
|
999
|
+
EIGEN_GEBP_ONESTEP(0);
|
1000
|
+
EIGEN_GEBP_ONESTEP(1);
|
1001
|
+
EIGEN_GEBP_ONESTEP(2);
|
1002
|
+
EIGEN_GEBP_ONESTEP(3);
|
1003
|
+
EIGEN_GEBP_ONESTEP(4);
|
1004
|
+
EIGEN_GEBP_ONESTEP(5);
|
1005
|
+
EIGEN_GEBP_ONESTEP(6);
|
1006
|
+
EIGEN_GEBP_ONESTEP(7);
|
1007
|
+
|
1008
|
+
blB += pk*4*RhsProgress;
|
1009
|
+
blA += pk*3*Traits::LhsProgress;
|
1010
|
+
|
1011
|
+
EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
|
1012
|
+
}
|
1013
|
+
// process remaining peeled loop
|
1014
|
+
for(Index k=peeled_kc; k<depth; k++)
|
1015
|
+
{
|
1016
|
+
RhsPacket B_0, T0;
|
1017
|
+
LhsPacket A2;
|
1018
|
+
EIGEN_GEBP_ONESTEP(0);
|
1019
|
+
blB += 4*RhsProgress;
|
1020
|
+
blA += 3*Traits::LhsProgress;
|
1021
|
+
}
|
1022
|
+
|
1023
|
+
#undef EIGEN_GEBP_ONESTEP
|
1024
|
+
|
1025
|
+
ResPacket R0, R1, R2;
|
1026
|
+
ResPacket alphav = pset1<ResPacket>(alpha);
|
1027
|
+
|
1028
|
+
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
1029
|
+
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
|
1030
|
+
R2 = r0.loadPacket(2 * Traits::ResPacketSize);
|
1031
|
+
traits.acc(C0, alphav, R0);
|
1032
|
+
traits.acc(C4, alphav, R1);
|
1033
|
+
traits.acc(C8, alphav, R2);
|
1034
|
+
r0.storePacket(0 * Traits::ResPacketSize, R0);
|
1035
|
+
r0.storePacket(1 * Traits::ResPacketSize, R1);
|
1036
|
+
r0.storePacket(2 * Traits::ResPacketSize, R2);
|
1037
|
+
|
1038
|
+
R0 = r1.loadPacket(0 * Traits::ResPacketSize);
|
1039
|
+
R1 = r1.loadPacket(1 * Traits::ResPacketSize);
|
1040
|
+
R2 = r1.loadPacket(2 * Traits::ResPacketSize);
|
1041
|
+
traits.acc(C1, alphav, R0);
|
1042
|
+
traits.acc(C5, alphav, R1);
|
1043
|
+
traits.acc(C9, alphav, R2);
|
1044
|
+
r1.storePacket(0 * Traits::ResPacketSize, R0);
|
1045
|
+
r1.storePacket(1 * Traits::ResPacketSize, R1);
|
1046
|
+
r1.storePacket(2 * Traits::ResPacketSize, R2);
|
1047
|
+
|
1048
|
+
R0 = r2.loadPacket(0 * Traits::ResPacketSize);
|
1049
|
+
R1 = r2.loadPacket(1 * Traits::ResPacketSize);
|
1050
|
+
R2 = r2.loadPacket(2 * Traits::ResPacketSize);
|
1051
|
+
traits.acc(C2, alphav, R0);
|
1052
|
+
traits.acc(C6, alphav, R1);
|
1053
|
+
traits.acc(C10, alphav, R2);
|
1054
|
+
r2.storePacket(0 * Traits::ResPacketSize, R0);
|
1055
|
+
r2.storePacket(1 * Traits::ResPacketSize, R1);
|
1056
|
+
r2.storePacket(2 * Traits::ResPacketSize, R2);
|
1057
|
+
|
1058
|
+
R0 = r3.loadPacket(0 * Traits::ResPacketSize);
|
1059
|
+
R1 = r3.loadPacket(1 * Traits::ResPacketSize);
|
1060
|
+
R2 = r3.loadPacket(2 * Traits::ResPacketSize);
|
1061
|
+
traits.acc(C3, alphav, R0);
|
1062
|
+
traits.acc(C7, alphav, R1);
|
1063
|
+
traits.acc(C11, alphav, R2);
|
1064
|
+
r3.storePacket(0 * Traits::ResPacketSize, R0);
|
1065
|
+
r3.storePacket(1 * Traits::ResPacketSize, R1);
|
1066
|
+
r3.storePacket(2 * Traits::ResPacketSize, R2);
|
1067
|
+
}
|
1068
|
+
}
|
1069
|
+
|
1070
|
+
// Deal with remaining columns of the rhs
|
1071
|
+
for(Index j2=packet_cols4; j2<cols; j2++)
|
1072
|
+
{
|
1073
|
+
for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
|
1074
|
+
{
|
1075
|
+
// One column at a time
|
1076
|
+
const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
|
1077
|
+
prefetch(&blA[0]);
|
1078
|
+
|
1079
|
+
// gets res block as register
|
1080
|
+
AccPacket C0, C4, C8;
|
1081
|
+
traits.initAcc(C0);
|
1082
|
+
traits.initAcc(C4);
|
1083
|
+
traits.initAcc(C8);
|
1084
|
+
|
1085
|
+
LinearMapper r0 = res.getLinearMapper(i, j2);
|
1086
|
+
r0.prefetch(0);
|
1087
|
+
|
1088
|
+
// performs "inner" products
|
1089
|
+
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
|
1090
|
+
LhsPacket A0, A1, A2;
|
1091
|
+
|
1092
|
+
for(Index k=0; k<peeled_kc; k+=pk)
|
1093
|
+
{
|
1094
|
+
EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
|
1095
|
+
RhsPacket B_0;
|
1096
|
+
#define EIGEN_GEBGP_ONESTEP(K) \
|
1097
|
+
do { \
|
1098
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
|
1099
|
+
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
|
1100
|
+
traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
|
1101
|
+
traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
|
1102
|
+
traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
|
1103
|
+
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
|
1104
|
+
traits.madd(A0, B_0, C0, B_0); \
|
1105
|
+
traits.madd(A1, B_0, C4, B_0); \
|
1106
|
+
traits.madd(A2, B_0, C8, B_0); \
|
1107
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
|
1108
|
+
} while(false)
|
1109
|
+
|
1110
|
+
EIGEN_GEBGP_ONESTEP(0);
|
1111
|
+
EIGEN_GEBGP_ONESTEP(1);
|
1112
|
+
EIGEN_GEBGP_ONESTEP(2);
|
1113
|
+
EIGEN_GEBGP_ONESTEP(3);
|
1114
|
+
EIGEN_GEBGP_ONESTEP(4);
|
1115
|
+
EIGEN_GEBGP_ONESTEP(5);
|
1116
|
+
EIGEN_GEBGP_ONESTEP(6);
|
1117
|
+
EIGEN_GEBGP_ONESTEP(7);
|
1118
|
+
|
1119
|
+
blB += pk*RhsProgress;
|
1120
|
+
blA += pk*3*Traits::LhsProgress;
|
1121
|
+
|
1122
|
+
EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
|
1123
|
+
}
|
1124
|
+
|
1125
|
+
// process remaining peeled loop
|
1126
|
+
for(Index k=peeled_kc; k<depth; k++)
|
1127
|
+
{
|
1128
|
+
RhsPacket B_0;
|
1129
|
+
EIGEN_GEBGP_ONESTEP(0);
|
1130
|
+
blB += RhsProgress;
|
1131
|
+
blA += 3*Traits::LhsProgress;
|
1132
|
+
}
|
1133
|
+
#undef EIGEN_GEBGP_ONESTEP
|
1134
|
+
ResPacket R0, R1, R2;
|
1135
|
+
ResPacket alphav = pset1<ResPacket>(alpha);
|
1136
|
+
|
1137
|
+
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
1138
|
+
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
|
1139
|
+
R2 = r0.loadPacket(2 * Traits::ResPacketSize);
|
1140
|
+
traits.acc(C0, alphav, R0);
|
1141
|
+
traits.acc(C4, alphav, R1);
|
1142
|
+
traits.acc(C8, alphav, R2);
|
1143
|
+
r0.storePacket(0 * Traits::ResPacketSize, R0);
|
1144
|
+
r0.storePacket(1 * Traits::ResPacketSize, R1);
|
1145
|
+
r0.storePacket(2 * Traits::ResPacketSize, R2);
|
1146
|
+
}
|
1147
|
+
}
|
1148
|
+
}
|
1149
|
+
}
|
1150
|
+
|
1151
|
+
//---------- Process 2 * LhsProgress rows at once ----------
|
1152
|
+
if(mr>=2*Traits::LhsProgress)
|
1153
|
+
{
|
1154
|
+
const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
|
1155
|
+
// The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
|
1156
|
+
// suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
|
1157
|
+
// or because we are testing specific blocking sizes.
|
1158
|
+
Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
|
1159
|
+
|
1160
|
+
for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
|
1161
|
+
{
|
1162
|
+
Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
|
1163
|
+
for(Index j2=0; j2<packet_cols4; j2+=nr)
|
1164
|
+
{
|
1165
|
+
for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
|
1166
|
+
{
|
1167
|
+
|
1168
|
+
// We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
|
1169
|
+
// stored into 2 x nr registers.
|
1170
|
+
|
1171
|
+
const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
|
1172
|
+
prefetch(&blA[0]);
|
1173
|
+
|
1174
|
+
// gets res block as register
|
1175
|
+
AccPacket C0, C1, C2, C3,
|
1176
|
+
C4, C5, C6, C7;
|
1177
|
+
traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
|
1178
|
+
traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
|
1179
|
+
|
1180
|
+
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
|
1181
|
+
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
|
1182
|
+
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
|
1183
|
+
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
|
1184
|
+
|
1185
|
+
r0.prefetch(prefetch_res_offset);
|
1186
|
+
r1.prefetch(prefetch_res_offset);
|
1187
|
+
r2.prefetch(prefetch_res_offset);
|
1188
|
+
r3.prefetch(prefetch_res_offset);
|
1189
|
+
|
1190
|
+
// performs "inner" products
|
1191
|
+
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
|
1192
|
+
prefetch(&blB[0]);
|
1193
|
+
LhsPacket A0, A1;
|
1194
|
+
|
1195
|
+
for(Index k=0; k<peeled_kc; k+=pk)
|
1196
|
+
{
|
1197
|
+
EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
|
1198
|
+
RhsPacket B_0, B1, B2, B3, T0;
|
1199
|
+
|
1200
|
+
// NOTE: the begin/end asm comments below work around bug 935!
|
1201
|
+
// but they are not enough for gcc>=6 without FMA (bug 1637)
|
1202
|
+
#if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
|
1203
|
+
#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
|
1204
|
+
#else
|
1205
|
+
#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
|
1206
|
+
#endif
|
1207
|
+
#define EIGEN_GEBGP_ONESTEP(K) \
|
1208
|
+
do { \
|
1209
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
|
1210
|
+
traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
|
1211
|
+
traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
|
1212
|
+
traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
|
1213
|
+
traits.madd(A0, B_0, C0, T0); \
|
1214
|
+
traits.madd(A1, B_0, C4, B_0); \
|
1215
|
+
traits.madd(A0, B1, C1, T0); \
|
1216
|
+
traits.madd(A1, B1, C5, B1); \
|
1217
|
+
traits.madd(A0, B2, C2, T0); \
|
1218
|
+
traits.madd(A1, B2, C6, B2); \
|
1219
|
+
traits.madd(A0, B3, C3, T0); \
|
1220
|
+
traits.madd(A1, B3, C7, B3); \
|
1221
|
+
EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
|
1222
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
|
1223
|
+
} while(false)
|
1224
|
+
|
1225
|
+
internal::prefetch(blB+(48+0));
|
1226
|
+
EIGEN_GEBGP_ONESTEP(0);
|
1227
|
+
EIGEN_GEBGP_ONESTEP(1);
|
1228
|
+
EIGEN_GEBGP_ONESTEP(2);
|
1229
|
+
EIGEN_GEBGP_ONESTEP(3);
|
1230
|
+
internal::prefetch(blB+(48+16));
|
1231
|
+
EIGEN_GEBGP_ONESTEP(4);
|
1232
|
+
EIGEN_GEBGP_ONESTEP(5);
|
1233
|
+
EIGEN_GEBGP_ONESTEP(6);
|
1234
|
+
EIGEN_GEBGP_ONESTEP(7);
|
1235
|
+
|
1236
|
+
blB += pk*4*RhsProgress;
|
1237
|
+
blA += pk*(2*Traits::LhsProgress);
|
1238
|
+
|
1239
|
+
EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
|
1240
|
+
}
|
1241
|
+
// process remaining peeled loop
|
1242
|
+
for(Index k=peeled_kc; k<depth; k++)
|
1243
|
+
{
|
1244
|
+
RhsPacket B_0, B1, B2, B3, T0;
|
1245
|
+
EIGEN_GEBGP_ONESTEP(0);
|
1246
|
+
blB += 4*RhsProgress;
|
1247
|
+
blA += 2*Traits::LhsProgress;
|
1248
|
+
}
|
1249
|
+
#undef EIGEN_GEBGP_ONESTEP
|
1250
|
+
|
1251
|
+
ResPacket R0, R1, R2, R3;
|
1252
|
+
ResPacket alphav = pset1<ResPacket>(alpha);
|
1253
|
+
|
1254
|
+
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
1255
|
+
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
|
1256
|
+
R2 = r1.loadPacket(0 * Traits::ResPacketSize);
|
1257
|
+
R3 = r1.loadPacket(1 * Traits::ResPacketSize);
|
1258
|
+
traits.acc(C0, alphav, R0);
|
1259
|
+
traits.acc(C4, alphav, R1);
|
1260
|
+
traits.acc(C1, alphav, R2);
|
1261
|
+
traits.acc(C5, alphav, R3);
|
1262
|
+
r0.storePacket(0 * Traits::ResPacketSize, R0);
|
1263
|
+
r0.storePacket(1 * Traits::ResPacketSize, R1);
|
1264
|
+
r1.storePacket(0 * Traits::ResPacketSize, R2);
|
1265
|
+
r1.storePacket(1 * Traits::ResPacketSize, R3);
|
1266
|
+
|
1267
|
+
R0 = r2.loadPacket(0 * Traits::ResPacketSize);
|
1268
|
+
R1 = r2.loadPacket(1 * Traits::ResPacketSize);
|
1269
|
+
R2 = r3.loadPacket(0 * Traits::ResPacketSize);
|
1270
|
+
R3 = r3.loadPacket(1 * Traits::ResPacketSize);
|
1271
|
+
traits.acc(C2, alphav, R0);
|
1272
|
+
traits.acc(C6, alphav, R1);
|
1273
|
+
traits.acc(C3, alphav, R2);
|
1274
|
+
traits.acc(C7, alphav, R3);
|
1275
|
+
r2.storePacket(0 * Traits::ResPacketSize, R0);
|
1276
|
+
r2.storePacket(1 * Traits::ResPacketSize, R1);
|
1277
|
+
r3.storePacket(0 * Traits::ResPacketSize, R2);
|
1278
|
+
r3.storePacket(1 * Traits::ResPacketSize, R3);
|
1279
|
+
}
|
1280
|
+
}
|
1281
|
+
|
1282
|
+
// Deal with remaining columns of the rhs
|
1283
|
+
for(Index j2=packet_cols4; j2<cols; j2++)
|
1284
|
+
{
|
1285
|
+
for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
|
1286
|
+
{
|
1287
|
+
// One column at a time
|
1288
|
+
const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
|
1289
|
+
prefetch(&blA[0]);
|
1290
|
+
|
1291
|
+
// gets res block as register
|
1292
|
+
AccPacket C0, C4;
|
1293
|
+
traits.initAcc(C0);
|
1294
|
+
traits.initAcc(C4);
|
1295
|
+
|
1296
|
+
LinearMapper r0 = res.getLinearMapper(i, j2);
|
1297
|
+
r0.prefetch(prefetch_res_offset);
|
1298
|
+
|
1299
|
+
// performs "inner" products
|
1300
|
+
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
|
1301
|
+
LhsPacket A0, A1;
|
1302
|
+
|
1303
|
+
for(Index k=0; k<peeled_kc; k+=pk)
|
1304
|
+
{
|
1305
|
+
EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
|
1306
|
+
RhsPacket B_0, B1;
|
1307
|
+
|
1308
|
+
#define EIGEN_GEBGP_ONESTEP(K) \
|
1309
|
+
do { \
|
1310
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
|
1311
|
+
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
|
1312
|
+
traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
|
1313
|
+
traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
|
1314
|
+
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
|
1315
|
+
traits.madd(A0, B_0, C0, B1); \
|
1316
|
+
traits.madd(A1, B_0, C4, B_0); \
|
1317
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
|
1318
|
+
} while(false)
|
1319
|
+
|
1320
|
+
EIGEN_GEBGP_ONESTEP(0);
|
1321
|
+
EIGEN_GEBGP_ONESTEP(1);
|
1322
|
+
EIGEN_GEBGP_ONESTEP(2);
|
1323
|
+
EIGEN_GEBGP_ONESTEP(3);
|
1324
|
+
EIGEN_GEBGP_ONESTEP(4);
|
1325
|
+
EIGEN_GEBGP_ONESTEP(5);
|
1326
|
+
EIGEN_GEBGP_ONESTEP(6);
|
1327
|
+
EIGEN_GEBGP_ONESTEP(7);
|
1328
|
+
|
1329
|
+
blB += pk*RhsProgress;
|
1330
|
+
blA += pk*2*Traits::LhsProgress;
|
1331
|
+
|
1332
|
+
EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
|
1333
|
+
}
|
1334
|
+
|
1335
|
+
// process remaining peeled loop
|
1336
|
+
for(Index k=peeled_kc; k<depth; k++)
|
1337
|
+
{
|
1338
|
+
RhsPacket B_0, B1;
|
1339
|
+
EIGEN_GEBGP_ONESTEP(0);
|
1340
|
+
blB += RhsProgress;
|
1341
|
+
blA += 2*Traits::LhsProgress;
|
1342
|
+
}
|
1343
|
+
#undef EIGEN_GEBGP_ONESTEP
|
1344
|
+
ResPacket R0, R1;
|
1345
|
+
ResPacket alphav = pset1<ResPacket>(alpha);
|
1346
|
+
|
1347
|
+
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
1348
|
+
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
|
1349
|
+
traits.acc(C0, alphav, R0);
|
1350
|
+
traits.acc(C4, alphav, R1);
|
1351
|
+
r0.storePacket(0 * Traits::ResPacketSize, R0);
|
1352
|
+
r0.storePacket(1 * Traits::ResPacketSize, R1);
|
1353
|
+
}
|
1354
|
+
}
|
1355
|
+
}
|
1356
|
+
}
|
1357
|
+
//---------- Process 1 * LhsProgress rows at once ----------
|
1358
|
+
if(mr>=1*Traits::LhsProgress)
|
1359
|
+
{
|
1360
|
+
// loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
|
1361
|
+
for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
|
1362
|
+
{
|
1363
|
+
// loops on each largest micro vertical panel of rhs (depth * nr)
|
1364
|
+
for(Index j2=0; j2<packet_cols4; j2+=nr)
|
1365
|
+
{
|
1366
|
+
// We select a 1*Traits::LhsProgress x nr micro block of res which is entirely
|
1367
|
+
// stored into 1 x nr registers.
|
1368
|
+
|
1369
|
+
const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
|
1370
|
+
prefetch(&blA[0]);
|
1371
|
+
|
1372
|
+
// gets res block as register
|
1373
|
+
AccPacket C0, C1, C2, C3;
|
1374
|
+
traits.initAcc(C0);
|
1375
|
+
traits.initAcc(C1);
|
1376
|
+
traits.initAcc(C2);
|
1377
|
+
traits.initAcc(C3);
|
1378
|
+
|
1379
|
+
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
|
1380
|
+
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
|
1381
|
+
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
|
1382
|
+
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
|
1383
|
+
|
1384
|
+
r0.prefetch(prefetch_res_offset);
|
1385
|
+
r1.prefetch(prefetch_res_offset);
|
1386
|
+
r2.prefetch(prefetch_res_offset);
|
1387
|
+
r3.prefetch(prefetch_res_offset);
|
1388
|
+
|
1389
|
+
// performs "inner" products
|
1390
|
+
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
|
1391
|
+
prefetch(&blB[0]);
|
1392
|
+
LhsPacket A0;
|
1393
|
+
|
1394
|
+
for(Index k=0; k<peeled_kc; k+=pk)
|
1395
|
+
{
|
1396
|
+
EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
|
1397
|
+
RhsPacket B_0, B1, B2, B3;
|
1398
|
+
|
1399
|
+
#define EIGEN_GEBGP_ONESTEP(K) \
|
1400
|
+
do { \
|
1401
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
|
1402
|
+
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
|
1403
|
+
traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
|
1404
|
+
traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
|
1405
|
+
traits.madd(A0, B_0, C0, B_0); \
|
1406
|
+
traits.madd(A0, B1, C1, B1); \
|
1407
|
+
traits.madd(A0, B2, C2, B2); \
|
1408
|
+
traits.madd(A0, B3, C3, B3); \
|
1409
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
|
1410
|
+
} while(false)
|
1411
|
+
|
1412
|
+
internal::prefetch(blB+(48+0));
|
1413
|
+
EIGEN_GEBGP_ONESTEP(0);
|
1414
|
+
EIGEN_GEBGP_ONESTEP(1);
|
1415
|
+
EIGEN_GEBGP_ONESTEP(2);
|
1416
|
+
EIGEN_GEBGP_ONESTEP(3);
|
1417
|
+
internal::prefetch(blB+(48+16));
|
1418
|
+
EIGEN_GEBGP_ONESTEP(4);
|
1419
|
+
EIGEN_GEBGP_ONESTEP(5);
|
1420
|
+
EIGEN_GEBGP_ONESTEP(6);
|
1421
|
+
EIGEN_GEBGP_ONESTEP(7);
|
1422
|
+
|
1423
|
+
blB += pk*4*RhsProgress;
|
1424
|
+
blA += pk*1*LhsProgress;
|
1425
|
+
|
1426
|
+
EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
|
1427
|
+
}
|
1428
|
+
// process remaining peeled loop
|
1429
|
+
for(Index k=peeled_kc; k<depth; k++)
|
1430
|
+
{
|
1431
|
+
RhsPacket B_0, B1, B2, B3;
|
1432
|
+
EIGEN_GEBGP_ONESTEP(0);
|
1433
|
+
blB += 4*RhsProgress;
|
1434
|
+
blA += 1*LhsProgress;
|
1435
|
+
}
|
1436
|
+
#undef EIGEN_GEBGP_ONESTEP
|
1437
|
+
|
1438
|
+
ResPacket R0, R1;
|
1439
|
+
ResPacket alphav = pset1<ResPacket>(alpha);
|
1440
|
+
|
1441
|
+
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
1442
|
+
R1 = r1.loadPacket(0 * Traits::ResPacketSize);
|
1443
|
+
traits.acc(C0, alphav, R0);
|
1444
|
+
traits.acc(C1, alphav, R1);
|
1445
|
+
r0.storePacket(0 * Traits::ResPacketSize, R0);
|
1446
|
+
r1.storePacket(0 * Traits::ResPacketSize, R1);
|
1447
|
+
|
1448
|
+
R0 = r2.loadPacket(0 * Traits::ResPacketSize);
|
1449
|
+
R1 = r3.loadPacket(0 * Traits::ResPacketSize);
|
1450
|
+
traits.acc(C2, alphav, R0);
|
1451
|
+
traits.acc(C3, alphav, R1);
|
1452
|
+
r2.storePacket(0 * Traits::ResPacketSize, R0);
|
1453
|
+
r3.storePacket(0 * Traits::ResPacketSize, R1);
|
1454
|
+
}
|
1455
|
+
|
1456
|
+
// Deal with remaining columns of the rhs
|
1457
|
+
for(Index j2=packet_cols4; j2<cols; j2++)
|
1458
|
+
{
|
1459
|
+
// One column at a time
|
1460
|
+
const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
|
1461
|
+
prefetch(&blA[0]);
|
1462
|
+
|
1463
|
+
// gets res block as register
|
1464
|
+
AccPacket C0;
|
1465
|
+
traits.initAcc(C0);
|
1466
|
+
|
1467
|
+
LinearMapper r0 = res.getLinearMapper(i, j2);
|
1468
|
+
|
1469
|
+
// performs "inner" products
|
1470
|
+
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
|
1471
|
+
LhsPacket A0;
|
1472
|
+
|
1473
|
+
for(Index k=0; k<peeled_kc; k+=pk)
|
1474
|
+
{
|
1475
|
+
EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
|
1476
|
+
RhsPacket B_0;
|
1477
|
+
|
1478
|
+
#define EIGEN_GEBGP_ONESTEP(K) \
|
1479
|
+
do { \
|
1480
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
|
1481
|
+
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
|
1482
|
+
traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
|
1483
|
+
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
|
1484
|
+
traits.madd(A0, B_0, C0, B_0); \
|
1485
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
|
1486
|
+
} while(false);
|
1487
|
+
|
1488
|
+
EIGEN_GEBGP_ONESTEP(0);
|
1489
|
+
EIGEN_GEBGP_ONESTEP(1);
|
1490
|
+
EIGEN_GEBGP_ONESTEP(2);
|
1491
|
+
EIGEN_GEBGP_ONESTEP(3);
|
1492
|
+
EIGEN_GEBGP_ONESTEP(4);
|
1493
|
+
EIGEN_GEBGP_ONESTEP(5);
|
1494
|
+
EIGEN_GEBGP_ONESTEP(6);
|
1495
|
+
EIGEN_GEBGP_ONESTEP(7);
|
1496
|
+
|
1497
|
+
blB += pk*RhsProgress;
|
1498
|
+
blA += pk*1*Traits::LhsProgress;
|
1499
|
+
|
1500
|
+
EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
|
1501
|
+
}
|
1502
|
+
|
1503
|
+
// process remaining peeled loop
|
1504
|
+
for(Index k=peeled_kc; k<depth; k++)
|
1505
|
+
{
|
1506
|
+
RhsPacket B_0;
|
1507
|
+
EIGEN_GEBGP_ONESTEP(0);
|
1508
|
+
blB += RhsProgress;
|
1509
|
+
blA += 1*Traits::LhsProgress;
|
1510
|
+
}
|
1511
|
+
#undef EIGEN_GEBGP_ONESTEP
|
1512
|
+
ResPacket R0;
|
1513
|
+
ResPacket alphav = pset1<ResPacket>(alpha);
|
1514
|
+
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
1515
|
+
traits.acc(C0, alphav, R0);
|
1516
|
+
r0.storePacket(0 * Traits::ResPacketSize, R0);
|
1517
|
+
}
|
1518
|
+
}
|
1519
|
+
}
|
1520
|
+
//---------- Process remaining rows, 1 at once ----------
|
1521
|
+
if(peeled_mc1<rows)
|
1522
|
+
{
|
1523
|
+
// loop on each panel of the rhs
|
1524
|
+
for(Index j2=0; j2<packet_cols4; j2+=nr)
|
1525
|
+
{
|
1526
|
+
// loop on each row of the lhs (1*LhsProgress x depth)
|
1527
|
+
for(Index i=peeled_mc1; i<rows; i+=1)
|
1528
|
+
{
|
1529
|
+
const LhsScalar* blA = &blockA[i*strideA+offsetA];
|
1530
|
+
prefetch(&blA[0]);
|
1531
|
+
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
|
1532
|
+
|
1533
|
+
// The following piece of code won't work for 512 bit registers
|
1534
|
+
// Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
|
1535
|
+
// as nr (which is currently 4) for the return type.
|
1536
|
+
const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
|
1537
|
+
if ((SwappedTraits::LhsProgress % 4) == 0 &&
|
1538
|
+
(SwappedTraits::LhsProgress <= 8) &&
|
1539
|
+
(SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr))
|
1540
|
+
{
|
1541
|
+
SAccPacket C0, C1, C2, C3;
|
1542
|
+
straits.initAcc(C0);
|
1543
|
+
straits.initAcc(C1);
|
1544
|
+
straits.initAcc(C2);
|
1545
|
+
straits.initAcc(C3);
|
1546
|
+
|
1547
|
+
const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
|
1548
|
+
const Index endk = (depth/spk)*spk;
|
1549
|
+
const Index endk4 = (depth/(spk*4))*(spk*4);
|
1550
|
+
|
1551
|
+
Index k=0;
|
1552
|
+
for(; k<endk4; k+=4*spk)
|
1553
|
+
{
|
1554
|
+
SLhsPacket A0,A1;
|
1555
|
+
SRhsPacket B_0,B_1;
|
1556
|
+
|
1557
|
+
straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
|
1558
|
+
straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
|
1559
|
+
|
1560
|
+
straits.loadRhsQuad(blA+0*spk, B_0);
|
1561
|
+
straits.loadRhsQuad(blA+1*spk, B_1);
|
1562
|
+
straits.madd(A0,B_0,C0,B_0);
|
1563
|
+
straits.madd(A1,B_1,C1,B_1);
|
1564
|
+
|
1565
|
+
straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
|
1566
|
+
straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
|
1567
|
+
straits.loadRhsQuad(blA+2*spk, B_0);
|
1568
|
+
straits.loadRhsQuad(blA+3*spk, B_1);
|
1569
|
+
straits.madd(A0,B_0,C2,B_0);
|
1570
|
+
straits.madd(A1,B_1,C3,B_1);
|
1571
|
+
|
1572
|
+
blB += 4*SwappedTraits::LhsProgress;
|
1573
|
+
blA += 4*spk;
|
1574
|
+
}
|
1575
|
+
C0 = padd(padd(C0,C1),padd(C2,C3));
|
1576
|
+
for(; k<endk; k+=spk)
|
1577
|
+
{
|
1578
|
+
SLhsPacket A0;
|
1579
|
+
SRhsPacket B_0;
|
1580
|
+
|
1581
|
+
straits.loadLhsUnaligned(blB, A0);
|
1582
|
+
straits.loadRhsQuad(blA, B_0);
|
1583
|
+
straits.madd(A0,B_0,C0,B_0);
|
1584
|
+
|
1585
|
+
blB += SwappedTraits::LhsProgress;
|
1586
|
+
blA += spk;
|
1587
|
+
}
|
1588
|
+
if(SwappedTraits::LhsProgress==8)
|
1589
|
+
{
|
1590
|
+
// Special case where we have to first reduce the accumulation register C0
|
1591
|
+
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
|
1592
|
+
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
|
1593
|
+
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
|
1594
|
+
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
|
1595
|
+
|
1596
|
+
SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
|
1597
|
+
SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
|
1598
|
+
|
1599
|
+
if(depth-endk>0)
|
1600
|
+
{
|
1601
|
+
// We have to handle the last row of the rhs which corresponds to a half-packet
|
1602
|
+
SLhsPacketHalf a0;
|
1603
|
+
SRhsPacketHalf b0;
|
1604
|
+
straits.loadLhsUnaligned(blB, a0);
|
1605
|
+
straits.loadRhs(blA, b0);
|
1606
|
+
SAccPacketHalf c0 = predux_downto4(C0);
|
1607
|
+
straits.madd(a0,b0,c0,b0);
|
1608
|
+
straits.acc(c0, alphav, R);
|
1609
|
+
}
|
1610
|
+
else
|
1611
|
+
{
|
1612
|
+
straits.acc(predux_downto4(C0), alphav, R);
|
1613
|
+
}
|
1614
|
+
res.scatterPacket(i, j2, R);
|
1615
|
+
}
|
1616
|
+
else
|
1617
|
+
{
|
1618
|
+
SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
|
1619
|
+
SResPacket alphav = pset1<SResPacket>(alpha);
|
1620
|
+
straits.acc(C0, alphav, R);
|
1621
|
+
res.scatterPacket(i, j2, R);
|
1622
|
+
}
|
1623
|
+
}
|
1624
|
+
else // scalar path
|
1625
|
+
{
|
1626
|
+
// get a 1 x 4 res block as registers
|
1627
|
+
ResScalar C0(0), C1(0), C2(0), C3(0);
|
1628
|
+
|
1629
|
+
for(Index k=0; k<depth; k++)
|
1630
|
+
{
|
1631
|
+
LhsScalar A0;
|
1632
|
+
RhsScalar B_0, B_1;
|
1633
|
+
|
1634
|
+
A0 = blA[k];
|
1635
|
+
|
1636
|
+
B_0 = blB[0];
|
1637
|
+
B_1 = blB[1];
|
1638
|
+
CJMADD(cj,A0,B_0,C0, B_0);
|
1639
|
+
CJMADD(cj,A0,B_1,C1, B_1);
|
1640
|
+
|
1641
|
+
B_0 = blB[2];
|
1642
|
+
B_1 = blB[3];
|
1643
|
+
CJMADD(cj,A0,B_0,C2, B_0);
|
1644
|
+
CJMADD(cj,A0,B_1,C3, B_1);
|
1645
|
+
|
1646
|
+
blB += 4;
|
1647
|
+
}
|
1648
|
+
res(i, j2 + 0) += alpha * C0;
|
1649
|
+
res(i, j2 + 1) += alpha * C1;
|
1650
|
+
res(i, j2 + 2) += alpha * C2;
|
1651
|
+
res(i, j2 + 3) += alpha * C3;
|
1652
|
+
}
|
1653
|
+
}
|
1654
|
+
}
|
1655
|
+
// remaining columns
|
1656
|
+
for(Index j2=packet_cols4; j2<cols; j2++)
|
1657
|
+
{
|
1658
|
+
// loop on each row of the lhs (1*LhsProgress x depth)
|
1659
|
+
for(Index i=peeled_mc1; i<rows; i+=1)
|
1660
|
+
{
|
1661
|
+
const LhsScalar* blA = &blockA[i*strideA+offsetA];
|
1662
|
+
prefetch(&blA[0]);
|
1663
|
+
// gets a 1 x 1 res block as registers
|
1664
|
+
ResScalar C0(0);
|
1665
|
+
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
|
1666
|
+
for(Index k=0; k<depth; k++)
|
1667
|
+
{
|
1668
|
+
LhsScalar A0 = blA[k];
|
1669
|
+
RhsScalar B_0 = blB[k];
|
1670
|
+
CJMADD(cj, A0, B_0, C0, B_0);
|
1671
|
+
}
|
1672
|
+
res(i, j2) += alpha * C0;
|
1673
|
+
}
|
1674
|
+
}
|
1675
|
+
}
|
1676
|
+
}
|
1677
|
+
|
1678
|
+
|
1679
|
+
#undef CJMADD
|
1680
|
+
|
1681
|
+
// pack a block of the lhs
// The traversal is as follows (mr==4):
//   0  4  8 12 ...
//   1  5  9 13 ...
//   2  6 10 14 ...
//   3  7 11 15 ...
//
//  16 20 24 28 ...
//  17 21 25 29 ...
//  18 22 26 30 ...
//  19 23 27 31 ...
//
//  32 33 34 35 ...
//  36 37 38 39 ...
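// Equivalent scalar sketch of this ordering (illustrative only, mr==4; the
// trailing "32 33 ..." rows are the leftover rows packed one at a time):
//   Index count = 0;
//   for(Index i=0; i+3<rows; i+=4)        // one 4-row panel at a time
//     for(Index k=0; k<depth; k++)        // sweep the whole depth of that panel
//       for(Index w=0; w<4; w++)
//         blockA[count++] = lhs(i+w, k);  // column-major inside the panel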
|
1695
|
+
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
|
1696
|
+
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
|
1697
|
+
{
|
1698
|
+
typedef typename DataMapper::LinearMapper LinearMapper;
|
1699
|
+
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
|
1700
|
+
};
|
1701
|
+
|
1702
|
+
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
|
1703
|
+
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
|
1704
|
+
::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
|
1705
|
+
{
|
1706
|
+
typedef typename packet_traits<Scalar>::type Packet;
|
1707
|
+
enum { PacketSize = packet_traits<Scalar>::size };
|
1708
|
+
|
1709
|
+
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
|
1710
|
+
EIGEN_UNUSED_VARIABLE(stride);
|
1711
|
+
EIGEN_UNUSED_VARIABLE(offset);
|
1712
|
+
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
|
1713
|
+
eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
|
1714
|
+
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
|
1715
|
+
Index count = 0;
|
1716
|
+
|
1717
|
+
const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
|
1718
|
+
const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
|
1719
|
+
const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
|
1720
|
+
const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
|
1721
|
+
: Pack2>1 ? (rows/Pack2)*Pack2 : 0;
|
1722
|
+
|
1723
|
+
Index i=0;
|
1724
|
+
|
1725
|
+
// Pack 3 packets
|
1726
|
+
if(Pack1>=3*PacketSize)
|
1727
|
+
{
|
1728
|
+
for(; i<peeled_mc3; i+=3*PacketSize)
|
1729
|
+
{
|
1730
|
+
if(PanelMode) count += (3*PacketSize) * offset;
|
1731
|
+
|
1732
|
+
for(Index k=0; k<depth; k++)
|
1733
|
+
{
|
1734
|
+
Packet A, B, C;
|
1735
|
+
A = lhs.loadPacket(i+0*PacketSize, k);
|
1736
|
+
B = lhs.loadPacket(i+1*PacketSize, k);
|
1737
|
+
C = lhs.loadPacket(i+2*PacketSize, k);
|
1738
|
+
pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
|
1739
|
+
pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
|
1740
|
+
pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
|
1741
|
+
}
|
1742
|
+
if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
|
1743
|
+
}
|
1744
|
+
}
|
1745
|
+
// Pack 2 packets
|
1746
|
+
if(Pack1>=2*PacketSize)
|
1747
|
+
{
|
1748
|
+
for(; i<peeled_mc2; i+=2*PacketSize)
|
1749
|
+
{
|
1750
|
+
if(PanelMode) count += (2*PacketSize) * offset;
|
1751
|
+
|
1752
|
+
for(Index k=0; k<depth; k++)
|
1753
|
+
{
|
1754
|
+
Packet A, B;
|
1755
|
+
A = lhs.loadPacket(i+0*PacketSize, k);
|
1756
|
+
B = lhs.loadPacket(i+1*PacketSize, k);
|
1757
|
+
pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
|
1758
|
+
pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
|
1759
|
+
}
|
1760
|
+
if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
|
1761
|
+
}
|
1762
|
+
}
|
1763
|
+
// Pack 1 packets
|
1764
|
+
if(Pack1>=1*PacketSize)
|
1765
|
+
{
|
1766
|
+
for(; i<peeled_mc1; i+=1*PacketSize)
|
1767
|
+
{
|
1768
|
+
if(PanelMode) count += (1*PacketSize) * offset;
|
1769
|
+
|
1770
|
+
for(Index k=0; k<depth; k++)
|
1771
|
+
{
|
1772
|
+
Packet A;
|
1773
|
+
A = lhs.loadPacket(i+0*PacketSize, k);
|
1774
|
+
pstore(blockA+count, cj.pconj(A));
|
1775
|
+
count+=PacketSize;
|
1776
|
+
}
|
1777
|
+
if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
|
1778
|
+
}
|
1779
|
+
}
|
1780
|
+
// Pack scalars
|
1781
|
+
if(Pack2<PacketSize && Pack2>1)
|
1782
|
+
{
|
1783
|
+
for(; i<peeled_mc0; i+=Pack2)
|
1784
|
+
{
|
1785
|
+
if(PanelMode) count += Pack2 * offset;
|
1786
|
+
|
1787
|
+
for(Index k=0; k<depth; k++)
|
1788
|
+
for(Index w=0; w<Pack2; w++)
|
1789
|
+
blockA[count++] = cj(lhs(i+w, k));
|
1790
|
+
|
1791
|
+
if(PanelMode) count += Pack2 * (stride-offset-depth);
|
1792
|
+
}
|
1793
|
+
}
|
1794
|
+
for(; i<rows; i++)
|
1795
|
+
{
|
1796
|
+
if(PanelMode) count += offset;
|
1797
|
+
for(Index k=0; k<depth; k++)
|
1798
|
+
blockA[count++] = cj(lhs(i, k));
|
1799
|
+
if(PanelMode) count += (stride-offset-depth);
|
1800
|
+
}
|
1801
|
+
}
|
1802
|
+
|
1803
|
+
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
|
1804
|
+
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
|
1805
|
+
{
|
1806
|
+
typedef typename DataMapper::LinearMapper LinearMapper;
|
1807
|
+
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
|
1808
|
+
};
|
1809
|
+
|
1810
|
+
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
|
1811
|
+
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
|
1812
|
+
::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
|
1813
|
+
{
|
1814
|
+
typedef typename packet_traits<Scalar>::type Packet;
|
1815
|
+
enum { PacketSize = packet_traits<Scalar>::size };
|
1816
|
+
|
1817
|
+
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
|
1818
|
+
EIGEN_UNUSED_VARIABLE(stride);
|
1819
|
+
EIGEN_UNUSED_VARIABLE(offset);
|
1820
|
+
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
|
1821
|
+
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
|
1822
|
+
Index count = 0;
|
1823
|
+
|
1824
|
+
// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
|
1825
|
+
// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
|
1826
|
+
// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
|
1827
|
+
|
1828
|
+
int pack = Pack1;
|
1829
|
+
Index i = 0;
|
1830
|
+
while(pack>0)
|
1831
|
+
{
|
1832
|
+
Index remaining_rows = rows-i;
|
1833
|
+
Index peeled_mc = i+(remaining_rows/pack)*pack;
|
1834
|
+
for(; i<peeled_mc; i+=pack)
|
1835
|
+
{
|
1836
|
+
if(PanelMode) count += pack * offset;
|
1837
|
+
|
1838
|
+
const Index peeled_k = (depth/PacketSize)*PacketSize;
|
1839
|
+
Index k=0;
|
1840
|
+
if(pack>=PacketSize)
|
1841
|
+
{
|
1842
|
+
for(; k<peeled_k; k+=PacketSize)
|
1843
|
+
{
|
1844
|
+
for (Index m = 0; m < pack; m += PacketSize)
|
1845
|
+
{
|
1846
|
+
PacketBlock<Packet> kernel;
|
1847
|
+
for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
|
1848
|
+
ptranspose(kernel);
|
1849
|
+
for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
|
1850
|
+
}
|
1851
|
+
count += PacketSize*pack;
|
1852
|
+
}
|
1853
|
+
}
|
1854
|
+
for(; k<depth; k++)
|
1855
|
+
{
|
1856
|
+
Index w=0;
|
1857
|
+
for(; w<pack-3; w+=4)
|
1858
|
+
{
|
1859
|
+
Scalar a(cj(lhs(i+w+0, k))),
|
1860
|
+
b(cj(lhs(i+w+1, k))),
|
1861
|
+
c(cj(lhs(i+w+2, k))),
|
1862
|
+
d(cj(lhs(i+w+3, k)));
|
1863
|
+
blockA[count++] = a;
|
1864
|
+
blockA[count++] = b;
|
1865
|
+
blockA[count++] = c;
|
1866
|
+
blockA[count++] = d;
|
1867
|
+
}
|
1868
|
+
if(pack%4)
|
1869
|
+
for(;w<pack;++w)
|
1870
|
+
blockA[count++] = cj(lhs(i+w, k));
|
1871
|
+
}
|
1872
|
+
|
1873
|
+
if(PanelMode) count += pack * (stride-offset-depth);
|
1874
|
+
}
|
1875
|
+
|
1876
|
+
pack -= PacketSize;
|
1877
|
+
if(pack<Pack2 && (pack+PacketSize)!=Pack2)
|
1878
|
+
pack = Pack2;
|
1879
|
+
}
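// Illustrative pack sequence (assumed values, not taken from this build:
// Pack1==24, Pack2==4, PacketSize==8, i.e. AVX floats): the while loop above
// packs the rows in panels of 24, then 16, then 8 rows; pack then drops to
// Pack2==4 for one last 4-row pass, and any rows still remaining are packed
// one by one by the loop below.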
|
1880
|
+
|
1881
|
+
for(; i<rows; i++)
|
1882
|
+
{
|
1883
|
+
if(PanelMode) count += offset;
|
1884
|
+
for(Index k=0; k<depth; k++)
|
1885
|
+
blockA[count++] = cj(lhs(i, k));
|
1886
|
+
if(PanelMode) count += (stride-offset-depth);
|
1887
|
+
}
|
1888
|
+
}
|
1889
|
+
|
1890
|
+
// copy a complete panel of the rhs
// this version is optimized for column major matrices
// The traversal order is as follows (nr==4):
//   0  1  2  3   12 13 14 15   24 27
//   4  5  6  7   16 17 18 19   25 28
//   8  9 10 11   20 21 22 23   26 29
//   .  .  .  .    .  .  .  .    .  .
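// Equivalent scalar sketch of this ordering (illustrative only, nr==4; the
// trailing single columns, 24..29 above, are the nr==1 tail handled below):
//   Index count = 0;
//   for(Index j=0; j+3<cols; j+=4)        // one 4-column panel at a time
//     for(Index k=0; k<depth; k++)        // walk down the panel
//       for(Index c=0; c<4; c++)
//         blockB[count++] = rhs(k, j+c);  // 4 consecutive rhs columns interleaved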
|
1897
|
+
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
|
1898
|
+
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
|
1899
|
+
{
|
1900
|
+
typedef typename packet_traits<Scalar>::type Packet;
|
1901
|
+
typedef typename DataMapper::LinearMapper LinearMapper;
|
1902
|
+
enum { PacketSize = packet_traits<Scalar>::size };
|
1903
|
+
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
|
1904
|
+
};
|
1905
|
+
|
1906
|
+
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
|
1907
|
+
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
|
1908
|
+
::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
|
1909
|
+
{
|
1910
|
+
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
|
1911
|
+
EIGEN_UNUSED_VARIABLE(stride);
|
1912
|
+
EIGEN_UNUSED_VARIABLE(offset);
|
1913
|
+
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
|
1914
|
+
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
|
1915
|
+
Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
|
1916
|
+
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
|
1917
|
+
Index count = 0;
|
1918
|
+
const Index peeled_k = (depth/PacketSize)*PacketSize;
|
1919
|
+
// if(nr>=8)
|
1920
|
+
// {
|
1921
|
+
// for(Index j2=0; j2<packet_cols8; j2+=8)
|
1922
|
+
// {
|
1923
|
+
// // skip what we have before
|
1924
|
+
// if(PanelMode) count += 8 * offset;
|
1925
|
+
// const Scalar* b0 = &rhs[(j2+0)*rhsStride];
|
1926
|
+
// const Scalar* b1 = &rhs[(j2+1)*rhsStride];
|
1927
|
+
// const Scalar* b2 = &rhs[(j2+2)*rhsStride];
|
1928
|
+
// const Scalar* b3 = &rhs[(j2+3)*rhsStride];
|
1929
|
+
// const Scalar* b4 = &rhs[(j2+4)*rhsStride];
|
1930
|
+
// const Scalar* b5 = &rhs[(j2+5)*rhsStride];
|
1931
|
+
// const Scalar* b6 = &rhs[(j2+6)*rhsStride];
|
1932
|
+
// const Scalar* b7 = &rhs[(j2+7)*rhsStride];
|
1933
|
+
// Index k=0;
|
1934
|
+
// if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
|
1935
|
+
// {
|
1936
|
+
// for(; k<peeled_k; k+=PacketSize) {
|
1937
|
+
// PacketBlock<Packet> kernel;
|
1938
|
+
// for (int p = 0; p < PacketSize; ++p) {
|
1939
|
+
// kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
|
1940
|
+
// }
|
1941
|
+
// ptranspose(kernel);
|
1942
|
+
// for (int p = 0; p < PacketSize; ++p) {
|
1943
|
+
// pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
|
1944
|
+
// count+=PacketSize;
|
1945
|
+
// }
|
1946
|
+
// }
|
1947
|
+
// }
|
1948
|
+
// for(; k<depth; k++)
|
1949
|
+
// {
|
1950
|
+
// blockB[count+0] = cj(b0[k]);
|
1951
|
+
// blockB[count+1] = cj(b1[k]);
|
1952
|
+
// blockB[count+2] = cj(b2[k]);
|
1953
|
+
// blockB[count+3] = cj(b3[k]);
|
1954
|
+
// blockB[count+4] = cj(b4[k]);
|
1955
|
+
// blockB[count+5] = cj(b5[k]);
|
1956
|
+
// blockB[count+6] = cj(b6[k]);
|
1957
|
+
// blockB[count+7] = cj(b7[k]);
|
1958
|
+
// count += 8;
|
1959
|
+
// }
|
1960
|
+
// // skip what we have after
|
1961
|
+
// if(PanelMode) count += 8 * (stride-offset-depth);
|
1962
|
+
// }
|
1963
|
+
// }
|
1964
|
+
|
1965
|
+
if(nr>=4)
|
1966
|
+
{
|
1967
|
+
for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
|
1968
|
+
{
|
1969
|
+
// skip what we have before
|
1970
|
+
if(PanelMode) count += 4 * offset;
|
1971
|
+
const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
|
1972
|
+
const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
|
1973
|
+
const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
|
1974
|
+
const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
|
1975
|
+
|
1976
|
+
Index k=0;
|
1977
|
+
if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
|
1978
|
+
{
|
1979
|
+
for(; k<peeled_k; k+=PacketSize) {
|
1980
|
+
PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
|
1981
|
+
kernel.packet[0] = dm0.loadPacket(k);
|
1982
|
+
kernel.packet[1%PacketSize] = dm1.loadPacket(k);
|
1983
|
+
kernel.packet[2%PacketSize] = dm2.loadPacket(k);
|
1984
|
+
kernel.packet[3%PacketSize] = dm3.loadPacket(k);
|
1985
|
+
ptranspose(kernel);
|
1986
|
+
pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
|
1987
|
+
pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
|
1988
|
+
pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
|
1989
|
+
pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
|
1990
|
+
count+=4*PacketSize;
|
1991
|
+
}
|
1992
|
+
}
|
1993
|
+
for(; k<depth; k++)
|
1994
|
+
{
|
1995
|
+
blockB[count+0] = cj(dm0(k));
|
1996
|
+
blockB[count+1] = cj(dm1(k));
|
1997
|
+
blockB[count+2] = cj(dm2(k));
|
1998
|
+
blockB[count+3] = cj(dm3(k));
|
1999
|
+
count += 4;
|
2000
|
+
}
|
2001
|
+
// skip what we have after
|
2002
|
+
if(PanelMode) count += 4 * (stride-offset-depth);
|
2003
|
+
}
|
2004
|
+
}
|
2005
|
+
|
2006
|
+
// copy the remaining columns one at a time (nr==1)
|
2007
|
+
for(Index j2=packet_cols4; j2<cols; ++j2)
|
2008
|
+
{
|
2009
|
+
if(PanelMode) count += offset;
|
2010
|
+
const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
|
2011
|
+
for(Index k=0; k<depth; k++)
|
2012
|
+
{
|
2013
|
+
blockB[count] = cj(dm0(k));
|
2014
|
+
count += 1;
|
2015
|
+
}
|
2016
|
+
if(PanelMode) count += (stride-offset-depth);
|
2017
|
+
}
|
2018
|
+
}
|
2019
|
+
|
2020
|
+
// this version is optimized for row major matrices
|
2021
|
+
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
|
2022
|
+
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
|
2023
|
+
{
|
2024
|
+
typedef typename packet_traits<Scalar>::type Packet;
|
2025
|
+
typedef typename DataMapper::LinearMapper LinearMapper;
|
2026
|
+
enum { PacketSize = packet_traits<Scalar>::size };
|
2027
|
+
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
|
2028
|
+
};
|
2029
|
+
|
2030
|
+
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
|
2031
|
+
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
|
2032
|
+
::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
|
2033
|
+
{
|
2034
|
+
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
|
2035
|
+
EIGEN_UNUSED_VARIABLE(stride);
|
2036
|
+
EIGEN_UNUSED_VARIABLE(offset);
|
2037
|
+
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
|
2038
|
+
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
|
2039
|
+
Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
|
2040
|
+
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
|
2041
|
+
Index count = 0;
|
2042
|
+
|
2043
|
+
// if(nr>=8)
|
2044
|
+
// {
|
2045
|
+
// for(Index j2=0; j2<packet_cols8; j2+=8)
|
2046
|
+
// {
|
2047
|
+
// // skip what we have before
|
2048
|
+
// if(PanelMode) count += 8 * offset;
|
2049
|
+
// for(Index k=0; k<depth; k++)
|
2050
|
+
// {
|
2051
|
+
// if (PacketSize==8) {
|
2052
|
+
// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
|
2053
|
+
// pstoreu(blockB+count, cj.pconj(A));
|
2054
|
+
// } else if (PacketSize==4) {
|
2055
|
+
// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
|
2056
|
+
// Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
|
2057
|
+
// pstoreu(blockB+count, cj.pconj(A));
|
2058
|
+
// pstoreu(blockB+count+PacketSize, cj.pconj(B));
|
2059
|
+
// } else {
|
2060
|
+
// const Scalar* b0 = &rhs[k*rhsStride + j2];
|
2061
|
+
// blockB[count+0] = cj(b0[0]);
|
2062
|
+
// blockB[count+1] = cj(b0[1]);
|
2063
|
+
// blockB[count+2] = cj(b0[2]);
|
2064
|
+
// blockB[count+3] = cj(b0[3]);
|
2065
|
+
// blockB[count+4] = cj(b0[4]);
|
2066
|
+
// blockB[count+5] = cj(b0[5]);
|
2067
|
+
// blockB[count+6] = cj(b0[6]);
|
2068
|
+
// blockB[count+7] = cj(b0[7]);
|
2069
|
+
// }
|
2070
|
+
// count += 8;
|
2071
|
+
// }
|
2072
|
+
// // skip what we have after
|
2073
|
+
// if(PanelMode) count += 8 * (stride-offset-depth);
|
2074
|
+
// }
|
2075
|
+
// }
|
2076
|
+
if(nr>=4)
|
2077
|
+
{
|
2078
|
+
for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
|
2079
|
+
{
|
2080
|
+
// skip what we have before
|
2081
|
+
if(PanelMode) count += 4 * offset;
|
2082
|
+
for(Index k=0; k<depth; k++)
|
2083
|
+
{
|
2084
|
+
if (PacketSize==4) {
|
2085
|
+
Packet A = rhs.loadPacket(k, j2);
|
2086
|
+
pstoreu(blockB+count, cj.pconj(A));
|
2087
|
+
count += PacketSize;
|
2088
|
+
} else {
|
2089
|
+
const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
|
2090
|
+
blockB[count+0] = cj(dm0(0));
|
2091
|
+
blockB[count+1] = cj(dm0(1));
|
2092
|
+
blockB[count+2] = cj(dm0(2));
|
2093
|
+
blockB[count+3] = cj(dm0(3));
|
2094
|
+
count += 4;
|
2095
|
+
}
|
2096
|
+
}
|
2097
|
+
// skip what we have after
|
2098
|
+
if(PanelMode) count += 4 * (stride-offset-depth);
|
2099
|
+
}
|
2100
|
+
}
|
2101
|
+
// copy the remaining columns one at a time (nr==1)
|
2102
|
+
for(Index j2=packet_cols4; j2<cols; ++j2)
|
2103
|
+
{
|
2104
|
+
if(PanelMode) count += offset;
|
2105
|
+
for(Index k=0; k<depth; k++)
|
2106
|
+
{
|
2107
|
+
blockB[count] = cj(rhs(k, j2));
|
2108
|
+
count += 1;
|
2109
|
+
}
|
2110
|
+
if(PanelMode) count += stride-offset-depth;
|
2111
|
+
}
|
2112
|
+
}
|
2113
|
+
|
2114
|
+
} // end namespace internal

/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSize */
inline std::ptrdiff_t l1CacheSize()
{
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l1;
}

/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSize */
inline std::ptrdiff_t l2CacheSize()
{
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l2;
}

/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSize */
inline std::ptrdiff_t l3CacheSize()
{
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l3;
}

/** Set the cpu L1 and L2 cache sizes (in bytes).
  * These values are used to adjust the size of the blocks
  * for the algorithms working per blocks.
  *
  * \sa computeProductBlockingSizes */
inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
{
  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
}
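// Illustrative usage sketch (not part of the vendored header; the cache values
// are made up for the example): the three getters and the setter above are the
// public Eigen entry points for tuning the cache-aware blocking.
//
//   #include <Eigen/Core>
//   #include <iostream>
//
//   int main()
//   {
//     // Pretend the target CPU has 32 KB L1, 1 MB L2 and 8 MB L3 caches.
//     Eigen::setCpuCacheSizes(32*1024, 1024*1024, 8*1024*1024);
//     std::cout << Eigen::l1CacheSize() << ", "
//               << Eigen::l2CacheSize() << ", "
//               << Eigen::l3CacheSize() << "\n";
//     return 0;
//   }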

} // end namespace Eigen

#endif // EIGEN_GENERAL_BLOCK_PANEL_H