tomoto 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/README.md +8 -10
- data/ext/tomoto/ct.cpp +11 -11
- data/ext/tomoto/dmr.cpp +14 -13
- data/ext/tomoto/dt.cpp +14 -14
- data/ext/tomoto/extconf.rb +7 -5
- data/ext/tomoto/gdmr.cpp +7 -7
- data/ext/tomoto/hdp.cpp +9 -9
- data/ext/tomoto/hlda.cpp +13 -13
- data/ext/tomoto/hpa.cpp +5 -5
- data/ext/tomoto/lda.cpp +42 -39
- data/ext/tomoto/llda.cpp +6 -6
- data/ext/tomoto/mglda.cpp +15 -15
- data/ext/tomoto/pa.cpp +6 -6
- data/ext/tomoto/plda.cpp +6 -6
- data/ext/tomoto/slda.cpp +8 -8
- data/ext/tomoto/{ext.cpp → tomoto.cpp} +8 -8
- data/ext/tomoto/utils.h +16 -70
- data/lib/tomoto/version.rb +1 -1
- data/lib/tomoto.rb +5 -1
- data/vendor/EigenRand/EigenRand/Core.h +10 -10
- data/vendor/EigenRand/EigenRand/Dists/Basic.h +208 -9
- data/vendor/EigenRand/EigenRand/Dists/Discrete.h +52 -31
- data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +9 -8
- data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +28 -21
- data/vendor/EigenRand/EigenRand/EigenRand +11 -6
- data/vendor/EigenRand/EigenRand/Macro.h +13 -7
- data/vendor/EigenRand/EigenRand/MorePacketMath.h +348 -740
- data/vendor/EigenRand/EigenRand/MvDists/Multinomial.h +5 -3
- data/vendor/EigenRand/EigenRand/MvDists/MvNormal.h +9 -3
- data/vendor/EigenRand/EigenRand/PacketFilter.h +11 -253
- data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +21 -47
- data/vendor/EigenRand/EigenRand/RandUtils.h +50 -344
- data/vendor/EigenRand/EigenRand/arch/AVX/MorePacketMath.h +619 -0
- data/vendor/EigenRand/EigenRand/arch/AVX/PacketFilter.h +149 -0
- data/vendor/EigenRand/EigenRand/arch/AVX/RandUtils.h +228 -0
- data/vendor/EigenRand/EigenRand/arch/NEON/MorePacketMath.h +473 -0
- data/vendor/EigenRand/EigenRand/arch/NEON/PacketFilter.h +142 -0
- data/vendor/EigenRand/EigenRand/arch/NEON/RandUtils.h +126 -0
- data/vendor/EigenRand/EigenRand/arch/SSE/MorePacketMath.h +501 -0
- data/vendor/EigenRand/EigenRand/arch/SSE/PacketFilter.h +133 -0
- data/vendor/EigenRand/EigenRand/arch/SSE/RandUtils.h +120 -0
- data/vendor/EigenRand/EigenRand/doc.h +24 -12
- data/vendor/EigenRand/README.md +57 -4
- data/vendor/eigen/COPYING.APACHE +203 -0
- data/vendor/eigen/COPYING.BSD +1 -1
- data/vendor/eigen/COPYING.MINPACK +51 -52
- data/vendor/eigen/Eigen/Cholesky +0 -1
- data/vendor/eigen/Eigen/Core +112 -265
- data/vendor/eigen/Eigen/Eigenvalues +2 -3
- data/vendor/eigen/Eigen/Geometry +5 -8
- data/vendor/eigen/Eigen/Householder +0 -1
- data/vendor/eigen/Eigen/Jacobi +0 -1
- data/vendor/eigen/Eigen/KLUSupport +41 -0
- data/vendor/eigen/Eigen/LU +2 -5
- data/vendor/eigen/Eigen/OrderingMethods +0 -3
- data/vendor/eigen/Eigen/PaStiXSupport +1 -0
- data/vendor/eigen/Eigen/PardisoSupport +0 -0
- data/vendor/eigen/Eigen/QR +2 -3
- data/vendor/eigen/Eigen/QtAlignedMalloc +0 -1
- data/vendor/eigen/Eigen/SVD +0 -1
- data/vendor/eigen/Eigen/Sparse +0 -2
- data/vendor/eigen/Eigen/SparseCholesky +0 -8
- data/vendor/eigen/Eigen/SparseLU +4 -0
- data/vendor/eigen/Eigen/SparseQR +0 -1
- data/vendor/eigen/Eigen/src/Cholesky/LDLT.h +42 -27
- data/vendor/eigen/Eigen/src/Cholesky/LLT.h +39 -23
- data/vendor/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +90 -47
- data/vendor/eigen/Eigen/src/Core/ArithmeticSequence.h +413 -0
- data/vendor/eigen/Eigen/src/Core/Array.h +99 -11
- data/vendor/eigen/Eigen/src/Core/ArrayBase.h +3 -3
- data/vendor/eigen/Eigen/src/Core/ArrayWrapper.h +21 -21
- data/vendor/eigen/Eigen/src/Core/Assign.h +1 -1
- data/vendor/eigen/Eigen/src/Core/AssignEvaluator.h +125 -50
- data/vendor/eigen/Eigen/src/Core/Assign_MKL.h +10 -10
- data/vendor/eigen/Eigen/src/Core/BandMatrix.h +16 -16
- data/vendor/eigen/Eigen/src/Core/Block.h +56 -60
- data/vendor/eigen/Eigen/src/Core/BooleanRedux.h +29 -31
- data/vendor/eigen/Eigen/src/Core/CommaInitializer.h +7 -3
- data/vendor/eigen/Eigen/src/Core/CoreEvaluators.h +325 -272
- data/vendor/eigen/Eigen/src/Core/CoreIterators.h +5 -0
- data/vendor/eigen/Eigen/src/Core/CwiseBinaryOp.h +21 -22
- data/vendor/eigen/Eigen/src/Core/CwiseNullaryOp.h +153 -18
- data/vendor/eigen/Eigen/src/Core/CwiseUnaryOp.h +6 -6
- data/vendor/eigen/Eigen/src/Core/CwiseUnaryView.h +14 -10
- data/vendor/eigen/Eigen/src/Core/DenseBase.h +132 -42
- data/vendor/eigen/Eigen/src/Core/DenseCoeffsBase.h +25 -21
- data/vendor/eigen/Eigen/src/Core/DenseStorage.h +153 -71
- data/vendor/eigen/Eigen/src/Core/Diagonal.h +21 -23
- data/vendor/eigen/Eigen/src/Core/DiagonalMatrix.h +50 -2
- data/vendor/eigen/Eigen/src/Core/DiagonalProduct.h +1 -1
- data/vendor/eigen/Eigen/src/Core/Dot.h +10 -10
- data/vendor/eigen/Eigen/src/Core/EigenBase.h +10 -9
- data/vendor/eigen/Eigen/src/Core/ForceAlignedAccess.h +8 -4
- data/vendor/eigen/Eigen/src/Core/Fuzzy.h +3 -3
- data/vendor/eigen/Eigen/src/Core/GeneralProduct.h +20 -10
- data/vendor/eigen/Eigen/src/Core/GenericPacketMath.h +599 -152
- data/vendor/eigen/Eigen/src/Core/GlobalFunctions.h +40 -33
- data/vendor/eigen/Eigen/src/Core/IO.h +40 -7
- data/vendor/eigen/Eigen/src/Core/IndexedView.h +237 -0
- data/vendor/eigen/Eigen/src/Core/Inverse.h +9 -10
- data/vendor/eigen/Eigen/src/Core/Map.h +7 -7
- data/vendor/eigen/Eigen/src/Core/MapBase.h +10 -3
- data/vendor/eigen/Eigen/src/Core/MathFunctions.h +767 -125
- data/vendor/eigen/Eigen/src/Core/MathFunctionsImpl.h +118 -19
- data/vendor/eigen/Eigen/src/Core/Matrix.h +131 -25
- data/vendor/eigen/Eigen/src/Core/MatrixBase.h +21 -3
- data/vendor/eigen/Eigen/src/Core/NestByValue.h +25 -50
- data/vendor/eigen/Eigen/src/Core/NoAlias.h +4 -3
- data/vendor/eigen/Eigen/src/Core/NumTraits.h +107 -20
- data/vendor/eigen/Eigen/src/Core/PartialReduxEvaluator.h +232 -0
- data/vendor/eigen/Eigen/src/Core/PermutationMatrix.h +3 -31
- data/vendor/eigen/Eigen/src/Core/PlainObjectBase.h +152 -59
- data/vendor/eigen/Eigen/src/Core/Product.h +30 -25
- data/vendor/eigen/Eigen/src/Core/ProductEvaluators.h +192 -125
- data/vendor/eigen/Eigen/src/Core/Random.h +37 -1
- data/vendor/eigen/Eigen/src/Core/Redux.h +180 -170
- data/vendor/eigen/Eigen/src/Core/Ref.h +121 -23
- data/vendor/eigen/Eigen/src/Core/Replicate.h +8 -8
- data/vendor/eigen/Eigen/src/Core/Reshaped.h +454 -0
- data/vendor/eigen/Eigen/src/Core/ReturnByValue.h +7 -5
- data/vendor/eigen/Eigen/src/Core/Reverse.h +18 -12
- data/vendor/eigen/Eigen/src/Core/Select.h +8 -6
- data/vendor/eigen/Eigen/src/Core/SelfAdjointView.h +33 -20
- data/vendor/eigen/Eigen/src/Core/Solve.h +14 -14
- data/vendor/eigen/Eigen/src/Core/SolveTriangular.h +16 -16
- data/vendor/eigen/Eigen/src/Core/SolverBase.h +41 -3
- data/vendor/eigen/Eigen/src/Core/StableNorm.h +100 -70
- data/vendor/eigen/Eigen/src/Core/StlIterators.h +463 -0
- data/vendor/eigen/Eigen/src/Core/Stride.h +9 -4
- data/vendor/eigen/Eigen/src/Core/Swap.h +5 -4
- data/vendor/eigen/Eigen/src/Core/Transpose.h +88 -27
- data/vendor/eigen/Eigen/src/Core/Transpositions.h +26 -47
- data/vendor/eigen/Eigen/src/Core/TriangularMatrix.h +93 -75
- data/vendor/eigen/Eigen/src/Core/VectorBlock.h +5 -5
- data/vendor/eigen/Eigen/src/Core/VectorwiseOp.h +159 -70
- data/vendor/eigen/Eigen/src/Core/Visitor.h +137 -29
- data/vendor/eigen/Eigen/src/Core/arch/AVX/Complex.h +50 -129
- data/vendor/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +126 -337
- data/vendor/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +1092 -155
- data/vendor/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +65 -1
- data/vendor/eigen/Eigen/src/Core/arch/AVX512/Complex.h +422 -0
- data/vendor/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +207 -236
- data/vendor/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1482 -495
- data/vendor/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +89 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +152 -165
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +19 -251
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2937 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +221 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +629 -0
- data/vendor/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2042 -392
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/Complex.h +235 -80
- data/vendor/eigen/Eigen/src/Core/arch/Default/BFloat16.h +700 -0
- data/vendor/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +102 -14
- data/vendor/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1649 -0
- data/vendor/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +110 -0
- data/vendor/eigen/Eigen/src/Core/arch/Default/Half.h +942 -0
- data/vendor/eigen/Eigen/src/Core/arch/Default/Settings.h +1 -1
- data/vendor/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +120 -0
- data/vendor/eigen/Eigen/src/Core/arch/{CUDA → GPU}/MathFunctions.h +16 -4
- data/vendor/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1685 -0
- data/vendor/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +80 -0
- data/vendor/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
- data/vendor/eigen/Eigen/src/Core/arch/MSA/Complex.h +648 -0
- data/vendor/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +387 -0
- data/vendor/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1233 -0
- data/vendor/eigen/Eigen/src/Core/arch/NEON/Complex.h +313 -219
- data/vendor/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +183 -0
- data/vendor/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +54 -70
- data/vendor/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4376 -549
- data/vendor/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1419 -0
- data/vendor/eigen/Eigen/src/Core/arch/SSE/Complex.h +59 -179
- data/vendor/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +65 -428
- data/vendor/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +893 -283
- data/vendor/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +65 -0
- data/vendor/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +44 -0
- data/vendor/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +752 -0
- data/vendor/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +49 -0
- data/vendor/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +232 -0
- data/vendor/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +301 -0
- data/vendor/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +670 -0
- data/vendor/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +694 -0
- data/vendor/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +85 -0
- data/vendor/eigen/Eigen/src/Core/arch/ZVector/Complex.h +212 -183
- data/vendor/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +101 -5
- data/vendor/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +510 -395
- data/vendor/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +11 -2
- data/vendor/eigen/Eigen/src/Core/functors/BinaryFunctors.h +112 -46
- data/vendor/eigen/Eigen/src/Core/functors/NullaryFunctors.h +31 -30
- data/vendor/eigen/Eigen/src/Core/functors/StlFunctors.h +32 -2
- data/vendor/eigen/Eigen/src/Core/functors/UnaryFunctors.h +355 -16
- data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1075 -586
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +49 -24
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +41 -35
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +6 -6
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +4 -2
- data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +382 -483
- data/vendor/eigen/Eigen/src/Core/products/Parallelizer.h +22 -5
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +53 -30
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +16 -8
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +8 -6
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointProduct.h +4 -4
- data/vendor/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +5 -4
- data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +33 -27
- data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +14 -12
- data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +36 -34
- data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +8 -4
- data/vendor/eigen/Eigen/src/Core/products/TriangularSolverVector.h +13 -10
- data/vendor/eigen/Eigen/src/Core/util/BlasUtil.h +304 -119
- data/vendor/eigen/Eigen/src/Core/util/ConfigureVectorization.h +512 -0
- data/vendor/eigen/Eigen/src/Core/util/Constants.h +25 -9
- data/vendor/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +26 -3
- data/vendor/eigen/Eigen/src/Core/util/ForwardDeclarations.h +29 -9
- data/vendor/eigen/Eigen/src/Core/util/IndexedViewHelper.h +186 -0
- data/vendor/eigen/Eigen/src/Core/util/IntegralConstant.h +272 -0
- data/vendor/eigen/Eigen/src/Core/util/MKL_support.h +8 -1
- data/vendor/eigen/Eigen/src/Core/util/Macros.h +709 -246
- data/vendor/eigen/Eigen/src/Core/util/Memory.h +222 -52
- data/vendor/eigen/Eigen/src/Core/util/Meta.h +355 -77
- data/vendor/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +5 -1
- data/vendor/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
- data/vendor/eigen/Eigen/src/Core/util/StaticAssert.h +8 -5
- data/vendor/eigen/Eigen/src/Core/util/SymbolicIndex.h +293 -0
- data/vendor/eigen/Eigen/src/Core/util/XprHelper.h +65 -30
- data/vendor/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +1 -1
- data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +7 -4
- data/vendor/eigen/Eigen/src/Eigenvalues/EigenSolver.h +2 -2
- data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +1 -1
- data/vendor/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +2 -2
- data/vendor/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +2 -2
- data/vendor/eigen/Eigen/src/Eigenvalues/RealQZ.h +9 -6
- data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur.h +21 -9
- data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +77 -43
- data/vendor/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +20 -15
- data/vendor/eigen/Eigen/src/Geometry/AlignedBox.h +99 -5
- data/vendor/eigen/Eigen/src/Geometry/AngleAxis.h +4 -4
- data/vendor/eigen/Eigen/src/Geometry/EulerAngles.h +3 -3
- data/vendor/eigen/Eigen/src/Geometry/Homogeneous.h +15 -11
- data/vendor/eigen/Eigen/src/Geometry/Hyperplane.h +1 -1
- data/vendor/eigen/Eigen/src/Geometry/OrthoMethods.h +3 -2
- data/vendor/eigen/Eigen/src/Geometry/ParametrizedLine.h +39 -2
- data/vendor/eigen/Eigen/src/Geometry/Quaternion.h +70 -14
- data/vendor/eigen/Eigen/src/Geometry/Rotation2D.h +3 -3
- data/vendor/eigen/Eigen/src/Geometry/Scaling.h +23 -5
- data/vendor/eigen/Eigen/src/Geometry/Transform.h +88 -67
- data/vendor/eigen/Eigen/src/Geometry/Translation.h +6 -12
- data/vendor/eigen/Eigen/src/Geometry/Umeyama.h +1 -1
- data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +168 -0
- data/vendor/eigen/Eigen/src/Householder/BlockHouseholder.h +9 -2
- data/vendor/eigen/Eigen/src/Householder/Householder.h +8 -4
- data/vendor/eigen/Eigen/src/Householder/HouseholderSequence.h +123 -48
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +15 -15
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +7 -23
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +5 -22
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +41 -47
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +51 -60
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +70 -20
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +2 -20
- data/vendor/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +11 -9
- data/vendor/eigen/Eigen/src/Jacobi/Jacobi.h +31 -10
- data/vendor/eigen/Eigen/src/KLUSupport/KLUSupport.h +358 -0
- data/vendor/eigen/Eigen/src/LU/Determinant.h +35 -19
- data/vendor/eigen/Eigen/src/LU/FullPivLU.h +29 -43
- data/vendor/eigen/Eigen/src/LU/InverseImpl.h +25 -8
- data/vendor/eigen/Eigen/src/LU/PartialPivLU.h +71 -58
- data/vendor/eigen/Eigen/src/LU/arch/InverseSize4.h +351 -0
- data/vendor/eigen/Eigen/src/OrderingMethods/Amd.h +7 -17
- data/vendor/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +297 -277
- data/vendor/eigen/Eigen/src/OrderingMethods/Ordering.h +6 -10
- data/vendor/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +1 -1
- data/vendor/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +9 -7
- data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR.h +41 -20
- data/vendor/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +100 -27
- data/vendor/eigen/Eigen/src/QR/FullPivHouseholderQR.h +59 -22
- data/vendor/eigen/Eigen/src/QR/HouseholderQR.h +48 -23
- data/vendor/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +25 -3
- data/vendor/eigen/Eigen/src/SVD/BDCSVD.h +183 -63
- data/vendor/eigen/Eigen/src/SVD/JacobiSVD.h +22 -14
- data/vendor/eigen/Eigen/src/SVD/SVDBase.h +83 -22
- data/vendor/eigen/Eigen/src/SVD/UpperBidiagonalization.h +3 -3
- data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +17 -9
- data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +12 -37
- data/vendor/eigen/Eigen/src/SparseCore/AmbiVector.h +3 -2
- data/vendor/eigen/Eigen/src/SparseCore/CompressedStorage.h +16 -0
- data/vendor/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +6 -6
- data/vendor/eigen/Eigen/src/SparseCore/SparseAssign.h +81 -27
- data/vendor/eigen/Eigen/src/SparseCore/SparseBlock.h +25 -57
- data/vendor/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +40 -11
- data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +11 -15
- data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +4 -2
- data/vendor/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +30 -8
- data/vendor/eigen/Eigen/src/SparseCore/SparseMatrix.h +126 -11
- data/vendor/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +5 -12
- data/vendor/eigen/Eigen/src/SparseCore/SparseProduct.h +13 -1
- data/vendor/eigen/Eigen/src/SparseCore/SparseRef.h +7 -7
- data/vendor/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +5 -2
- data/vendor/eigen/Eigen/src/SparseCore/SparseUtil.h +8 -0
- data/vendor/eigen/Eigen/src/SparseCore/SparseVector.h +1 -1
- data/vendor/eigen/Eigen/src/SparseCore/SparseView.h +1 -0
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU.h +162 -12
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +1 -1
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +76 -2
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +2 -2
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +1 -1
- data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +1 -1
- data/vendor/eigen/Eigen/src/SparseQR/SparseQR.h +19 -6
- data/vendor/eigen/Eigen/src/StlSupport/StdDeque.h +2 -12
- data/vendor/eigen/Eigen/src/StlSupport/StdList.h +2 -2
- data/vendor/eigen/Eigen/src/StlSupport/StdVector.h +2 -2
- data/vendor/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +6 -8
- data/vendor/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +175 -39
- data/vendor/eigen/Eigen/src/misc/lapacke.h +5 -4
- data/vendor/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +28 -2
- data/vendor/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +155 -11
- data/vendor/eigen/Eigen/src/plugins/BlockMethods.h +626 -242
- data/vendor/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +14 -0
- data/vendor/eigen/Eigen/src/plugins/IndexedViewMethods.h +262 -0
- data/vendor/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +4 -4
- data/vendor/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +10 -0
- data/vendor/eigen/Eigen/src/plugins/ReshapedMethods.h +149 -0
- data/vendor/eigen/README.md +2 -0
- data/vendor/eigen/bench/btl/README +1 -1
- data/vendor/eigen/bench/tensors/README +6 -7
- data/vendor/eigen/ci/README.md +56 -0
- data/vendor/eigen/demos/mix_eigen_and_c/README +1 -1
- data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +213 -158
- data/vendor/eigen/unsupported/README.txt +1 -1
- data/vendor/tomotopy/README.kr.rst +78 -0
- data/vendor/tomotopy/README.rst +75 -0
- data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +2 -2
- data/vendor/tomotopy/src/Labeling/Phraser.hpp +4 -4
- data/vendor/tomotopy/src/TopicModel/CTModel.hpp +7 -3
- data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +7 -3
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +6 -3
- data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +2 -2
- data/vendor/tomotopy/src/TopicModel/HDP.h +1 -0
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +57 -6
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +6 -3
- data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +3 -2
- data/vendor/tomotopy/src/TopicModel/LDA.h +3 -3
- data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +5 -5
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +50 -19
- data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +6 -2
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +3 -2
- data/vendor/tomotopy/src/TopicModel/PAModel.hpp +1 -1
- data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +6 -2
- data/vendor/tomotopy/src/TopicModel/PT.h +3 -1
- data/vendor/tomotopy/src/TopicModel/PTModel.hpp +36 -3
- data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +6 -3
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +55 -26
- data/vendor/tomotopy/src/Utils/AliasMethod.hpp +5 -4
- data/vendor/tomotopy/src/Utils/Dictionary.h +2 -2
- data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +36 -1
- data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +1 -1
- data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +1 -1
- data/vendor/tomotopy/src/Utils/exception.h +6 -0
- data/vendor/tomotopy/src/Utils/math.h +2 -2
- data/vendor/tomotopy/src/Utils/sample.hpp +14 -12
- data/vendor/tomotopy/src/Utils/serializer.hpp +30 -5
- data/vendor/tomotopy/src/Utils/sse_gamma.h +0 -3
- metadata +64 -18
- data/vendor/eigen/Eigen/CMakeLists.txt +0 -19
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -674
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
- data/vendor/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
- data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
- data/vendor/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
|
@@ -15,7 +15,13 @@ namespace Eigen {
|
|
|
15
15
|
|
|
16
16
|
namespace internal {
|
|
17
17
|
|
|
18
|
-
|
|
18
|
+
enum GEBPPacketSizeType {
|
|
19
|
+
GEBPPacketFull = 0,
|
|
20
|
+
GEBPPacketHalf,
|
|
21
|
+
GEBPPacketQuarter
|
|
22
|
+
};
|
|
23
|
+
|
|
24
|
+
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target, int _PacketSize=GEBPPacketFull>
|
|
19
25
|
class gebp_traits;
|
|
20
26
|
|
|
21
27
|
|
|
@@ -25,16 +31,42 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff
|
|
|
25
31
|
return a<=0 ? b : a;
|
|
26
32
|
}
|
|
27
33
|
|
|
34
|
+
#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
|
|
35
|
+
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
|
|
36
|
+
#else
|
|
37
|
+
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
|
|
38
|
+
#endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
|
|
39
|
+
|
|
40
|
+
#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
|
|
41
|
+
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
|
|
42
|
+
#else
|
|
43
|
+
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
|
|
44
|
+
#endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
|
|
45
|
+
|
|
46
|
+
#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
|
|
47
|
+
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
|
|
48
|
+
#else
|
|
49
|
+
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
|
|
50
|
+
#endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
|
|
51
|
+
|
|
28
52
|
#if EIGEN_ARCH_i386_OR_x86_64
|
|
29
|
-
const std::ptrdiff_t defaultL1CacheSize = 32*1024;
|
|
30
|
-
const std::ptrdiff_t defaultL2CacheSize = 256*1024;
|
|
31
|
-
const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
|
|
53
|
+
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024);
|
|
54
|
+
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024);
|
|
55
|
+
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024);
|
|
56
|
+
#elif EIGEN_ARCH_PPC
|
|
57
|
+
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024);
|
|
58
|
+
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
|
|
59
|
+
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024);
|
|
32
60
|
#else
|
|
33
|
-
const std::ptrdiff_t defaultL1CacheSize = 16*1024;
|
|
34
|
-
const std::ptrdiff_t defaultL2CacheSize = 512*1024;
|
|
35
|
-
const std::ptrdiff_t defaultL3CacheSize = 512*1024;
|
|
61
|
+
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024);
|
|
62
|
+
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
|
|
63
|
+
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024);
|
|
36
64
|
#endif
|
|
37
65
|
|
|
66
|
+
#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
|
|
67
|
+
#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
|
|
68
|
+
#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
|
|
69
|
+
|
|
38
70
|
/** \internal */
|
|
39
71
|
struct CacheSizes {
|
|
40
72
|
CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
|
|
@@ -50,7 +82,6 @@ struct CacheSizes {
|
|
|
50
82
|
std::ptrdiff_t m_l3;
|
|
51
83
|
};
|
|
52
84
|
|
|
53
|
-
|
|
54
85
|
/** \internal */
|
|
55
86
|
inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
|
|
56
87
|
{
|
|
@@ -101,6 +132,16 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
|
|
101
132
|
// at the register level. This small horizontal panel has to stay within L1 cache.
|
|
102
133
|
std::ptrdiff_t l1, l2, l3;
|
|
103
134
|
manage_caching_sizes(GetAction, &l1, &l2, &l3);
|
|
135
|
+
#ifdef EIGEN_VECTORIZE_AVX512
|
|
136
|
+
// We need to find a rationale for that, but without this adjustment,
|
|
137
|
+
// performance with AVX512 is pretty bad, like -20% slower.
|
|
138
|
+
// One reason is that with increasing packet-size, the blocking size k
|
|
139
|
+
// has to become pretty small if we want that 1 lhs panel fit within L1.
|
|
140
|
+
// For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are:
|
|
141
|
+
// k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144.
|
|
142
|
+
// This is quite small for a good reuse of the accumulation registers.
|
|
143
|
+
l1 *= 4;
|
|
144
|
+
#endif
|
|
104
145
|
|
|
105
146
|
if (num_threads > 1) {
|
|
106
147
|
typedef typename Traits::ResScalar ResScalar;
|
|
@@ -115,7 +156,8 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
|
|
115
156
|
// registers. However once the latency is hidden there is no point in
|
|
116
157
|
// increasing the value of k, so we'll cap it at 320 (value determined
|
|
117
158
|
// experimentally).
|
|
118
|
-
|
|
159
|
+
// To avoid that k vanishes, we make k_cache at least as big as kr
|
|
160
|
+
const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
|
|
119
161
|
if (k_cache < k) {
|
|
120
162
|
k = k_cache - (k_cache % kr);
|
|
121
163
|
eigen_internal_assert(k > 0);
|
|
@@ -307,35 +349,60 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_
|
|
|
307
349
|
computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
|
|
308
350
|
}
|
|
309
351
|
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
|
|
318
|
-
{
|
|
319
|
-
c = cj.pmadd(a,b,c);
|
|
320
|
-
}
|
|
321
|
-
};
|
|
322
|
-
|
|
323
|
-
template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
|
|
324
|
-
EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
|
|
325
|
-
{
|
|
326
|
-
t = b; t = cj.pmul(a,t); c = padd(c,t);
|
|
327
|
-
}
|
|
328
|
-
};
|
|
352
|
+
template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
|
|
353
|
+
struct RhsPanelHelper {
|
|
354
|
+
private:
|
|
355
|
+
static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken;
|
|
356
|
+
public:
|
|
357
|
+
typedef typename conditional<remaining_registers>=4, RhsPacketx4, RhsPacket>::type type;
|
|
358
|
+
};
|
|
329
359
|
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
}
|
|
360
|
+
template <typename Packet>
|
|
361
|
+
struct QuadPacket
|
|
362
|
+
{
|
|
363
|
+
Packet B_0, B1, B2, B3;
|
|
364
|
+
const Packet& get(const FixedInt<0>&) const { return B_0; }
|
|
365
|
+
const Packet& get(const FixedInt<1>&) const { return B1; }
|
|
366
|
+
const Packet& get(const FixedInt<2>&) const { return B2; }
|
|
367
|
+
const Packet& get(const FixedInt<3>&) const { return B3; }
|
|
368
|
+
};
|
|
335
369
|
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
370
|
+
template <int N, typename T1, typename T2, typename T3>
|
|
371
|
+
struct packet_conditional { typedef T3 type; };
|
|
372
|
+
|
|
373
|
+
template <typename T1, typename T2, typename T3>
|
|
374
|
+
struct packet_conditional<GEBPPacketFull, T1, T2, T3> { typedef T1 type; };
|
|
375
|
+
|
|
376
|
+
template <typename T1, typename T2, typename T3>
|
|
377
|
+
struct packet_conditional<GEBPPacketHalf, T1, T2, T3> { typedef T2 type; };
|
|
378
|
+
|
|
379
|
+
#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \
|
|
380
|
+
typedef typename packet_conditional<packet_size, \
|
|
381
|
+
typename packet_traits<name ## Scalar>::type, \
|
|
382
|
+
typename packet_traits<name ## Scalar>::half, \
|
|
383
|
+
typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
|
|
384
|
+
prefix ## name ## Packet
|
|
385
|
+
|
|
386
|
+
#define PACKET_DECL_COND(name, packet_size) \
|
|
387
|
+
typedef typename packet_conditional<packet_size, \
|
|
388
|
+
typename packet_traits<name ## Scalar>::type, \
|
|
389
|
+
typename packet_traits<name ## Scalar>::half, \
|
|
390
|
+
typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
|
|
391
|
+
name ## Packet
|
|
392
|
+
|
|
393
|
+
#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \
|
|
394
|
+
typedef typename packet_conditional<packet_size, \
|
|
395
|
+
typename packet_traits<Scalar>::type, \
|
|
396
|
+
typename packet_traits<Scalar>::half, \
|
|
397
|
+
typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
|
|
398
|
+
prefix ## ScalarPacket
|
|
399
|
+
|
|
400
|
+
#define PACKET_DECL_COND_SCALAR(packet_size) \
|
|
401
|
+
typedef typename packet_conditional<packet_size, \
|
|
402
|
+
typename packet_traits<Scalar>::type, \
|
|
403
|
+
typename packet_traits<Scalar>::half, \
|
|
404
|
+
typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
|
|
405
|
+
ScalarPacket
|
|
339
406
|
|
|
340
407
|
/* Vectorization logic
|
|
341
408
|
* real*real: unpack rhs to constant packets, ...
|
|
@@ -347,7 +414,7 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_
|
|
|
347
414
|
* cplx*real : unpack rhs to constant packets, ...
|
|
348
415
|
* real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
|
|
349
416
|
*/
|
|
350
|
-
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
|
|
417
|
+
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
|
|
351
418
|
class gebp_traits
|
|
352
419
|
{
|
|
353
420
|
public:
|
|
@@ -355,13 +422,17 @@ public:
|
|
|
355
422
|
typedef _RhsScalar RhsScalar;
|
|
356
423
|
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
|
357
424
|
|
|
425
|
+
PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
|
|
426
|
+
PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
|
|
427
|
+
PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
|
|
428
|
+
|
|
358
429
|
enum {
|
|
359
430
|
ConjLhs = _ConjLhs,
|
|
360
431
|
ConjRhs = _ConjRhs,
|
|
361
|
-
Vectorizable =
|
|
362
|
-
LhsPacketSize = Vectorizable ?
|
|
363
|
-
RhsPacketSize = Vectorizable ?
|
|
364
|
-
ResPacketSize = Vectorizable ?
|
|
432
|
+
Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
|
|
433
|
+
LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
|
|
434
|
+
RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
|
|
435
|
+
ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
|
|
365
436
|
|
|
366
437
|
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
|
|
367
438
|
|
|
@@ -370,10 +441,12 @@ public:
|
|
|
370
441
|
|
|
371
442
|
// register block size along the M direction (currently, this one cannot be modified)
|
|
372
443
|
default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
|
|
373
|
-
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
|
|
374
|
-
|
|
444
|
+
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \
|
|
445
|
+
&& ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914))
|
|
446
|
+
// we assume 16 registers or more
|
|
375
447
|
// See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
|
|
376
448
|
// then using 3*LhsPacketSize triggers non-implemented paths in syrk.
|
|
449
|
+
// Bug 1515: MSVC prior to v19.14 leads to register spilling.
|
|
377
450
|
mr = Vectorizable ? 3*LhsPacketSize : default_mr,
|
|
378
451
|
#else
|
|
379
452
|
mr = default_mr,
|
|
@@ -383,37 +456,41 @@ public:
|
|
|
383
456
|
RhsProgress = 1
|
|
384
457
|
};
|
|
385
458
|
|
|
386
|
-
typedef typename packet_traits<LhsScalar>::type _LhsPacket;
|
|
387
|
-
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
|
|
388
|
-
typedef typename packet_traits<ResScalar>::type _ResPacket;
|
|
389
459
|
|
|
390
460
|
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
|
|
391
461
|
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
|
|
392
462
|
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
|
|
463
|
+
typedef LhsPacket LhsPacket4Packing;
|
|
393
464
|
|
|
465
|
+
typedef QuadPacket<RhsPacket> RhsPacketx4;
|
|
394
466
|
typedef ResPacket AccPacket;
|
|
395
467
|
|
|
396
468
|
EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
|
|
397
469
|
{
|
|
398
470
|
p = pset1<ResPacket>(ResScalar(0));
|
|
399
471
|
}
|
|
400
|
-
|
|
401
|
-
EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
|
|
402
|
-
{
|
|
403
|
-
pbroadcast4(b, b0, b1, b2, b3);
|
|
404
|
-
}
|
|
405
|
-
|
|
406
|
-
// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
|
|
407
|
-
// {
|
|
408
|
-
// pbroadcast2(b, b0, b1);
|
|
409
|
-
// }
|
|
410
|
-
|
|
472
|
+
|
|
411
473
|
template<typename RhsPacketType>
|
|
412
474
|
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
|
|
413
475
|
{
|
|
414
476
|
dest = pset1<RhsPacketType>(*b);
|
|
415
477
|
}
|
|
416
|
-
|
|
478
|
+
|
|
479
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
|
|
480
|
+
{
|
|
481
|
+
pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
|
|
482
|
+
}
|
|
483
|
+
|
|
484
|
+
template<typename RhsPacketType>
|
|
485
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
|
|
486
|
+
{
|
|
487
|
+
loadRhs(b, dest);
|
|
488
|
+
}
|
|
489
|
+
|
|
490
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
|
|
491
|
+
{
|
|
492
|
+
}
|
|
493
|
+
|
|
417
494
|
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
|
|
418
495
|
{
|
|
419
496
|
dest = ploadquad<RhsPacket>(b);
|
|
@@ -431,8 +508,8 @@ public:
|
|
|
431
508
|
dest = ploadu<LhsPacketType>(a);
|
|
432
509
|
}
|
|
433
510
|
|
|
434
|
-
template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
|
|
435
|
-
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c,
|
|
511
|
+
template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
512
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
|
|
436
513
|
{
|
|
437
514
|
conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
|
|
438
515
|
// It would be a lot cleaner to call pmadd all the time. Unfortunately if we
|
|
@@ -447,6 +524,12 @@ public:
|
|
|
447
524
|
#endif
|
|
448
525
|
}
|
|
449
526
|
|
|
527
|
+
template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
528
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
|
|
529
|
+
{
|
|
530
|
+
madd(a, b.get(lane), c, tmp, lane);
|
|
531
|
+
}
|
|
532
|
+
|
|
450
533
|
EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
|
|
451
534
|
{
|
|
452
535
|
r = pmadd(c,alpha,r);
|
|
@@ -460,21 +543,25 @@ public:
|
|
|
460
543
|
|
|
461
544
|
};
|
|
462
545
|
|
|
463
|
-
template<typename RealScalar, bool _ConjLhs>
|
|
464
|
-
class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
|
|
546
|
+
template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize>
|
|
547
|
+
class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize>
|
|
465
548
|
{
|
|
466
549
|
public:
|
|
467
550
|
typedef std::complex<RealScalar> LhsScalar;
|
|
468
551
|
typedef RealScalar RhsScalar;
|
|
469
552
|
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
|
470
553
|
|
|
554
|
+
PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
|
|
555
|
+
PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
|
|
556
|
+
PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
|
|
557
|
+
|
|
471
558
|
enum {
|
|
472
559
|
ConjLhs = _ConjLhs,
|
|
473
560
|
ConjRhs = false,
|
|
474
|
-
Vectorizable =
|
|
475
|
-
LhsPacketSize = Vectorizable ?
|
|
476
|
-
RhsPacketSize = Vectorizable ?
|
|
477
|
-
ResPacketSize = Vectorizable ?
|
|
561
|
+
Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
|
|
562
|
+
LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
|
|
563
|
+
RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
|
|
564
|
+
ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
|
|
478
565
|
|
|
479
566
|
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
|
|
480
567
|
nr = 4,
|
|
@@ -489,13 +576,12 @@ public:
|
|
|
489
576
|
RhsProgress = 1
|
|
490
577
|
};
|
|
491
578
|
|
|
492
|
-
typedef typename packet_traits<LhsScalar>::type _LhsPacket;
|
|
493
|
-
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
|
|
494
|
-
typedef typename packet_traits<ResScalar>::type _ResPacket;
|
|
495
|
-
|
|
496
579
|
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
|
|
497
580
|
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
|
|
498
581
|
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
|
|
582
|
+
typedef LhsPacket LhsPacket4Packing;
|
|
583
|
+
|
|
584
|
+
typedef QuadPacket<RhsPacket> RhsPacketx4;
|
|
499
585
|
|
|
500
586
|
typedef ResPacket AccPacket;
|
|
501
587
|
|
|
@@ -504,42 +590,64 @@ public:
|
|
|
504
590
|
p = pset1<ResPacket>(ResScalar(0));
|
|
505
591
|
}
|
|
506
592
|
|
|
507
|
-
|
|
593
|
+
template<typename RhsPacketType>
|
|
594
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
|
|
508
595
|
{
|
|
509
|
-
dest = pset1<
|
|
596
|
+
dest = pset1<RhsPacketType>(*b);
|
|
597
|
+
}
|
|
598
|
+
|
|
599
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
|
|
600
|
+
{
|
|
601
|
+
pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
|
|
510
602
|
}
|
|
603
|
+
|
|
604
|
+
template<typename RhsPacketType>
|
|
605
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
|
|
606
|
+
{
|
|
607
|
+
loadRhs(b, dest);
|
|
608
|
+
}
|
|
609
|
+
|
|
610
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
|
|
611
|
+
{}
|
|
511
612
|
|
|
512
613
|
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
|
|
513
614
|
{
|
|
514
|
-
dest
|
|
615
|
+
loadRhsQuad_impl(b,dest, typename conditional<RhsPacketSize==16,true_type,false_type>::type());
|
|
515
616
|
}
|
|
516
617
|
|
|
517
|
-
EIGEN_STRONG_INLINE void
|
|
618
|
+
EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const
|
|
518
619
|
{
|
|
519
|
-
|
|
620
|
+
// FIXME we can do better!
|
|
621
|
+
// what we want here is a ploadheight
|
|
622
|
+
RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
|
|
623
|
+
dest = ploadquad<RhsPacket>(tmp);
|
|
520
624
|
}
|
|
521
625
|
|
|
522
|
-
EIGEN_STRONG_INLINE void
|
|
626
|
+
EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const
|
|
523
627
|
{
|
|
524
|
-
|
|
628
|
+
eigen_internal_assert(RhsPacketSize<=8);
|
|
629
|
+
dest = pset1<RhsPacket>(*b);
|
|
525
630
|
}
|
|
526
631
|
|
|
527
|
-
EIGEN_STRONG_INLINE void
|
|
632
|
+
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
|
|
528
633
|
{
|
|
529
|
-
|
|
634
|
+
dest = pload<LhsPacket>(a);
|
|
530
635
|
}
|
|
531
|
-
|
|
532
|
-
// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
|
|
533
|
-
// {
|
|
534
|
-
// pbroadcast2(b, b0, b1);
|
|
535
|
-
// }
|
|
536
636
|
|
|
537
|
-
|
|
637
|
+
template<typename LhsPacketType>
|
|
638
|
+
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
|
|
639
|
+
{
|
|
640
|
+
dest = ploadu<LhsPacketType>(a);
|
|
641
|
+
}
|
|
642
|
+
|
|
643
|
+
template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
644
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
|
|
538
645
|
{
|
|
539
646
|
madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
|
|
540
647
|
}
|
|
541
648
|
|
|
542
|
-
|
|
649
|
+
template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
|
|
650
|
+
EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
|
|
543
651
|
{
|
|
544
652
|
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
|
545
653
|
EIGEN_UNUSED_VARIABLE(tmp);
|
|
@@ -554,13 +662,20 @@ public:
|
|
|
554
662
|
c += a * b;
|
|
555
663
|
}
|
|
556
664
|
|
|
557
|
-
|
|
665
|
+
template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
666
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
|
|
558
667
|
{
|
|
668
|
+
madd(a, b.get(lane), c, tmp, lane);
|
|
669
|
+
}
|
|
670
|
+
|
|
671
|
+
template <typename ResPacketType, typename AccPacketType>
|
|
672
|
+
EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
|
|
673
|
+
{
|
|
674
|
+
conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
|
|
559
675
|
r = cj.pmadd(c,alpha,r);
|
|
560
676
|
}
|
|
561
677
|
|
|
562
678
|
protected:
|
|
563
|
-
conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
|
|
564
679
|
};
|
|
565
680
|
|
|
566
681
|
template<typename Packet>
|
|
@@ -579,13 +694,57 @@ DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Pack
|
|
|
579
694
|
return res;
|
|
580
695
|
}
|
|
581
696
|
|
|
697
|
+
// note that for DoublePacket<RealPacket> the "4" in "downto4"
|
|
698
|
+
// corresponds to the number of complexes, so it means "8"
|
|
699
|
+
// in terms of real coefficients.
|
|
700
|
+
|
|
582
701
|
template<typename Packet>
|
|
583
|
-
const DoublePacket<Packet>&
|
|
702
|
+
const DoublePacket<Packet>&
|
|
703
|
+
predux_half_dowto4(const DoublePacket<Packet> &a,
|
|
704
|
+
typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0)
|
|
584
705
|
{
|
|
585
706
|
return a;
|
|
586
707
|
}
|
|
587
708
|
|
|
588
|
-
template<typename Packet>
|
|
709
|
+
template<typename Packet>
|
|
710
|
+
DoublePacket<typename unpacket_traits<Packet>::half>
|
|
711
|
+
predux_half_dowto4(const DoublePacket<Packet> &a,
|
|
712
|
+
typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0)
|
|
713
|
+
{
|
|
714
|
+
// yes, that's pretty hackish :(
|
|
715
|
+
DoublePacket<typename unpacket_traits<Packet>::half> res;
|
|
716
|
+
typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
|
|
717
|
+
typedef typename packet_traits<Cplx>::type CplxPacket;
|
|
718
|
+
res.first = predux_half_dowto4(CplxPacket(a.first)).v;
|
|
719
|
+
res.second = predux_half_dowto4(CplxPacket(a.second)).v;
|
|
720
|
+
return res;
|
|
721
|
+
}
|
|
722
|
+
|
|
723
|
+
// same here, "quad" actually means "8" in terms of real coefficients
|
|
724
|
+
template<typename Scalar, typename RealPacket>
|
|
725
|
+
void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
|
|
726
|
+
typename enable_if<unpacket_traits<RealPacket>::size<=8>::type* = 0)
|
|
727
|
+
{
|
|
728
|
+
dest.first = pset1<RealPacket>(numext::real(*b));
|
|
729
|
+
dest.second = pset1<RealPacket>(numext::imag(*b));
|
|
730
|
+
}
|
|
731
|
+
|
|
732
|
+
template<typename Scalar, typename RealPacket>
|
|
733
|
+
void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
|
|
734
|
+
typename enable_if<unpacket_traits<RealPacket>::size==16>::type* = 0)
|
|
735
|
+
{
|
|
736
|
+
// yes, that's pretty hackish too :(
|
|
737
|
+
typedef typename NumTraits<Scalar>::Real RealScalar;
|
|
738
|
+
RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
|
|
739
|
+
RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
|
|
740
|
+
dest.first = ploadquad<RealPacket>(r);
|
|
741
|
+
dest.second = ploadquad<RealPacket>(i);
|
|
742
|
+
}
|
|
743
|
+
|
|
744
|
+
|
|
745
|
+
template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > {
|
|
746
|
+
typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
|
|
747
|
+
};
|
|
589
748
|
// template<typename Packet>
|
|
590
749
|
// DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
|
|
591
750
|
// {
|
|
@@ -595,8 +754,8 @@ template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typede
|
|
|
595
754
|
// return res;
|
|
596
755
|
// }
|
|
597
756
|
|
|
598
|
-
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
|
|
599
|
-
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
|
|
757
|
+
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
|
|
758
|
+
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize >
|
|
600
759
|
{
|
|
601
760
|
public:
|
|
602
761
|
typedef std::complex<RealScalar> Scalar;
|
|
@@ -604,15 +763,21 @@ public:
|
|
|
604
763
|
typedef std::complex<RealScalar> RhsScalar;
|
|
605
764
|
typedef std::complex<RealScalar> ResScalar;
|
|
606
765
|
|
|
766
|
+
PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
|
|
767
|
+
PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
|
|
768
|
+
PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
|
|
769
|
+
PACKET_DECL_COND(Real, _PacketSize);
|
|
770
|
+
PACKET_DECL_COND_SCALAR(_PacketSize);
|
|
771
|
+
|
|
607
772
|
enum {
|
|
608
773
|
ConjLhs = _ConjLhs,
|
|
609
774
|
ConjRhs = _ConjRhs,
|
|
610
|
-
Vectorizable =
|
|
611
|
-
&&
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
775
|
+
Vectorizable = unpacket_traits<RealPacket>::vectorizable
|
|
776
|
+
&& unpacket_traits<ScalarPacket>::vectorizable,
|
|
777
|
+
ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
|
|
778
|
+
LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
|
|
779
|
+
RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
|
|
780
|
+
RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
|
|
616
781
|
|
|
617
782
|
// FIXME: should depend on NumberOfRegisters
|
|
618
783
|
nr = 4,
|
|
@@ -622,14 +787,16 @@ public:
|
|
|
622
787
|
RhsProgress = 1
|
|
623
788
|
};
|
|
624
789
|
|
|
625
|
-
typedef
|
|
626
|
-
typedef typename packet_traits<Scalar>::type ScalarPacket;
|
|
627
|
-
typedef DoublePacket<RealPacket> DoublePacketType;
|
|
790
|
+
typedef DoublePacket<RealPacket> DoublePacketType;
|
|
628
791
|
|
|
792
|
+
typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type LhsPacket4Packing;
|
|
629
793
|
typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket;
|
|
630
794
|
typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
|
|
631
795
|
typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
|
|
632
796
|
typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
|
|
797
|
+
|
|
798
|
+
// this actually holds 8 packets!
|
|
799
|
+
typedef QuadPacket<RhsPacket> RhsPacketx4;
|
|
633
800
|
|
|
634
801
|
EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
|
|
635
802
|
|
|
@@ -640,51 +807,49 @@ public:
|
|
|
640
807
|
}
|
|
641
808
|
|
|
642
809
|
// Scalar path
|
|
643
|
-
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b,
|
|
810
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const
|
|
644
811
|
{
|
|
645
|
-
dest = pset1<
|
|
812
|
+
dest = pset1<ScalarPacket>(*b);
|
|
646
813
|
}
|
|
647
814
|
|
|
648
815
|
// Vectorized path
|
|
649
|
-
|
|
816
|
+
template<typename RealPacketType>
|
|
817
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
|
|
650
818
|
{
|
|
651
|
-
dest.first = pset1<
|
|
652
|
-
dest.second = pset1<
|
|
819
|
+
dest.first = pset1<RealPacketType>(numext::real(*b));
|
|
820
|
+
dest.second = pset1<RealPacketType>(numext::imag(*b));
|
|
653
821
|
}
|
|
654
|
-
|
|
655
|
-
EIGEN_STRONG_INLINE void
|
|
822
|
+
|
|
823
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
|
|
656
824
|
{
|
|
657
|
-
loadRhs(b,dest);
|
|
825
|
+
loadRhs(b, dest.B_0);
|
|
826
|
+
loadRhs(b + 1, dest.B1);
|
|
827
|
+
loadRhs(b + 2, dest.B2);
|
|
828
|
+
loadRhs(b + 3, dest.B3);
|
|
658
829
|
}
|
|
659
|
-
|
|
830
|
+
|
|
831
|
+
// Scalar path
|
|
832
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const
|
|
660
833
|
{
|
|
661
|
-
|
|
662
|
-
loadRhs(b,dest);
|
|
834
|
+
loadRhs(b, dest);
|
|
663
835
|
}
|
|
664
|
-
|
|
665
|
-
|
|
836
|
+
|
|
837
|
+
// Vectorized path
|
|
838
|
+
template<typename RealPacketType>
|
|
839
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
|
|
666
840
|
{
|
|
667
|
-
|
|
668
|
-
loadRhs(b+0, b0);
|
|
669
|
-
loadRhs(b+1, b1);
|
|
670
|
-
loadRhs(b+2, b2);
|
|
671
|
-
loadRhs(b+3, b3);
|
|
841
|
+
loadRhs(b, dest);
|
|
672
842
|
}
|
|
843
|
+
|
|
844
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
|
|
673
845
|
|
|
674
|
-
|
|
675
|
-
EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
|
|
846
|
+
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
|
|
676
847
|
{
|
|
677
|
-
|
|
678
|
-
loadRhs(b+0, b0);
|
|
679
|
-
loadRhs(b+1, b1);
|
|
848
|
+
loadRhs(b,dest);
|
|
680
849
|
}
|
|
681
|
-
|
|
682
|
-
// Scalar path
|
|
683
|
-
EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
|
|
850
|
+
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
|
|
684
851
|
{
|
|
685
|
-
|
|
686
|
-
loadRhs(b+0, b0);
|
|
687
|
-
loadRhs(b+1, b1);
|
|
852
|
+
loadQuadToDoublePacket(b,dest);
|
|
688
853
|
}
|
|
689
854
|
|
|
690
855
|
// nothing special here
|
|
@@ -693,47 +858,59 @@ public:
|
|
|
693
858
|
dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
|
|
694
859
|
}
|
|
695
860
|
|
|
696
|
-
|
|
861
|
+
template<typename LhsPacketType>
|
|
862
|
+
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
|
|
697
863
|
{
|
|
698
|
-
dest = ploadu<
|
|
864
|
+
dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
|
|
699
865
|
}
|
|
700
866
|
|
|
701
|
-
|
|
867
|
+
template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType>
|
|
868
|
+
EIGEN_STRONG_INLINE
|
|
869
|
+
typename enable_if<!is_same<RhsPacketType,RhsPacketx4>::value>::type
|
|
870
|
+
madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const
|
|
702
871
|
{
|
|
703
872
|
c.first = padd(pmul(a,b.first), c.first);
|
|
704
873
|
c.second = padd(pmul(a,b.second),c.second);
|
|
705
874
|
}
|
|
706
875
|
|
|
707
|
-
|
|
876
|
+
template<typename LaneIdType>
|
|
877
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const
|
|
708
878
|
{
|
|
709
879
|
c = cj.pmadd(a,b,c);
|
|
710
880
|
}
|
|
881
|
+
|
|
882
|
+
template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
883
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
|
|
884
|
+
{
|
|
885
|
+
madd(a, b.get(lane), c, tmp, lane);
|
|
886
|
+
}
|
|
711
887
|
|
|
712
888
|
EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
|
|
713
889
|
|
|
714
|
-
|
|
890
|
+
template<typename RealPacketType, typename ResPacketType>
|
|
891
|
+
EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha, ResPacketType& r) const
|
|
715
892
|
{
|
|
716
893
|
// assemble c
|
|
717
|
-
|
|
894
|
+
ResPacketType tmp;
|
|
718
895
|
if((!ConjLhs)&&(!ConjRhs))
|
|
719
896
|
{
|
|
720
|
-
tmp = pcplxflip(pconj(
|
|
721
|
-
tmp = padd(
|
|
897
|
+
tmp = pcplxflip(pconj(ResPacketType(c.second)));
|
|
898
|
+
tmp = padd(ResPacketType(c.first),tmp);
|
|
722
899
|
}
|
|
723
900
|
else if((!ConjLhs)&&(ConjRhs))
|
|
724
901
|
{
|
|
725
|
-
tmp = pconj(pcplxflip(
|
|
726
|
-
tmp = padd(
|
|
902
|
+
tmp = pconj(pcplxflip(ResPacketType(c.second)));
|
|
903
|
+
tmp = padd(ResPacketType(c.first),tmp);
|
|
727
904
|
}
|
|
728
905
|
else if((ConjLhs)&&(!ConjRhs))
|
|
729
906
|
{
|
|
730
|
-
tmp = pcplxflip(
|
|
731
|
-
tmp = padd(pconj(
|
|
907
|
+
tmp = pcplxflip(ResPacketType(c.second));
|
|
908
|
+
tmp = padd(pconj(ResPacketType(c.first)),tmp);
|
|
732
909
|
}
|
|
733
910
|
else if((ConjLhs)&&(ConjRhs))
|
|
734
911
|
{
|
|
735
|
-
tmp = pcplxflip(
|
|
736
|
-
tmp = psub(pconj(
|
|
912
|
+
tmp = pcplxflip(ResPacketType(c.second));
|
|
913
|
+
tmp = psub(pconj(ResPacketType(c.first)),tmp);
|
|
737
914
|
}
|
|
738
915
|
|
|
739
916
|
r = pmadd(tmp,alpha,r);
|
|
@@ -743,8 +920,8 @@ protected:
|
|
|
743
920
|
conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
|
|
744
921
|
};
|
|
745
922
|
|
|
746
|
-
template<typename RealScalar, bool _ConjRhs>
|
|
747
|
-
class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
|
|
923
|
+
template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize>
|
|
924
|
+
class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize >
|
|
748
925
|
{
|
|
749
926
|
public:
|
|
750
927
|
typedef std::complex<RealScalar> Scalar;
|
|
@@ -752,14 +929,25 @@ public:
|
|
|
752
929
|
typedef Scalar RhsScalar;
|
|
753
930
|
typedef Scalar ResScalar;
|
|
754
931
|
|
|
932
|
+
PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
|
|
933
|
+
PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
|
|
934
|
+
PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
|
|
935
|
+
PACKET_DECL_COND_PREFIX(_, Real, _PacketSize);
|
|
936
|
+
PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize);
|
|
937
|
+
|
|
938
|
+
#undef PACKET_DECL_COND_SCALAR_PREFIX
|
|
939
|
+
#undef PACKET_DECL_COND_PREFIX
|
|
940
|
+
#undef PACKET_DECL_COND_SCALAR
|
|
941
|
+
#undef PACKET_DECL_COND
|
|
942
|
+
|
|
755
943
|
enum {
|
|
756
944
|
ConjLhs = false,
|
|
757
945
|
ConjRhs = _ConjRhs,
|
|
758
|
-
Vectorizable =
|
|
759
|
-
&&
|
|
760
|
-
LhsPacketSize = Vectorizable ?
|
|
761
|
-
RhsPacketSize = Vectorizable ?
|
|
762
|
-
ResPacketSize = Vectorizable ?
|
|
946
|
+
Vectorizable = unpacket_traits<_RealPacket>::vectorizable
|
|
947
|
+
&& unpacket_traits<_ScalarPacket>::vectorizable,
|
|
948
|
+
LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
|
|
949
|
+
RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
|
|
950
|
+
ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
|
|
763
951
|
|
|
764
952
|
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
|
|
765
953
|
// FIXME: should depend on NumberOfRegisters
|
|
@@ -770,14 +958,11 @@ public:
|
|
|
770
958
|
RhsProgress = 1
|
|
771
959
|
};
|
|
772
960
|
|
|
773
|
-
typedef typename packet_traits<LhsScalar>::type _LhsPacket;
|
|
774
|
-
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
|
|
775
|
-
typedef typename packet_traits<ResScalar>::type _ResPacket;
|
|
776
|
-
|
|
777
961
|
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
|
|
778
962
|
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
|
|
779
963
|
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
|
|
780
|
-
|
|
964
|
+
typedef LhsPacket LhsPacket4Packing;
|
|
965
|
+
typedef QuadPacket<RhsPacket> RhsPacketx4;
|
|
781
966
|
typedef ResPacket AccPacket;
|
|
782
967
|
|
|
783
968
|
EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
|
|
@@ -785,22 +970,25 @@ public:
|
|
|
785
970
|
p = pset1<ResPacket>(ResScalar(0));
|
|
786
971
|
}
|
|
787
972
|
|
|
788
|
-
|
|
973
|
+
template<typename RhsPacketType>
|
|
974
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
|
|
789
975
|
{
|
|
790
|
-
dest = pset1<
|
|
976
|
+
dest = pset1<RhsPacketType>(*b);
|
|
791
977
|
}
|
|
792
|
-
|
|
793
|
-
void
|
|
978
|
+
|
|
979
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
|
|
794
980
|
{
|
|
795
|
-
pbroadcast4(b,
|
|
981
|
+
pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
|
|
796
982
|
}
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
983
|
+
|
|
984
|
+
template<typename RhsPacketType>
|
|
985
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
|
|
986
|
+
{
|
|
987
|
+
loadRhs(b, dest);
|
|
988
|
+
}
|
|
989
|
+
|
|
990
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
|
|
991
|
+
{}
|
|
804
992
|
|
|
805
993
|
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
|
|
806
994
|
{
|
|
@@ -809,21 +997,23 @@ public:
|
|
|
809
997
|
|
|
810
998
|
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
|
|
811
999
|
{
|
|
812
|
-
|
|
813
|
-
loadRhs(b,dest);
|
|
1000
|
+
dest = ploadquad<RhsPacket>(b);
|
|
814
1001
|
}
|
|
815
1002
|
|
|
816
|
-
|
|
1003
|
+
template<typename LhsPacketType>
|
|
1004
|
+
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
|
|
817
1005
|
{
|
|
818
|
-
dest = ploaddup<
|
|
1006
|
+
dest = ploaddup<LhsPacketType>(a);
|
|
819
1007
|
}
|
|
820
1008
|
|
|
821
|
-
|
|
1009
|
+
template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
1010
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
|
|
822
1011
|
{
|
|
823
1012
|
madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
|
|
824
1013
|
}
|
|
825
1014
|
|
|
826
|
-
|
|
1015
|
+
template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
|
|
1016
|
+
EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
|
|
827
1017
|
{
|
|
828
1018
|
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
|
829
1019
|
EIGEN_UNUSED_VARIABLE(tmp);
|
|
@@ -839,16 +1029,24 @@ public:
|
|
|
839
1029
|
c += a * b;
|
|
840
1030
|
}
|
|
841
1031
|
|
|
842
|
-
|
|
1032
|
+
template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
1033
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
|
|
1034
|
+
{
|
|
1035
|
+
madd(a, b.get(lane), c, tmp, lane);
|
|
1036
|
+
}
|
|
1037
|
+
|
|
1038
|
+
template <typename ResPacketType, typename AccPacketType>
|
|
1039
|
+
EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
|
|
843
1040
|
{
|
|
1041
|
+
conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
|
|
844
1042
|
r = cj.pmadd(alpha,c,r);
|
|
845
1043
|
}
|
|
846
1044
|
|
|
847
1045
|
protected:
|
|
848
|
-
|
|
1046
|
+
|
|
849
1047
|
};
|
|
850
1048
|
|
|
851
|
-
/* optimized
|
|
1049
|
+
/* optimized General packed Block * packed Panel product kernel
|
|
852
1050
|
*
|
|
853
1051
|
* Mixing type logic: C += A * B
|
|
854
1052
|
* | A | B | comments
|
|
@@ -858,26 +1056,47 @@ protected:
|
|
|
858
1056
|
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
|
|
859
1057
|
struct gebp_kernel
|
|
860
1058
|
{
|
|
861
|
-
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
|
|
1059
|
+
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
|
|
1060
|
+
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketHalf> HalfTraits;
|
|
1061
|
+
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketQuarter> QuarterTraits;
|
|
1062
|
+
|
|
862
1063
|
typedef typename Traits::ResScalar ResScalar;
|
|
863
1064
|
typedef typename Traits::LhsPacket LhsPacket;
|
|
864
1065
|
typedef typename Traits::RhsPacket RhsPacket;
|
|
865
1066
|
typedef typename Traits::ResPacket ResPacket;
|
|
866
1067
|
typedef typename Traits::AccPacket AccPacket;
|
|
1068
|
+
typedef typename Traits::RhsPacketx4 RhsPacketx4;
|
|
1069
|
+
|
|
1070
|
+
typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
|
|
1071
|
+
|
|
1072
|
+
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
|
|
867
1073
|
|
|
868
|
-
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
|
|
869
1074
|
typedef typename SwappedTraits::ResScalar SResScalar;
|
|
870
1075
|
typedef typename SwappedTraits::LhsPacket SLhsPacket;
|
|
871
1076
|
typedef typename SwappedTraits::RhsPacket SRhsPacket;
|
|
872
1077
|
typedef typename SwappedTraits::ResPacket SResPacket;
|
|
873
1078
|
typedef typename SwappedTraits::AccPacket SAccPacket;
|
|
874
1079
|
|
|
1080
|
+
typedef typename HalfTraits::LhsPacket LhsPacketHalf;
|
|
1081
|
+
typedef typename HalfTraits::RhsPacket RhsPacketHalf;
|
|
1082
|
+
typedef typename HalfTraits::ResPacket ResPacketHalf;
|
|
1083
|
+
typedef typename HalfTraits::AccPacket AccPacketHalf;
|
|
1084
|
+
|
|
1085
|
+
typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
|
|
1086
|
+
typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
|
|
1087
|
+
typedef typename QuarterTraits::ResPacket ResPacketQuarter;
|
|
1088
|
+
typedef typename QuarterTraits::AccPacket AccPacketQuarter;
|
|
1089
|
+
|
|
875
1090
|
typedef typename DataMapper::LinearMapper LinearMapper;
|
|
876
1091
|
|
|
877
1092
|
enum {
|
|
878
1093
|
Vectorizable = Traits::Vectorizable,
|
|
879
1094
|
LhsProgress = Traits::LhsProgress,
|
|
1095
|
+
LhsProgressHalf = HalfTraits::LhsProgress,
|
|
1096
|
+
LhsProgressQuarter = QuarterTraits::LhsProgress,
|
|
880
1097
|
RhsProgress = Traits::RhsProgress,
|
|
1098
|
+
RhsProgressHalf = HalfTraits::RhsProgress,
|
|
1099
|
+
RhsProgressQuarter = QuarterTraits::RhsProgress,
|
|
881
1100
|
ResPacketSize = Traits::ResPacketSize
|
|
882
1101
|
};
|
|
883
1102
|
|
|
@@ -887,6 +1106,299 @@ struct gebp_kernel
|
|
|
887
1106
|
Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
|
|
888
1107
|
};
|
|
889
1108
|
|
|
1109
|
+
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs,
|
|
1110
|
+
int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress>
|
|
1111
|
+
struct last_row_process_16_packets
|
|
1112
|
+
{
|
|
1113
|
+
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
|
|
1114
|
+
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
|
|
1115
|
+
|
|
1116
|
+
typedef typename Traits::ResScalar ResScalar;
|
|
1117
|
+
typedef typename SwappedTraits::LhsPacket SLhsPacket;
|
|
1118
|
+
typedef typename SwappedTraits::RhsPacket SRhsPacket;
|
|
1119
|
+
typedef typename SwappedTraits::ResPacket SResPacket;
|
|
1120
|
+
typedef typename SwappedTraits::AccPacket SAccPacket;
|
|
1121
|
+
|
|
1122
|
+
EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
|
|
1123
|
+
const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
|
|
1124
|
+
ResScalar alpha, SAccPacket &C0)
|
|
1125
|
+
{
|
|
1126
|
+
EIGEN_UNUSED_VARIABLE(res);
|
|
1127
|
+
EIGEN_UNUSED_VARIABLE(straits);
|
|
1128
|
+
EIGEN_UNUSED_VARIABLE(blA);
|
|
1129
|
+
EIGEN_UNUSED_VARIABLE(blB);
|
|
1130
|
+
EIGEN_UNUSED_VARIABLE(depth);
|
|
1131
|
+
EIGEN_UNUSED_VARIABLE(endk);
|
|
1132
|
+
EIGEN_UNUSED_VARIABLE(i);
|
|
1133
|
+
EIGEN_UNUSED_VARIABLE(j2);
|
|
1134
|
+
EIGEN_UNUSED_VARIABLE(alpha);
|
|
1135
|
+
EIGEN_UNUSED_VARIABLE(C0);
|
|
1136
|
+
}
|
|
1137
|
+
};
|
|
1138
|
+
|
|
1139
|
+
|
|
1140
|
+
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
|
|
1141
|
+
struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> {
|
|
1142
|
+
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
|
|
1143
|
+
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
|
|
1144
|
+
|
|
1145
|
+
typedef typename Traits::ResScalar ResScalar;
|
|
1146
|
+
typedef typename SwappedTraits::LhsPacket SLhsPacket;
|
|
1147
|
+
typedef typename SwappedTraits::RhsPacket SRhsPacket;
|
|
1148
|
+
typedef typename SwappedTraits::ResPacket SResPacket;
|
|
1149
|
+
typedef typename SwappedTraits::AccPacket SAccPacket;
|
|
1150
|
+
|
|
1151
|
+
EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
|
|
1152
|
+
const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
|
|
1153
|
+
ResScalar alpha, SAccPacket &C0)
|
|
1154
|
+
{
|
|
1155
|
+
typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
|
|
1156
|
+
typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
|
|
1157
|
+
typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
|
|
1158
|
+
typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter;
|
|
1159
|
+
|
|
1160
|
+
SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
|
|
1161
|
+
SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);
|
|
1162
|
+
|
|
1163
|
+
if (depth - endk > 0)
|
|
1164
|
+
{
|
|
1165
|
+
// We have to handle the last row(s) of the rhs, which
|
|
1166
|
+
// correspond to a quarter-packet
|
|
1167
|
+
SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));
|
|
1168
|
+
|
|
1169
|
+
for (Index kk = endk; kk < depth; kk++)
|
|
1170
|
+
{
|
|
1171
|
+
SLhsPacketQuarter a0;
|
|
1172
|
+
SRhsPacketQuarter b0;
|
|
1173
|
+
straits.loadLhsUnaligned(blB, a0);
|
|
1174
|
+
straits.loadRhs(blA, b0);
|
|
1175
|
+
straits.madd(a0,b0,c0,b0, fix<0>);
|
|
1176
|
+
blB += SwappedTraits::LhsProgress/4;
|
|
1177
|
+
blA += 1;
|
|
1178
|
+
}
|
|
1179
|
+
straits.acc(c0, alphav, R);
|
|
1180
|
+
}
|
|
1181
|
+
else
|
|
1182
|
+
{
|
|
1183
|
+
straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
|
|
1184
|
+
}
|
|
1185
|
+
res.scatterPacket(i, j2, R);
|
|
1186
|
+
}
|
|
1187
|
+
};
|
|
1188
|
+
|
|
1189
|
+
template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
|
|
1190
|
+
struct lhs_process_one_packet
|
|
1191
|
+
{
|
|
1192
|
+
typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
|
|
1193
|
+
|
|
1194
|
+
EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
|
|
1195
|
+
{
|
|
1196
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
|
|
1197
|
+
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
|
|
1198
|
+
traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
|
|
1199
|
+
traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
|
|
1200
|
+
traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
|
|
1201
|
+
traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
|
|
1202
|
+
traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
|
|
1203
|
+
traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
|
|
1204
|
+
#if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
|
|
1205
|
+
__asm__ ("" : "+x,m" (*A0));
|
|
1206
|
+
#endif
|
|
1207
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
|
|
1208
|
+
}
|
|
1209
|
+
|
|
1210
|
+
EIGEN_STRONG_INLINE void operator()(
|
|
1211
|
+
const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
|
|
1212
|
+
Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB,
|
|
1213
|
+
int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
|
|
1214
|
+
{
|
|
1215
|
+
GEBPTraits traits;
|
|
1216
|
+
|
|
1217
|
+
// loops on each largest micro horizontal panel of lhs
|
|
1218
|
+
// (LhsProgress x depth)
|
|
1219
|
+
for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
|
|
1220
|
+
{
|
|
1221
|
+
// loops on each largest micro vertical panel of rhs (depth * nr)
|
|
1222
|
+
for(Index j2=0; j2<packet_cols4; j2+=nr)
|
|
1223
|
+
{
|
|
1224
|
+
// We select a LhsProgress x nr micro block of res
|
|
1225
|
+
// which is entirely stored into 1 x nr registers.
|
|
1226
|
+
|
|
1227
|
+
const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
|
|
1228
|
+
prefetch(&blA[0]);
|
|
1229
|
+
|
|
1230
|
+
// gets res block as register
|
|
1231
|
+
AccPacket C0, C1, C2, C3;
|
|
1232
|
+
traits.initAcc(C0);
|
|
1233
|
+
traits.initAcc(C1);
|
|
1234
|
+
traits.initAcc(C2);
|
|
1235
|
+
traits.initAcc(C3);
|
|
1236
|
+
// To improve instruction pipelining, let's double the accumulation registers:
|
|
1237
|
+
// even k will accumulate in C*, while odd k will accumulate in D*.
|
|
1238
|
+
// This trick is crucial to get good performance with FMA, otherwise it is
|
|
1239
|
+
// actually faster to perform separated MUL+ADD because of a naturally
|
|
1240
|
+
// better instruction-level parallelism.
|
|
1241
|
+
AccPacket D0, D1, D2, D3;
|
|
1242
|
+
traits.initAcc(D0);
|
|
1243
|
+
traits.initAcc(D1);
|
|
1244
|
+
traits.initAcc(D2);
|
|
1245
|
+
traits.initAcc(D3);
|
|
1246
|
+
|
|
1247
|
+
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
|
|
1248
|
+
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
|
|
1249
|
+
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
|
|
1250
|
+
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
|
|
1251
|
+
|
|
1252
|
+
r0.prefetch(prefetch_res_offset);
|
|
1253
|
+
r1.prefetch(prefetch_res_offset);
|
|
1254
|
+
r2.prefetch(prefetch_res_offset);
|
|
1255
|
+
r3.prefetch(prefetch_res_offset);
|
|
1256
|
+
|
|
1257
|
+
// performs "inner" products
|
|
1258
|
+
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
|
|
1259
|
+
prefetch(&blB[0]);
|
|
1260
|
+
LhsPacket A0, A1;
|
|
1261
|
+
|
|
1262
|
+
for(Index k=0; k<peeled_kc; k+=pk)
|
|
1263
|
+
{
|
|
1264
|
+
EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
|
|
1265
|
+
RhsPacketx4 rhs_panel;
|
|
1266
|
+
RhsPacket T0;
|
|
1267
|
+
|
|
1268
|
+
internal::prefetch(blB+(48+0));
|
|
1269
|
+
peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
|
|
1270
|
+
peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
|
|
1271
|
+
peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
|
|
1272
|
+
peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
|
|
1273
|
+
internal::prefetch(blB+(48+16));
|
|
1274
|
+
peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
|
|
1275
|
+
peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
|
|
1276
|
+
peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
|
|
1277
|
+
peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
|
|
1278
|
+
|
|
1279
|
+
blB += pk*4*RhsProgress;
|
|
1280
|
+
blA += pk*LhsProgress;
|
|
1281
|
+
|
|
1282
|
+
EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
|
|
1283
|
+
}
|
|
1284
|
+
C0 = padd(C0,D0);
|
|
1285
|
+
C1 = padd(C1,D1);
|
|
1286
|
+
C2 = padd(C2,D2);
|
|
1287
|
+
C3 = padd(C3,D3);
|
|
1288
|
+
|
|
1289
|
+
// process remaining peeled loop
|
|
1290
|
+
for(Index k=peeled_kc; k<depth; k++)
|
|
1291
|
+
{
|
|
1292
|
+
RhsPacketx4 rhs_panel;
|
|
1293
|
+
RhsPacket T0;
|
|
1294
|
+
peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
|
|
1295
|
+
blB += 4*RhsProgress;
|
|
1296
|
+
blA += LhsProgress;
|
|
1297
|
+
}
|
|
1298
|
+
|
|
1299
|
+
ResPacket R0, R1;
|
|
1300
|
+
ResPacket alphav = pset1<ResPacket>(alpha);
|
|
1301
|
+
|
|
1302
|
+
R0 = r0.template loadPacket<ResPacket>(0);
|
|
1303
|
+
R1 = r1.template loadPacket<ResPacket>(0);
|
|
1304
|
+
traits.acc(C0, alphav, R0);
|
|
1305
|
+
traits.acc(C1, alphav, R1);
|
|
1306
|
+
r0.storePacket(0, R0);
|
|
1307
|
+
r1.storePacket(0, R1);
|
|
1308
|
+
|
|
1309
|
+
R0 = r2.template loadPacket<ResPacket>(0);
|
|
1310
|
+
R1 = r3.template loadPacket<ResPacket>(0);
|
|
1311
|
+
traits.acc(C2, alphav, R0);
|
|
1312
|
+
traits.acc(C3, alphav, R1);
|
|
1313
|
+
r2.storePacket(0, R0);
|
|
1314
|
+
r3.storePacket(0, R1);
|
|
1315
|
+
}
|
|
1316
|
+
|
|
1317
|
+
// Deal with remaining columns of the rhs
|
|
1318
|
+
for(Index j2=packet_cols4; j2<cols; j2++)
|
|
1319
|
+
{
|
|
1320
|
+
// One column at a time
|
|
1321
|
+
const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
|
|
1322
|
+
prefetch(&blA[0]);
|
|
1323
|
+
|
|
1324
|
+
// gets res block as register
|
|
1325
|
+
AccPacket C0;
|
|
1326
|
+
traits.initAcc(C0);
|
|
1327
|
+
|
|
1328
|
+
LinearMapper r0 = res.getLinearMapper(i, j2);
|
|
1329
|
+
|
|
1330
|
+
// performs "inner" products
|
|
1331
|
+
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
|
|
1332
|
+
LhsPacket A0;
|
|
1333
|
+
|
|
1334
|
+
for(Index k= 0; k<peeled_kc; k+=pk)
|
|
1335
|
+
{
|
|
1336
|
+
EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
|
|
1337
|
+
RhsPacket B_0;
|
|
1338
|
+
|
|
1339
|
+
#define EIGEN_GEBGP_ONESTEP(K) \
|
|
1340
|
+
do { \
|
|
1341
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
|
|
1342
|
+
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
|
|
1343
|
+
/* FIXME: why unaligned???? */ \
|
|
1344
|
+
traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
|
|
1345
|
+
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
|
|
1346
|
+
traits.madd(A0, B_0, C0, B_0, fix<0>); \
|
|
1347
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
|
|
1348
|
+
} while(false);
|
|
1349
|
+
|
|
1350
|
+
EIGEN_GEBGP_ONESTEP(0);
|
|
1351
|
+
EIGEN_GEBGP_ONESTEP(1);
|
|
1352
|
+
EIGEN_GEBGP_ONESTEP(2);
|
|
1353
|
+
EIGEN_GEBGP_ONESTEP(3);
|
|
1354
|
+
EIGEN_GEBGP_ONESTEP(4);
|
|
1355
|
+
EIGEN_GEBGP_ONESTEP(5);
|
|
1356
|
+
EIGEN_GEBGP_ONESTEP(6);
|
|
1357
|
+
EIGEN_GEBGP_ONESTEP(7);
|
|
1358
|
+
|
|
1359
|
+
blB += pk*RhsProgress;
|
|
1360
|
+
blA += pk*LhsProgress;
|
|
1361
|
+
|
|
1362
|
+
EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
|
|
1363
|
+
}
|
|
1364
|
+
|
|
1365
|
+
// process remaining peeled loop
|
|
1366
|
+
for(Index k=peeled_kc; k<depth; k++)
|
|
1367
|
+
{
|
|
1368
|
+
RhsPacket B_0;
|
|
1369
|
+
EIGEN_GEBGP_ONESTEP(0);
|
|
1370
|
+
blB += RhsProgress;
|
|
1371
|
+
blA += LhsProgress;
|
|
1372
|
+
}
|
|
1373
|
+
#undef EIGEN_GEBGP_ONESTEP
|
|
1374
|
+
ResPacket R0;
|
|
1375
|
+
ResPacket alphav = pset1<ResPacket>(alpha);
|
|
1376
|
+
R0 = r0.template loadPacket<ResPacket>(0);
|
|
1377
|
+
traits.acc(C0, alphav, R0);
|
|
1378
|
+
r0.storePacket(0, R0);
|
|
1379
|
+
}
|
|
1380
|
+
}
|
|
1381
|
+
}
|
|
1382
|
+
};
|
|
1383
|
+
|
|
1384
|
+
template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
|
|
1385
|
+
struct lhs_process_fraction_of_packet : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
|
|
1386
|
+
{
|
|
1387
|
+
|
|
1388
|
+
EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
|
|
1389
|
+
{
|
|
1390
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
|
|
1391
|
+
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
|
|
1392
|
+
traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
|
|
1393
|
+
traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
|
|
1394
|
+
traits.madd(*A0, *B_0, *C0, *B_0);
|
|
1395
|
+
traits.madd(*A0, *B1, *C1, *B1);
|
|
1396
|
+
traits.madd(*A0, *B2, *C2, *B2);
|
|
1397
|
+
traits.madd(*A0, *B3, *C3, *B3);
|
|
1398
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
|
|
1399
|
+
}
|
|
1400
|
+
};
|
|
1401
|
+
|
|
890
1402
|
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
|
|
891
1403
|
EIGEN_DONT_INLINE
|
|
892
1404
|
void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
|
|
@@ -903,10 +1415,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
903
1415
|
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
|
|
904
1416
|
const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
|
|
905
1417
|
const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
|
|
906
|
-
const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
|
|
1418
|
+
const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
|
|
1419
|
+
const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
|
|
1420
|
+
const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
|
|
907
1421
|
enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
|
|
908
1422
|
const Index peeled_kc = depth & ~(pk-1);
|
|
909
|
-
const
|
|
1423
|
+
const int prefetch_res_offset = 32/sizeof(ResScalar);
|
|
910
1424
|
// const Index depth2 = depth & ~1;
|
|
911
1425
|
|
|
912
1426
|
//---------- Process 3 * LhsProgress rows at once ----------
|
|
@@ -964,36 +1478,48 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
964
1478
|
for(Index k=0; k<peeled_kc; k+=pk)
|
|
965
1479
|
{
|
|
966
1480
|
EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
|
|
967
|
-
|
|
1481
|
+
// 15 registers are taken (12 for acc, 3 for lhs).
|
|
1482
|
+
RhsPanel15 rhs_panel;
|
|
1483
|
+
RhsPacket T0;
|
|
968
1484
|
LhsPacket A2;
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
1485
|
+
#if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
|
|
1486
|
+
// see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
|
|
1487
|
+
// without this workaround A0, A1, and A2 are loaded in the same register,
|
|
1488
|
+
// which is not good for pipelining
|
|
1489
|
+
#define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
|
|
1490
|
+
#else
|
|
1491
|
+
#define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
|
|
1492
|
+
#endif
|
|
1493
|
+
#define EIGEN_GEBP_ONESTEP(K) \
|
|
1494
|
+
do { \
|
|
1495
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
|
|
973
1496
|
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
|
|
974
|
-
internal::prefetch(blA+(3*K+16)*LhsProgress);
|
|
975
|
-
if (EIGEN_ARCH_ARM
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
traits.loadLhs(&blA[(
|
|
979
|
-
traits.
|
|
980
|
-
traits.
|
|
981
|
-
|
|
982
|
-
traits.
|
|
983
|
-
traits.
|
|
984
|
-
traits.madd(
|
|
985
|
-
traits.madd(
|
|
986
|
-
traits.
|
|
987
|
-
traits.
|
|
988
|
-
traits.madd(
|
|
989
|
-
traits.madd(
|
|
990
|
-
traits.
|
|
991
|
-
traits.
|
|
992
|
-
traits.madd(
|
|
993
|
-
traits.madd(
|
|
994
|
-
traits.
|
|
995
|
-
|
|
996
|
-
|
|
1497
|
+
internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
|
|
1498
|
+
if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
|
|
1499
|
+
internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
|
|
1500
|
+
} /* Bug 953 */ \
|
|
1501
|
+
traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
|
|
1502
|
+
traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
|
|
1503
|
+
traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
|
|
1504
|
+
EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
|
|
1505
|
+
traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \
|
|
1506
|
+
traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
|
|
1507
|
+
traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
|
|
1508
|
+
traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
|
|
1509
|
+
traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \
|
|
1510
|
+
traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
|
|
1511
|
+
traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
|
|
1512
|
+
traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
|
|
1513
|
+
traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \
|
|
1514
|
+
traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
|
|
1515
|
+
traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
|
|
1516
|
+
traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
|
|
1517
|
+
traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \
|
|
1518
|
+
traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
|
|
1519
|
+
traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
|
|
1520
|
+
traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
|
|
1521
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
|
|
1522
|
+
} while (false)
|
|
997
1523
|
|
|
998
1524
|
internal::prefetch(blB);
|
|
999
1525
|
EIGEN_GEBP_ONESTEP(0);
|
|
@@ -1013,7 +1539,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1013
1539
|
// process remaining peeled loop
|
|
1014
1540
|
for(Index k=peeled_kc; k<depth; k++)
|
|
1015
1541
|
{
|
|
1016
|
-
|
|
1542
|
+
RhsPanel15 rhs_panel;
|
|
1543
|
+
RhsPacket T0;
|
|
1017
1544
|
LhsPacket A2;
|
|
1018
1545
|
EIGEN_GEBP_ONESTEP(0);
|
|
1019
1546
|
blB += 4*RhsProgress;
|
|
@@ -1025,9 +1552,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1025
1552
|
ResPacket R0, R1, R2;
|
|
1026
1553
|
ResPacket alphav = pset1<ResPacket>(alpha);
|
|
1027
1554
|
|
|
1028
|
-
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
|
1029
|
-
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
|
|
1030
|
-
R2 = r0.loadPacket(2 * Traits::ResPacketSize);
|
|
1555
|
+
R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1556
|
+
R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1557
|
+
R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
|
|
1031
1558
|
traits.acc(C0, alphav, R0);
|
|
1032
1559
|
traits.acc(C4, alphav, R1);
|
|
1033
1560
|
traits.acc(C8, alphav, R2);
|
|
@@ -1035,9 +1562,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1035
1562
|
r0.storePacket(1 * Traits::ResPacketSize, R1);
|
|
1036
1563
|
r0.storePacket(2 * Traits::ResPacketSize, R2);
|
|
1037
1564
|
|
|
1038
|
-
R0 = r1.loadPacket(0 * Traits::ResPacketSize);
|
|
1039
|
-
R1 = r1.loadPacket(1 * Traits::ResPacketSize);
|
|
1040
|
-
R2 = r1.loadPacket(2 * Traits::ResPacketSize);
|
|
1565
|
+
R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1566
|
+
R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1567
|
+
R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
|
|
1041
1568
|
traits.acc(C1, alphav, R0);
|
|
1042
1569
|
traits.acc(C5, alphav, R1);
|
|
1043
1570
|
traits.acc(C9, alphav, R2);
|
|
@@ -1045,9 +1572,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1045
1572
|
r1.storePacket(1 * Traits::ResPacketSize, R1);
|
|
1046
1573
|
r1.storePacket(2 * Traits::ResPacketSize, R2);
|
|
1047
1574
|
|
|
1048
|
-
R0 = r2.loadPacket(0 * Traits::ResPacketSize);
|
|
1049
|
-
R1 = r2.loadPacket(1 * Traits::ResPacketSize);
|
|
1050
|
-
R2 = r2.loadPacket(2 * Traits::ResPacketSize);
|
|
1575
|
+
R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1576
|
+
R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1577
|
+
R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
|
|
1051
1578
|
traits.acc(C2, alphav, R0);
|
|
1052
1579
|
traits.acc(C6, alphav, R1);
|
|
1053
1580
|
traits.acc(C10, alphav, R2);
|
|
@@ -1055,9 +1582,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1055
1582
|
r2.storePacket(1 * Traits::ResPacketSize, R1);
|
|
1056
1583
|
r2.storePacket(2 * Traits::ResPacketSize, R2);
|
|
1057
1584
|
|
|
1058
|
-
R0 = r3.loadPacket(0 * Traits::ResPacketSize);
|
|
1059
|
-
R1 = r3.loadPacket(1 * Traits::ResPacketSize);
|
|
1060
|
-
R2 = r3.loadPacket(2 * Traits::ResPacketSize);
|
|
1585
|
+
R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1586
|
+
R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1587
|
+
R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
|
|
1061
1588
|
traits.acc(C3, alphav, R0);
|
|
1062
1589
|
traits.acc(C7, alphav, R1);
|
|
1063
1590
|
traits.acc(C11, alphav, R2);
|
|
@@ -1093,20 +1620,20 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1093
1620
|
{
|
|
1094
1621
|
EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
|
|
1095
1622
|
RhsPacket B_0;
|
|
1096
|
-
#define EIGEN_GEBGP_ONESTEP(K)
|
|
1097
|
-
do {
|
|
1098
|
-
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1");
|
|
1623
|
+
#define EIGEN_GEBGP_ONESTEP(K) \
|
|
1624
|
+
do { \
|
|
1625
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
|
|
1099
1626
|
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
|
|
1100
|
-
traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);
|
|
1101
|
-
traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);
|
|
1102
|
-
traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);
|
|
1103
|
-
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);
|
|
1104
|
-
traits.madd(A0, B_0, C0, B_0);
|
|
1105
|
-
traits.madd(A1, B_0, C4, B_0);
|
|
1106
|
-
traits.madd(A2, B_0, C8, B_0);
|
|
1107
|
-
EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1");
|
|
1108
|
-
} while(false)
|
|
1109
|
-
|
|
1627
|
+
traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
|
|
1628
|
+
traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
|
|
1629
|
+
traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
|
|
1630
|
+
traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
|
|
1631
|
+
traits.madd(A0, B_0, C0, B_0, fix<0>); \
|
|
1632
|
+
traits.madd(A1, B_0, C4, B_0, fix<0>); \
|
|
1633
|
+
traits.madd(A2, B_0, C8, B_0, fix<0>); \
|
|
1634
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
|
|
1635
|
+
} while (false)
|
|
1636
|
+
|
|
1110
1637
|
EIGEN_GEBGP_ONESTEP(0);
|
|
1111
1638
|
EIGEN_GEBGP_ONESTEP(1);
|
|
1112
1639
|
EIGEN_GEBGP_ONESTEP(2);
|
|
@@ -1116,8 +1643,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1116
1643
|
EIGEN_GEBGP_ONESTEP(6);
|
|
1117
1644
|
EIGEN_GEBGP_ONESTEP(7);
|
|
1118
1645
|
|
|
1119
|
-
blB += pk*RhsProgress;
|
|
1120
|
-
blA += pk*3*Traits::LhsProgress;
|
|
1646
|
+
blB += int(pk) * int(RhsProgress);
|
|
1647
|
+
blA += int(pk) * 3 * int(Traits::LhsProgress);
|
|
1121
1648
|
|
|
1122
1649
|
EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
|
|
1123
1650
|
}
|
|
@@ -1134,9 +1661,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1134
1661
|
ResPacket R0, R1, R2;
|
|
1135
1662
|
ResPacket alphav = pset1<ResPacket>(alpha);
|
|
1136
1663
|
|
|
1137
|
-
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
|
1138
|
-
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
|
|
1139
|
-
R2 = r0.loadPacket(2 * Traits::ResPacketSize);
|
|
1664
|
+
R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1665
|
+
R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1666
|
+
R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
|
|
1140
1667
|
traits.acc(C0, alphav, R0);
|
|
1141
1668
|
traits.acc(C4, alphav, R1);
|
|
1142
1669
|
traits.acc(C8, alphav, R2);
|
|
@@ -1195,7 +1722,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1195
1722
|
for(Index k=0; k<peeled_kc; k+=pk)
|
|
1196
1723
|
{
|
|
1197
1724
|
EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
|
|
1198
|
-
|
|
1725
|
+
RhsPacketx4 rhs_panel;
|
|
1726
|
+
RhsPacket T0;
|
|
1199
1727
|
|
|
1200
1728
|
// NOTE: the begin/end asm comments below work around bug 935!
|
|
1201
1729
|
// but they are not enough for gcc>=6 without FMA (bug 1637)
|
|
@@ -1204,24 +1732,24 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1204
1732
|
#else
|
|
1205
1733
|
#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
|
|
1206
1734
|
#endif
|
|
1207
|
-
|
|
1208
|
-
do {
|
|
1209
|
-
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4");
|
|
1210
|
-
traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0);
|
|
1211
|
-
traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1);
|
|
1212
|
-
traits.
|
|
1213
|
-
traits.madd(A0,
|
|
1214
|
-
traits.madd(A1,
|
|
1215
|
-
traits.madd(A0,
|
|
1216
|
-
traits.madd(A1,
|
|
1217
|
-
traits.madd(A0,
|
|
1218
|
-
traits.madd(A1,
|
|
1219
|
-
traits.madd(A0,
|
|
1220
|
-
traits.madd(A1,
|
|
1221
|
-
EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
|
|
1222
|
-
EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4");
|
|
1223
|
-
} while(false)
|
|
1224
|
-
|
|
1735
|
+
#define EIGEN_GEBGP_ONESTEP(K) \
|
|
1736
|
+
do { \
|
|
1737
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
|
|
1738
|
+
traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
|
|
1739
|
+
traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
|
|
1740
|
+
traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
|
|
1741
|
+
traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
|
|
1742
|
+
traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
|
|
1743
|
+
traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
|
|
1744
|
+
traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
|
|
1745
|
+
traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
|
|
1746
|
+
traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
|
|
1747
|
+
traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
|
|
1748
|
+
traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
|
|
1749
|
+
EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
|
|
1750
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
|
|
1751
|
+
} while (false)
|
|
1752
|
+
|
|
1225
1753
|
internal::prefetch(blB+(48+0));
|
|
1226
1754
|
EIGEN_GEBGP_ONESTEP(0);
|
|
1227
1755
|
EIGEN_GEBGP_ONESTEP(1);
|
|
@@ -1241,7 +1769,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1241
1769
|
// process remaining peeled loop
|
|
1242
1770
|
for(Index k=peeled_kc; k<depth; k++)
|
|
1243
1771
|
{
|
|
1244
|
-
|
|
1772
|
+
RhsPacketx4 rhs_panel;
|
|
1773
|
+
RhsPacket T0;
|
|
1245
1774
|
EIGEN_GEBGP_ONESTEP(0);
|
|
1246
1775
|
blB += 4*RhsProgress;
|
|
1247
1776
|
blA += 2*Traits::LhsProgress;
|
|
@@ -1251,10 +1780,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1251
1780
|
ResPacket R0, R1, R2, R3;
|
|
1252
1781
|
ResPacket alphav = pset1<ResPacket>(alpha);
|
|
1253
1782
|
|
|
1254
|
-
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
|
1255
|
-
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
|
|
1256
|
-
R2 = r1.loadPacket(0 * Traits::ResPacketSize);
|
|
1257
|
-
R3 = r1.loadPacket(1 * Traits::ResPacketSize);
|
|
1783
|
+
R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1784
|
+
R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1785
|
+
R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1786
|
+
R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1258
1787
|
traits.acc(C0, alphav, R0);
|
|
1259
1788
|
traits.acc(C4, alphav, R1);
|
|
1260
1789
|
traits.acc(C1, alphav, R2);
|
|
@@ -1264,10 +1793,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1264
1793
|
r1.storePacket(0 * Traits::ResPacketSize, R2);
|
|
1265
1794
|
r1.storePacket(1 * Traits::ResPacketSize, R3);
|
|
1266
1795
|
|
|
1267
|
-
R0 = r2.loadPacket(0 * Traits::ResPacketSize);
|
|
1268
|
-
R1 = r2.loadPacket(1 * Traits::ResPacketSize);
|
|
1269
|
-
R2 = r3.loadPacket(0 * Traits::ResPacketSize);
|
|
1270
|
-
R3 = r3.loadPacket(1 * Traits::ResPacketSize);
|
|
1796
|
+
R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1797
|
+
R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1798
|
+
R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1799
|
+
R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1271
1800
|
traits.acc(C2, alphav, R0);
|
|
1272
1801
|
traits.acc(C6, alphav, R1);
|
|
1273
1802
|
traits.acc(C3, alphav, R2);
|
|
@@ -1312,8 +1841,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1312
1841
|
traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
|
|
1313
1842
|
traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
|
|
1314
1843
|
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
|
|
1315
|
-
traits.madd(A0, B_0, C0, B1);
|
|
1316
|
-
traits.madd(A1, B_0, C4, B_0);
|
|
1844
|
+
traits.madd(A0, B_0, C0, B1, fix<0>); \
|
|
1845
|
+
traits.madd(A1, B_0, C4, B_0, fix<0>); \
|
|
1317
1846
|
EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
|
|
1318
1847
|
} while(false)
|
|
1319
1848
|
|
|
@@ -1326,8 +1855,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1326
1855
|
EIGEN_GEBGP_ONESTEP(6);
|
|
1327
1856
|
EIGEN_GEBGP_ONESTEP(7);
|
|
1328
1857
|
|
|
1329
|
-
blB += pk*RhsProgress;
|
|
1330
|
-
blA += pk*2*Traits::LhsProgress;
|
|
1858
|
+
blB += int(pk) * int(RhsProgress);
|
|
1859
|
+
blA += int(pk) * 2 * int(Traits::LhsProgress);
|
|
1331
1860
|
|
|
1332
1861
|
EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
|
|
1333
1862
|
}
|
|
@@ -1344,8 +1873,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1344
1873
|
ResPacket R0, R1;
|
|
1345
1874
|
ResPacket alphav = pset1<ResPacket>(alpha);
|
|
1346
1875
|
|
|
1347
|
-
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
|
1348
|
-
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
|
|
1876
|
+
R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1877
|
+
R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1349
1878
|
traits.acc(C0, alphav, R0);
|
|
1350
1879
|
traits.acc(C4, alphav, R1);
|
|
1351
1880
|
r0.storePacket(0 * Traits::ResPacketSize, R0);
|
|
@@ -1357,186 +1886,43 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1357
1886
|
//---------- Process 1 * LhsProgress rows at once ----------
|
|
1358
1887
|
if(mr>=1*Traits::LhsProgress)
|
|
1359
1888
|
{
|
|
1360
|
-
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
traits.initAcc(C0);
|
|
1375
|
-
traits.initAcc(C1);
|
|
1376
|
-
traits.initAcc(C2);
|
|
1377
|
-
traits.initAcc(C3);
|
|
1378
|
-
|
|
1379
|
-
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
|
|
1380
|
-
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
|
|
1381
|
-
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
|
|
1382
|
-
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
|
|
1383
|
-
|
|
1384
|
-
r0.prefetch(prefetch_res_offset);
|
|
1385
|
-
r1.prefetch(prefetch_res_offset);
|
|
1386
|
-
r2.prefetch(prefetch_res_offset);
|
|
1387
|
-
r3.prefetch(prefetch_res_offset);
|
|
1388
|
-
|
|
1389
|
-
// performs "inner" products
|
|
1390
|
-
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
|
|
1391
|
-
prefetch(&blB[0]);
|
|
1392
|
-
LhsPacket A0;
|
|
1393
|
-
|
|
1394
|
-
for(Index k=0; k<peeled_kc; k+=pk)
|
|
1395
|
-
{
|
|
1396
|
-
EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
|
|
1397
|
-
RhsPacket B_0, B1, B2, B3;
|
|
1398
|
-
|
|
1399
|
-
#define EIGEN_GEBGP_ONESTEP(K) \
|
|
1400
|
-
do { \
|
|
1401
|
-
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
|
|
1402
|
-
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
|
|
1403
|
-
traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
|
|
1404
|
-
traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
|
|
1405
|
-
traits.madd(A0, B_0, C0, B_0); \
|
|
1406
|
-
traits.madd(A0, B1, C1, B1); \
|
|
1407
|
-
traits.madd(A0, B2, C2, B2); \
|
|
1408
|
-
traits.madd(A0, B3, C3, B3); \
|
|
1409
|
-
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
|
|
1410
|
-
} while(false)
|
|
1411
|
-
|
|
1412
|
-
internal::prefetch(blB+(48+0));
|
|
1413
|
-
EIGEN_GEBGP_ONESTEP(0);
|
|
1414
|
-
EIGEN_GEBGP_ONESTEP(1);
|
|
1415
|
-
EIGEN_GEBGP_ONESTEP(2);
|
|
1416
|
-
EIGEN_GEBGP_ONESTEP(3);
|
|
1417
|
-
internal::prefetch(blB+(48+16));
|
|
1418
|
-
EIGEN_GEBGP_ONESTEP(4);
|
|
1419
|
-
EIGEN_GEBGP_ONESTEP(5);
|
|
1420
|
-
EIGEN_GEBGP_ONESTEP(6);
|
|
1421
|
-
EIGEN_GEBGP_ONESTEP(7);
|
|
1422
|
-
|
|
1423
|
-
blB += pk*4*RhsProgress;
|
|
1424
|
-
blA += pk*1*LhsProgress;
|
|
1425
|
-
|
|
1426
|
-
EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
|
|
1427
|
-
}
|
|
1428
|
-
// process remaining peeled loop
|
|
1429
|
-
for(Index k=peeled_kc; k<depth; k++)
|
|
1430
|
-
{
|
|
1431
|
-
RhsPacket B_0, B1, B2, B3;
|
|
1432
|
-
EIGEN_GEBGP_ONESTEP(0);
|
|
1433
|
-
blB += 4*RhsProgress;
|
|
1434
|
-
blA += 1*LhsProgress;
|
|
1435
|
-
}
|
|
1436
|
-
#undef EIGEN_GEBGP_ONESTEP
|
|
1437
|
-
|
|
1438
|
-
ResPacket R0, R1;
|
|
1439
|
-
ResPacket alphav = pset1<ResPacket>(alpha);
|
|
1440
|
-
|
|
1441
|
-
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
|
1442
|
-
R1 = r1.loadPacket(0 * Traits::ResPacketSize);
|
|
1443
|
-
traits.acc(C0, alphav, R0);
|
|
1444
|
-
traits.acc(C1, alphav, R1);
|
|
1445
|
-
r0.storePacket(0 * Traits::ResPacketSize, R0);
|
|
1446
|
-
r1.storePacket(0 * Traits::ResPacketSize, R1);
|
|
1447
|
-
|
|
1448
|
-
R0 = r2.loadPacket(0 * Traits::ResPacketSize);
|
|
1449
|
-
R1 = r3.loadPacket(0 * Traits::ResPacketSize);
|
|
1450
|
-
traits.acc(C2, alphav, R0);
|
|
1451
|
-
traits.acc(C3, alphav, R1);
|
|
1452
|
-
r2.storePacket(0 * Traits::ResPacketSize, R0);
|
|
1453
|
-
r3.storePacket(0 * Traits::ResPacketSize, R1);
|
|
1454
|
-
}
|
|
1455
|
-
|
|
1456
|
-
// Deal with remaining columns of the rhs
|
|
1457
|
-
for(Index j2=packet_cols4; j2<cols; j2++)
|
|
1458
|
-
{
|
|
1459
|
-
// One column at a time
|
|
1460
|
-
const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
|
|
1461
|
-
prefetch(&blA[0]);
|
|
1462
|
-
|
|
1463
|
-
// gets res block as register
|
|
1464
|
-
AccPacket C0;
|
|
1465
|
-
traits.initAcc(C0);
|
|
1466
|
-
|
|
1467
|
-
LinearMapper r0 = res.getLinearMapper(i, j2);
|
|
1468
|
-
|
|
1469
|
-
// performs "inner" products
|
|
1470
|
-
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
|
|
1471
|
-
LhsPacket A0;
|
|
1472
|
-
|
|
1473
|
-
for(Index k=0; k<peeled_kc; k+=pk)
|
|
1474
|
-
{
|
|
1475
|
-
EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
|
|
1476
|
-
RhsPacket B_0;
|
|
1477
|
-
|
|
1478
|
-
#define EIGEN_GEBGP_ONESTEP(K) \
|
|
1479
|
-
do { \
|
|
1480
|
-
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
|
|
1481
|
-
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
|
|
1482
|
-
traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
|
|
1483
|
-
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
|
|
1484
|
-
traits.madd(A0, B_0, C0, B_0); \
|
|
1485
|
-
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
|
|
1486
|
-
} while(false);
|
|
1487
|
-
|
|
1488
|
-
EIGEN_GEBGP_ONESTEP(0);
|
|
1489
|
-
EIGEN_GEBGP_ONESTEP(1);
|
|
1490
|
-
EIGEN_GEBGP_ONESTEP(2);
|
|
1491
|
-
EIGEN_GEBGP_ONESTEP(3);
|
|
1492
|
-
EIGEN_GEBGP_ONESTEP(4);
|
|
1493
|
-
EIGEN_GEBGP_ONESTEP(5);
|
|
1494
|
-
EIGEN_GEBGP_ONESTEP(6);
|
|
1495
|
-
EIGEN_GEBGP_ONESTEP(7);
|
|
1496
|
-
|
|
1497
|
-
blB += pk*RhsProgress;
|
|
1498
|
-
blA += pk*1*Traits::LhsProgress;
|
|
1499
|
-
|
|
1500
|
-
EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
|
|
1501
|
-
}
|
|
1502
|
-
|
|
1503
|
-
// process remaining peeled loop
|
|
1504
|
-
for(Index k=peeled_kc; k<depth; k++)
|
|
1505
|
-
{
|
|
1506
|
-
RhsPacket B_0;
|
|
1507
|
-
EIGEN_GEBGP_ONESTEP(0);
|
|
1508
|
-
blB += RhsProgress;
|
|
1509
|
-
blA += 1*Traits::LhsProgress;
|
|
1510
|
-
}
|
|
1511
|
-
#undef EIGEN_GEBGP_ONESTEP
|
|
1512
|
-
ResPacket R0;
|
|
1513
|
-
ResPacket alphav = pset1<ResPacket>(alpha);
|
|
1514
|
-
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
|
1515
|
-
traits.acc(C0, alphav, R0);
|
|
1516
|
-
r0.storePacket(0 * Traits::ResPacketSize, R0);
|
|
1517
|
-
}
|
|
1518
|
-
}
|
|
1889
|
+
lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
|
|
1890
|
+
p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
|
|
1891
|
+
}
|
|
1892
|
+
//---------- Process LhsProgressHalf rows at once ----------
|
|
1893
|
+
if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
|
|
1894
|
+
{
|
|
1895
|
+
lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
|
|
1896
|
+
p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
|
|
1897
|
+
}
|
|
1898
|
+
//---------- Process LhsProgressQuarter rows at once ----------
|
|
1899
|
+
if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
|
|
1900
|
+
{
|
|
1901
|
+
lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
|
|
1902
|
+
p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
|
|
1519
1903
|
}
|
|
1520
1904
|
//---------- Process remaining rows, 1 at once ----------
|
|
1521
|
-
if(
|
|
1905
|
+
if(peeled_mc_quarter<rows)
|
|
1522
1906
|
{
|
|
1523
1907
|
// loop on each panel of the rhs
|
|
1524
1908
|
for(Index j2=0; j2<packet_cols4; j2+=nr)
|
|
1525
1909
|
{
|
|
1526
1910
|
// loop on each row of the lhs (1*LhsProgress x depth)
|
|
1527
|
-
for(Index i=
|
|
1911
|
+
for(Index i=peeled_mc_quarter; i<rows; i+=1)
|
|
1528
1912
|
{
|
|
1529
1913
|
const LhsScalar* blA = &blockA[i*strideA+offsetA];
|
|
1530
1914
|
prefetch(&blA[0]);
|
|
1531
1915
|
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
|
|
1532
1916
|
|
|
1533
|
-
//
|
|
1534
|
-
//
|
|
1535
|
-
//
|
|
1917
|
+
// If LhsProgress is 8 or 16, it assumes that there is a
|
|
1918
|
+
// half or quarter packet, respectively, of the same size as
|
|
1919
|
+
// nr (which is currently 4) for the return type.
|
|
1536
1920
|
const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
|
|
1921
|
+
const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
|
|
1537
1922
|
if ((SwappedTraits::LhsProgress % 4) == 0 &&
|
|
1538
|
-
(SwappedTraits::LhsProgress
|
|
1539
|
-
(SwappedTraits::LhsProgress!=8
|
|
1923
|
+
(SwappedTraits::LhsProgress<=16) &&
|
|
1924
|
+
(SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) &&
|
|
1925
|
+
(SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
|
|
1540
1926
|
{
|
|
1541
1927
|
SAccPacket C0, C1, C2, C3;
|
|
1542
1928
|
straits.initAcc(C0);
|
|
@@ -1559,15 +1945,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1559
1945
|
|
|
1560
1946
|
straits.loadRhsQuad(blA+0*spk, B_0);
|
|
1561
1947
|
straits.loadRhsQuad(blA+1*spk, B_1);
|
|
1562
|
-
straits.madd(A0,B_0,C0,B_0);
|
|
1563
|
-
straits.madd(A1,B_1,C1,B_1);
|
|
1948
|
+
straits.madd(A0,B_0,C0,B_0, fix<0>);
|
|
1949
|
+
straits.madd(A1,B_1,C1,B_1, fix<0>);
|
|
1564
1950
|
|
|
1565
1951
|
straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
|
|
1566
1952
|
straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
|
|
1567
1953
|
straits.loadRhsQuad(blA+2*spk, B_0);
|
|
1568
1954
|
straits.loadRhsQuad(blA+3*spk, B_1);
|
|
1569
|
-
straits.madd(A0,B_0,C2,B_0);
|
|
1570
|
-
straits.madd(A1,B_1,C3,B_1);
|
|
1955
|
+
straits.madd(A0,B_0,C2,B_0, fix<0>);
|
|
1956
|
+
straits.madd(A1,B_1,C3,B_1, fix<0>);
|
|
1571
1957
|
|
|
1572
1958
|
blB += 4*SwappedTraits::LhsProgress;
|
|
1573
1959
|
blA += 4*spk;
|
|
@@ -1580,7 +1966,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1580
1966
|
|
|
1581
1967
|
straits.loadLhsUnaligned(blB, A0);
|
|
1582
1968
|
straits.loadRhsQuad(blA, B_0);
|
|
1583
|
-
straits.madd(A0,B_0,C0,B_0);
|
|
1969
|
+
straits.madd(A0,B_0,C0,B_0, fix<0>);
|
|
1584
1970
|
|
|
1585
1971
|
blB += SwappedTraits::LhsProgress;
|
|
1586
1972
|
blA += spk;
|
|
@@ -1590,7 +1976,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1590
1976
|
// Special case where we have to first reduce the accumulation register C0
|
|
1591
1977
|
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
|
|
1592
1978
|
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
|
|
1593
|
-
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<
|
|
1979
|
+
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
|
|
1594
1980
|
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
|
|
1595
1981
|
|
|
1596
1982
|
SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
|
|
@@ -1603,16 +1989,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1603
1989
|
SRhsPacketHalf b0;
|
|
1604
1990
|
straits.loadLhsUnaligned(blB, a0);
|
|
1605
1991
|
straits.loadRhs(blA, b0);
|
|
1606
|
-
SAccPacketHalf c0 =
|
|
1607
|
-
straits.madd(a0,b0,c0,b0);
|
|
1992
|
+
SAccPacketHalf c0 = predux_half_dowto4(C0);
|
|
1993
|
+
straits.madd(a0,b0,c0,b0, fix<0>);
|
|
1608
1994
|
straits.acc(c0, alphav, R);
|
|
1609
1995
|
}
|
|
1610
1996
|
else
|
|
1611
1997
|
{
|
|
1612
|
-
straits.acc(
|
|
1998
|
+
straits.acc(predux_half_dowto4(C0), alphav, R);
|
|
1613
1999
|
}
|
|
1614
2000
|
res.scatterPacket(i, j2, R);
|
|
1615
2001
|
}
|
|
2002
|
+
else if (SwappedTraits::LhsProgress==16)
|
|
2003
|
+
{
|
|
2004
|
+
// Special case where we have to first reduce the
|
|
2005
|
+
// accumulation register C0. We specialize the block in
|
|
2006
|
+
// template form, so that LhsProgress < 16 paths don't
|
|
2007
|
+
// fail to compile
|
|
2008
|
+
last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
|
|
2009
|
+
p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0);
|
|
2010
|
+
}
|
|
1616
2011
|
else
|
|
1617
2012
|
{
|
|
1618
2013
|
SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
|
|
@@ -1635,14 +2030,14 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1635
2030
|
|
|
1636
2031
|
B_0 = blB[0];
|
|
1637
2032
|
B_1 = blB[1];
|
|
1638
|
-
|
|
1639
|
-
|
|
1640
|
-
|
|
2033
|
+
C0 = cj.pmadd(A0,B_0,C0);
|
|
2034
|
+
C1 = cj.pmadd(A0,B_1,C1);
|
|
2035
|
+
|
|
1641
2036
|
B_0 = blB[2];
|
|
1642
2037
|
B_1 = blB[3];
|
|
1643
|
-
|
|
1644
|
-
|
|
1645
|
-
|
|
2038
|
+
C2 = cj.pmadd(A0,B_0,C2);
|
|
2039
|
+
C3 = cj.pmadd(A0,B_1,C3);
|
|
2040
|
+
|
|
1646
2041
|
blB += 4;
|
|
1647
2042
|
}
|
|
1648
2043
|
res(i, j2 + 0) += alpha * C0;
|
|
@@ -1656,7 +2051,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1656
2051
|
for(Index j2=packet_cols4; j2<cols; j2++)
|
|
1657
2052
|
{
|
|
1658
2053
|
// loop on each row of the lhs (1*LhsProgress x depth)
|
|
1659
|
-
for(Index i=
|
|
2054
|
+
for(Index i=peeled_mc_quarter; i<rows; i+=1)
|
|
1660
2055
|
{
|
|
1661
2056
|
const LhsScalar* blA = &blockA[i*strideA+offsetA];
|
|
1662
2057
|
prefetch(&blA[0]);
|
|
@@ -1667,7 +2062,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1667
2062
|
{
|
|
1668
2063
|
LhsScalar A0 = blA[k];
|
|
1669
2064
|
RhsScalar B_0 = blB[k];
|
|
1670
|
-
|
|
2065
|
+
C0 = cj.pmadd(A0, B_0, C0);
|
|
1671
2066
|
}
|
|
1672
2067
|
res(i, j2) += alpha * C0;
|
|
1673
2068
|
}
|
|
@@ -1676,8 +2071,6 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1676
2071
|
}
|
|
1677
2072
|
|
|
1678
2073
|
|
|
1679
|
-
#undef CJMADD
|
|
1680
|
-
|
|
1681
2074
|
// pack a block of the lhs
|
|
1682
2075
|
// The traversal is as follow (mr==4):
|
|
1683
2076
|
// 0 4 8 12 ...
|
|
@@ -1692,19 +2085,24 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1692
2085
|
//
|
|
1693
2086
|
// 32 33 34 35 ...
|
|
1694
2087
|
// 36 36 38 39 ...
|
|
1695
|
-
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
|
|
1696
|
-
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
|
|
2088
|
+
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
|
|
2089
|
+
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
|
|
1697
2090
|
{
|
|
1698
2091
|
typedef typename DataMapper::LinearMapper LinearMapper;
|
|
1699
2092
|
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
|
|
1700
2093
|
};
|
|
1701
2094
|
|
|
1702
|
-
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
|
|
1703
|
-
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
|
|
2095
|
+
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
|
|
2096
|
+
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
|
|
1704
2097
|
::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
|
|
1705
2098
|
{
|
|
1706
|
-
typedef typename
|
|
1707
|
-
|
|
2099
|
+
typedef typename unpacket_traits<Packet>::half HalfPacket;
|
|
2100
|
+
typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
|
|
2101
|
+
enum { PacketSize = unpacket_traits<Packet>::size,
|
|
2102
|
+
HalfPacketSize = unpacket_traits<HalfPacket>::size,
|
|
2103
|
+
QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
|
|
2104
|
+
HasHalf = (int)HalfPacketSize < (int)PacketSize,
|
|
2105
|
+
HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
|
|
1708
2106
|
|
|
1709
2107
|
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
|
|
1710
2108
|
EIGEN_UNUSED_VARIABLE(stride);
|
|
@@ -1716,9 +2114,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
|
|
|
1716
2114
|
|
|
1717
2115
|
const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
|
|
1718
2116
|
const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
|
|
1719
|
-
const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
|
|
1720
|
-
const Index
|
|
1721
|
-
|
|
2117
|
+
const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
|
|
2118
|
+
const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
|
|
2119
|
+
const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
|
|
2120
|
+
const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
|
|
2121
|
+
const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
|
|
2122
|
+
: Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;
|
|
1722
2123
|
|
|
1723
2124
|
Index i=0;
|
|
1724
2125
|
|
|
@@ -1732,9 +2133,9 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
|
|
|
1732
2133
|
for(Index k=0; k<depth; k++)
|
|
1733
2134
|
{
|
|
1734
2135
|
Packet A, B, C;
|
|
1735
|
-
A = lhs.loadPacket(i+0*PacketSize, k);
|
|
1736
|
-
B = lhs.loadPacket(i+1*PacketSize, k);
|
|
1737
|
-
C = lhs.loadPacket(i+2*PacketSize, k);
|
|
2136
|
+
A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
|
|
2137
|
+
B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
|
|
2138
|
+
C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
|
|
1738
2139
|
pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
|
|
1739
2140
|
pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
|
|
1740
2141
|
pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
|
|
@@ -1752,8 +2153,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
|
|
|
1752
2153
|
for(Index k=0; k<depth; k++)
|
|
1753
2154
|
{
|
|
1754
2155
|
Packet A, B;
|
|
1755
|
-
A = lhs.loadPacket(i+0*PacketSize, k);
|
|
1756
|
-
B = lhs.loadPacket(i+1*PacketSize, k);
|
|
2156
|
+
A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
|
|
2157
|
+
B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
|
|
1757
2158
|
pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
|
|
1758
2159
|
pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
|
|
1759
2160
|
}
|
|
@@ -1770,27 +2171,67 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
|
|
|
1770
2171
|
for(Index k=0; k<depth; k++)
|
|
1771
2172
|
{
|
|
1772
2173
|
Packet A;
|
|
1773
|
-
A = lhs.loadPacket(i+0*PacketSize, k);
|
|
2174
|
+
A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
|
|
1774
2175
|
pstore(blockA+count, cj.pconj(A));
|
|
1775
2176
|
count+=PacketSize;
|
|
1776
2177
|
}
|
|
1777
2178
|
if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
|
|
1778
2179
|
}
|
|
1779
2180
|
}
|
|
1780
|
-
// Pack
|
|
2181
|
+
// Pack half packets
|
|
2182
|
+
if(HasHalf && Pack1>=HalfPacketSize)
|
|
2183
|
+
{
|
|
2184
|
+
for(; i<peeled_mc_half; i+=HalfPacketSize)
|
|
2185
|
+
{
|
|
2186
|
+
if(PanelMode) count += (HalfPacketSize) * offset;
|
|
2187
|
+
|
|
2188
|
+
for(Index k=0; k<depth; k++)
|
|
2189
|
+
{
|
|
2190
|
+
HalfPacket A;
|
|
2191
|
+
A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
|
|
2192
|
+
pstoreu(blockA+count, cj.pconj(A));
|
|
2193
|
+
count+=HalfPacketSize;
|
|
2194
|
+
}
|
|
2195
|
+
if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
|
|
2196
|
+
}
|
|
2197
|
+
}
|
|
2198
|
+
// Pack quarter packets
|
|
2199
|
+
if(HasQuarter && Pack1>=QuarterPacketSize)
|
|
2200
|
+
{
|
|
2201
|
+
for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
|
|
2202
|
+
{
|
|
2203
|
+
if(PanelMode) count += (QuarterPacketSize) * offset;
|
|
2204
|
+
|
|
2205
|
+
for(Index k=0; k<depth; k++)
|
|
2206
|
+
{
|
|
2207
|
+
QuarterPacket A;
|
|
2208
|
+
A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
|
|
2209
|
+
pstoreu(blockA+count, cj.pconj(A));
|
|
2210
|
+
count+=QuarterPacketSize;
|
|
2211
|
+
}
|
|
2212
|
+
if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
|
|
2213
|
+
}
|
|
2214
|
+
}
|
|
2215
|
+
// Pack2 may be *smaller* than PacketSize—that happens for
|
|
2216
|
+
// products like real * complex, where we have to go half the
|
|
2217
|
+
// progress on the lhs in order to duplicate those operands to
|
|
2218
|
+
// address both real & imaginary parts on the rhs. This portion will
|
|
2219
|
+
// pack those half ones until they match the number expected on the
|
|
2220
|
+
// last peeling loop at this point (for the rhs).
|
|
1781
2221
|
if(Pack2<PacketSize && Pack2>1)
|
|
1782
2222
|
{
|
|
1783
|
-
for(; i<peeled_mc0; i+=
|
|
2223
|
+
for(; i<peeled_mc0; i+=last_lhs_progress)
|
|
1784
2224
|
{
|
|
1785
|
-
if(PanelMode) count +=
|
|
2225
|
+
if(PanelMode) count += last_lhs_progress * offset;
|
|
1786
2226
|
|
|
1787
2227
|
for(Index k=0; k<depth; k++)
|
|
1788
|
-
for(Index w=0; w<
|
|
2228
|
+
for(Index w=0; w<last_lhs_progress; w++)
|
|
1789
2229
|
blockA[count++] = cj(lhs(i+w, k));
|
|
1790
2230
|
|
|
1791
|
-
if(PanelMode) count +=
|
|
2231
|
+
if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
|
|
1792
2232
|
}
|
|
1793
2233
|
}
|
|
2234
|
+
// Pack scalars
|
|
1794
2235
|
for(; i<rows; i++)
|
|
1795
2236
|
{
|
|
1796
2237
|
if(PanelMode) count += offset;
|
|
@@ -1800,19 +2241,24 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
|
|
|
1800
2241
|
}
|
|
1801
2242
|
}
|
|
1802
2243
|
|
|
1803
|
-
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
|
|
1804
|
-
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
|
|
2244
|
+
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
|
|
2245
|
+
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
|
|
1805
2246
|
{
|
|
1806
2247
|
typedef typename DataMapper::LinearMapper LinearMapper;
|
|
1807
2248
|
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
|
|
1808
2249
|
};
|
|
1809
2250
|
|
|
1810
|
-
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
|
|
1811
|
-
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
|
|
2251
|
+
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
|
|
2252
|
+
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
|
|
1812
2253
|
::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
|
|
1813
2254
|
{
|
|
1814
|
-
typedef typename
|
|
1815
|
-
|
|
2255
|
+
typedef typename unpacket_traits<Packet>::half HalfPacket;
|
|
2256
|
+
typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
|
|
2257
|
+
enum { PacketSize = unpacket_traits<Packet>::size,
|
|
2258
|
+
HalfPacketSize = unpacket_traits<HalfPacket>::size,
|
|
2259
|
+
QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
|
|
2260
|
+
HasHalf = (int)HalfPacketSize < (int)PacketSize,
|
|
2261
|
+
HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
|
|
1816
2262
|
|
|
1817
2263
|
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
|
|
1818
2264
|
EIGEN_UNUSED_VARIABLE(stride);
|
|
@@ -1820,37 +2266,51 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Ro
|
|
|
1820
2266
|
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
|
|
1821
2267
|
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
|
|
1822
2268
|
Index count = 0;
|
|
2269
|
+
bool gone_half = false, gone_quarter = false, gone_last = false;
|
|
1823
2270
|
|
|
1824
|
-
// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
|
|
1825
|
-
// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
|
|
1826
|
-
// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
|
|
1827
|
-
|
|
1828
|
-
int pack = Pack1;
|
|
1829
2271
|
Index i = 0;
|
|
2272
|
+
int pack = Pack1;
|
|
2273
|
+
int psize = PacketSize;
|
|
1830
2274
|
while(pack>0)
|
|
1831
2275
|
{
|
|
1832
2276
|
Index remaining_rows = rows-i;
|
|
1833
|
-
Index peeled_mc = i+(remaining_rows/pack)*pack;
|
|
2277
|
+
Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
|
|
2278
|
+
Index starting_pos = i;
|
|
1834
2279
|
for(; i<peeled_mc; i+=pack)
|
|
1835
2280
|
{
|
|
1836
2281
|
if(PanelMode) count += pack * offset;
|
|
1837
2282
|
|
|
1838
|
-
const Index peeled_k = (depth/PacketSize)*PacketSize;
|
|
1839
2283
|
Index k=0;
|
|
1840
|
-
if(pack>=
|
|
2284
|
+
if(pack>=psize && psize >= QuarterPacketSize)
|
|
1841
2285
|
{
|
|
1842
|
-
|
|
2286
|
+
const Index peeled_k = (depth/psize)*psize;
|
|
2287
|
+
for(; k<peeled_k; k+=psize)
|
|
1843
2288
|
{
|
|
1844
|
-
for (Index m = 0; m < pack; m +=
|
|
2289
|
+
for (Index m = 0; m < pack; m += psize)
|
|
1845
2290
|
{
|
|
1846
|
-
|
|
1847
|
-
|
|
1848
|
-
|
|
1849
|
-
|
|
2291
|
+
if (psize == PacketSize) {
|
|
2292
|
+
PacketBlock<Packet> kernel;
|
|
2293
|
+
for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
|
|
2294
|
+
ptranspose(kernel);
|
|
2295
|
+
for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
|
|
2296
|
+
} else if (HasHalf && psize == HalfPacketSize) {
|
|
2297
|
+
gone_half = true;
|
|
2298
|
+
PacketBlock<HalfPacket> kernel_half;
|
|
2299
|
+
for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
|
|
2300
|
+
ptranspose(kernel_half);
|
|
2301
|
+
for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
|
|
2302
|
+
} else if (HasQuarter && psize == QuarterPacketSize) {
|
|
2303
|
+
gone_quarter = true;
|
|
2304
|
+
PacketBlock<QuarterPacket> kernel_quarter;
|
|
2305
|
+
for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
|
|
2306
|
+
ptranspose(kernel_quarter);
|
|
2307
|
+
for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
|
|
2308
|
+
}
|
|
1850
2309
|
}
|
|
1851
|
-
count +=
|
|
2310
|
+
count += psize*pack;
|
|
1852
2311
|
}
|
|
1853
2312
|
}
|
|
2313
|
+
|
|
1854
2314
|
for(; k<depth; k++)
|
|
1855
2315
|
{
|
|
1856
2316
|
Index w=0;
|
|
@@ -1873,9 +2333,28 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Ro
|
|
|
1873
2333
|
if(PanelMode) count += pack * (stride-offset-depth);
|
|
1874
2334
|
}
|
|
1875
2335
|
|
|
1876
|
-
pack -=
|
|
1877
|
-
|
|
1878
|
-
|
|
2336
|
+
pack -= psize;
|
|
2337
|
+
Index left = rows - i;
|
|
2338
|
+
if (pack <= 0) {
|
|
2339
|
+
if (!gone_last &&
|
|
2340
|
+
(starting_pos == i || left >= psize/2 || left >= psize/4) &&
|
|
2341
|
+
((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
|
|
2342
|
+
(psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
|
|
2343
|
+
psize /= 2;
|
|
2344
|
+
pack = psize;
|
|
2345
|
+
continue;
|
|
2346
|
+
}
|
|
2347
|
+
// Pack2 may be *smaller* than PacketSize—that happens for
|
|
2348
|
+
// products like real * complex, where we have to go half the
|
|
2349
|
+
// progress on the lhs in order to duplicate those operands to
|
|
2350
|
+
// address both real & imaginary parts on the rhs. This portion will
|
|
2351
|
+
// pack those half ones until they match the number expected on the
|
|
2352
|
+
// last peeling loop at this point (for the rhs).
|
|
2353
|
+
if (Pack2 < PacketSize && !gone_last) {
|
|
2354
|
+
gone_last = true;
|
|
2355
|
+
psize = pack = left & ~1;
|
|
2356
|
+
}
|
|
2357
|
+
}
|
|
1879
2358
|
}
|
|
1880
2359
|
|
|
1881
2360
|
for(; i<rows; i++)
|
|
@@ -1931,7 +2410,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
|
|
|
1931
2410
|
// const Scalar* b6 = &rhs[(j2+6)*rhsStride];
|
|
1932
2411
|
// const Scalar* b7 = &rhs[(j2+7)*rhsStride];
|
|
1933
2412
|
// Index k=0;
|
|
1934
|
-
// if(PacketSize==8) // TODO
|
|
2413
|
+
// if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
|
|
1935
2414
|
// {
|
|
1936
2415
|
// for(; k<peeled_k; k+=PacketSize) {
|
|
1937
2416
|
// PacketBlock<Packet> kernel;
|
|
@@ -1978,10 +2457,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
|
|
|
1978
2457
|
{
|
|
1979
2458
|
for(; k<peeled_k; k+=PacketSize) {
|
|
1980
2459
|
PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
|
|
1981
|
-
kernel.packet[0] = dm0.loadPacket(k);
|
|
1982
|
-
kernel.packet[1%PacketSize] = dm1.loadPacket(k);
|
|
1983
|
-
kernel.packet[2%PacketSize] = dm2.loadPacket(k);
|
|
1984
|
-
kernel.packet[3%PacketSize] = dm3.loadPacket(k);
|
|
2460
|
+
kernel.packet[0 ] = dm0.template loadPacket<Packet>(k);
|
|
2461
|
+
kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
|
|
2462
|
+
kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
|
|
2463
|
+
kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
|
|
1985
2464
|
ptranspose(kernel);
|
|
1986
2465
|
pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
|
|
1987
2466
|
pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
|
|
@@ -2022,94 +2501,104 @@ template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conj
|
|
|
2022
2501
|
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
|
|
2023
2502
|
{
|
|
2024
2503
|
typedef typename packet_traits<Scalar>::type Packet;
|
|
2504
|
+
typedef typename unpacket_traits<Packet>::half HalfPacket;
|
|
2505
|
+
typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
|
|
2025
2506
|
typedef typename DataMapper::LinearMapper LinearMapper;
|
|
2026
|
-
enum { PacketSize = packet_traits<Scalar>::size
|
|
2027
|
-
|
|
2028
|
-
};
|
|
2029
|
-
|
|
2030
|
-
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
|
|
2031
|
-
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
|
|
2032
|
-
::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
|
|
2033
|
-
{
|
|
2034
|
-
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
|
|
2035
|
-
EIGEN_UNUSED_VARIABLE(stride);
|
|
2036
|
-
EIGEN_UNUSED_VARIABLE(offset);
|
|
2037
|
-
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
|
|
2038
|
-
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
|
|
2039
|
-
Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
|
|
2040
|
-
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
|
|
2041
|
-
Index count = 0;
|
|
2042
|
-
|
|
2043
|
-
// if(nr>=8)
|
|
2044
|
-
// {
|
|
2045
|
-
// for(Index j2=0; j2<packet_cols8; j2+=8)
|
|
2046
|
-
// {
|
|
2047
|
-
// // skip what we have before
|
|
2048
|
-
// if(PanelMode) count += 8 * offset;
|
|
2049
|
-
// for(Index k=0; k<depth; k++)
|
|
2050
|
-
// {
|
|
2051
|
-
// if (PacketSize==8) {
|
|
2052
|
-
// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
|
|
2053
|
-
// pstoreu(blockB+count, cj.pconj(A));
|
|
2054
|
-
// } else if (PacketSize==4) {
|
|
2055
|
-
// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
|
|
2056
|
-
// Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
|
|
2057
|
-
// pstoreu(blockB+count, cj.pconj(A));
|
|
2058
|
-
// pstoreu(blockB+count+PacketSize, cj.pconj(B));
|
|
2059
|
-
// } else {
|
|
2060
|
-
// const Scalar* b0 = &rhs[k*rhsStride + j2];
|
|
2061
|
-
// blockB[count+0] = cj(b0[0]);
|
|
2062
|
-
// blockB[count+1] = cj(b0[1]);
|
|
2063
|
-
// blockB[count+2] = cj(b0[2]);
|
|
2064
|
-
// blockB[count+3] = cj(b0[3]);
|
|
2065
|
-
// blockB[count+4] = cj(b0[4]);
|
|
2066
|
-
// blockB[count+5] = cj(b0[5]);
|
|
2067
|
-
// blockB[count+6] = cj(b0[6]);
|
|
2068
|
-
// blockB[count+7] = cj(b0[7]);
|
|
2069
|
-
// }
|
|
2070
|
-
// count += 8;
|
|
2071
|
-
// }
|
|
2072
|
-
// // skip what we have after
|
|
2073
|
-
// if(PanelMode) count += 8 * (stride-offset-depth);
|
|
2074
|
-
// }
|
|
2075
|
-
// }
|
|
2076
|
-
if(nr>=4)
|
|
2507
|
+
enum { PacketSize = packet_traits<Scalar>::size,
|
|
2508
|
+
HalfPacketSize = unpacket_traits<HalfPacket>::size,
|
|
2509
|
+
QuarterPacketSize = unpacket_traits<QuarterPacket>::size};
|
|
2510
|
+
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0)
|
|
2077
2511
|
{
|
|
2078
|
-
|
|
2512
|
+
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
|
|
2513
|
+
EIGEN_UNUSED_VARIABLE(stride);
|
|
2514
|
+
EIGEN_UNUSED_VARIABLE(offset);
|
|
2515
|
+
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
|
|
2516
|
+
const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
|
|
2517
|
+
const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
|
|
2518
|
+
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
|
|
2519
|
+
Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
|
|
2520
|
+
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
|
|
2521
|
+
Index count = 0;
|
|
2522
|
+
|
|
2523
|
+
// if(nr>=8)
|
|
2524
|
+
// {
|
|
2525
|
+
// for(Index j2=0; j2<packet_cols8; j2+=8)
|
|
2526
|
+
// {
|
|
2527
|
+
// // skip what we have before
|
|
2528
|
+
// if(PanelMode) count += 8 * offset;
|
|
2529
|
+
// for(Index k=0; k<depth; k++)
|
|
2530
|
+
// {
|
|
2531
|
+
// if (PacketSize==8) {
|
|
2532
|
+
// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
|
|
2533
|
+
// pstoreu(blockB+count, cj.pconj(A));
|
|
2534
|
+
// } else if (PacketSize==4) {
|
|
2535
|
+
// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
|
|
2536
|
+
// Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
|
|
2537
|
+
// pstoreu(blockB+count, cj.pconj(A));
|
|
2538
|
+
// pstoreu(blockB+count+PacketSize, cj.pconj(B));
|
|
2539
|
+
// } else {
|
|
2540
|
+
// const Scalar* b0 = &rhs[k*rhsStride + j2];
|
|
2541
|
+
// blockB[count+0] = cj(b0[0]);
|
|
2542
|
+
// blockB[count+1] = cj(b0[1]);
|
|
2543
|
+
// blockB[count+2] = cj(b0[2]);
|
|
2544
|
+
// blockB[count+3] = cj(b0[3]);
|
|
2545
|
+
// blockB[count+4] = cj(b0[4]);
|
|
2546
|
+
// blockB[count+5] = cj(b0[5]);
|
|
2547
|
+
// blockB[count+6] = cj(b0[6]);
|
|
2548
|
+
// blockB[count+7] = cj(b0[7]);
|
|
2549
|
+
// }
|
|
2550
|
+
// count += 8;
|
|
2551
|
+
// }
|
|
2552
|
+
// // skip what we have after
|
|
2553
|
+
// if(PanelMode) count += 8 * (stride-offset-depth);
|
|
2554
|
+
// }
|
|
2555
|
+
// }
|
|
2556
|
+
if(nr>=4)
|
|
2079
2557
|
{
|
|
2080
|
-
|
|
2081
|
-
if(PanelMode) count += 4 * offset;
|
|
2082
|
-
for(Index k=0; k<depth; k++)
|
|
2558
|
+
for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
|
|
2083
2559
|
{
|
|
2084
|
-
|
|
2085
|
-
|
|
2086
|
-
|
|
2087
|
-
|
|
2088
|
-
|
|
2089
|
-
|
|
2090
|
-
|
|
2091
|
-
|
|
2092
|
-
|
|
2093
|
-
|
|
2094
|
-
|
|
2560
|
+
// skip what we have before
|
|
2561
|
+
if(PanelMode) count += 4 * offset;
|
|
2562
|
+
for(Index k=0; k<depth; k++)
|
|
2563
|
+
{
|
|
2564
|
+
if (PacketSize==4) {
|
|
2565
|
+
Packet A = rhs.template loadPacket<Packet>(k, j2);
|
|
2566
|
+
pstoreu(blockB+count, cj.pconj(A));
|
|
2567
|
+
count += PacketSize;
|
|
2568
|
+
} else if (HasHalf && HalfPacketSize==4) {
|
|
2569
|
+
HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
|
|
2570
|
+
pstoreu(blockB+count, cj.pconj(A));
|
|
2571
|
+
count += HalfPacketSize;
|
|
2572
|
+
} else if (HasQuarter && QuarterPacketSize==4) {
|
|
2573
|
+
QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
|
|
2574
|
+
pstoreu(blockB+count, cj.pconj(A));
|
|
2575
|
+
count += QuarterPacketSize;
|
|
2576
|
+
} else {
|
|
2577
|
+
const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
|
|
2578
|
+
blockB[count+0] = cj(dm0(0));
|
|
2579
|
+
blockB[count+1] = cj(dm0(1));
|
|
2580
|
+
blockB[count+2] = cj(dm0(2));
|
|
2581
|
+
blockB[count+3] = cj(dm0(3));
|
|
2582
|
+
count += 4;
|
|
2583
|
+
}
|
|
2095
2584
|
}
|
|
2585
|
+
// skip what we have after
|
|
2586
|
+
if(PanelMode) count += 4 * (stride-offset-depth);
|
|
2096
2587
|
}
|
|
2097
|
-
// skip what we have after
|
|
2098
|
-
if(PanelMode) count += 4 * (stride-offset-depth);
|
|
2099
2588
|
}
|
|
2100
|
-
|
|
2101
|
-
|
|
2102
|
-
for(Index j2=packet_cols4; j2<cols; ++j2)
|
|
2103
|
-
{
|
|
2104
|
-
if(PanelMode) count += offset;
|
|
2105
|
-
for(Index k=0; k<depth; k++)
|
|
2589
|
+
// copy the remaining columns one at a time (nr==1)
|
|
2590
|
+
for(Index j2=packet_cols4; j2<cols; ++j2)
|
|
2106
2591
|
{
|
|
2107
|
-
|
|
2108
|
-
|
|
2592
|
+
if(PanelMode) count += offset;
|
|
2593
|
+
for(Index k=0; k<depth; k++)
|
|
2594
|
+
{
|
|
2595
|
+
blockB[count] = cj(rhs(k, j2));
|
|
2596
|
+
count += 1;
|
|
2597
|
+
}
|
|
2598
|
+
if(PanelMode) count += stride-offset-depth;
|
|
2109
2599
|
}
|
|
2110
|
-
if(PanelMode) count += stride-offset-depth;
|
|
2111
2600
|
}
|
|
2112
|
-
}
|
|
2601
|
+
};
|
|
2113
2602
|
|
|
2114
2603
|
} // end namespace internal
|
|
2115
2604
|
|