@smake/eigen 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/eigen/COPYING.APACHE +203 -0
- package/eigen/COPYING.BSD +26 -0
- package/eigen/COPYING.GPL +674 -0
- package/eigen/COPYING.LGPL +502 -0
- package/eigen/COPYING.MINPACK +51 -0
- package/eigen/COPYING.MPL2 +373 -0
- package/eigen/COPYING.README +18 -0
- package/eigen/Eigen/Cholesky +0 -1
- package/eigen/Eigen/Core +108 -266
- package/eigen/Eigen/Eigenvalues +0 -1
- package/eigen/Eigen/Geometry +3 -6
- package/eigen/Eigen/Householder +0 -1
- package/eigen/Eigen/Jacobi +0 -1
- package/eigen/Eigen/KLUSupport +41 -0
- package/eigen/Eigen/LU +2 -5
- package/eigen/Eigen/OrderingMethods +0 -3
- package/eigen/Eigen/PaStiXSupport +1 -0
- package/eigen/Eigen/PardisoSupport +0 -0
- package/eigen/Eigen/QR +0 -1
- package/eigen/Eigen/QtAlignedMalloc +0 -1
- package/eigen/Eigen/SVD +0 -1
- package/eigen/Eigen/Sparse +0 -2
- package/eigen/Eigen/SparseCholesky +0 -8
- package/eigen/Eigen/SparseLU +4 -0
- package/eigen/Eigen/src/Cholesky/LDLT.h +42 -27
- package/eigen/Eigen/src/Cholesky/LLT.h +39 -23
- package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +90 -47
- package/eigen/Eigen/src/Core/ArithmeticSequence.h +413 -0
- package/eigen/Eigen/src/Core/Array.h +99 -11
- package/eigen/Eigen/src/Core/ArrayBase.h +1 -1
- package/eigen/Eigen/src/Core/ArrayWrapper.h +21 -21
- package/eigen/Eigen/src/Core/Assign.h +1 -1
- package/eigen/Eigen/src/Core/AssignEvaluator.h +125 -50
- package/eigen/Eigen/src/Core/Assign_MKL.h +10 -10
- package/eigen/Eigen/src/Core/BandMatrix.h +16 -16
- package/eigen/Eigen/src/Core/Block.h +56 -60
- package/eigen/Eigen/src/Core/BooleanRedux.h +29 -31
- package/eigen/Eigen/src/Core/CommaInitializer.h +7 -3
- package/eigen/Eigen/src/Core/CoreEvaluators.h +325 -272
- package/eigen/Eigen/src/Core/CoreIterators.h +5 -0
- package/eigen/Eigen/src/Core/CwiseBinaryOp.h +21 -22
- package/eigen/Eigen/src/Core/CwiseNullaryOp.h +153 -18
- package/eigen/Eigen/src/Core/CwiseUnaryOp.h +6 -6
- package/eigen/Eigen/src/Core/CwiseUnaryView.h +12 -10
- package/eigen/Eigen/src/Core/DenseBase.h +128 -39
- package/eigen/Eigen/src/Core/DenseCoeffsBase.h +25 -21
- package/eigen/Eigen/src/Core/DenseStorage.h +150 -68
- package/eigen/Eigen/src/Core/Diagonal.h +21 -23
- package/eigen/Eigen/src/Core/DiagonalMatrix.h +50 -2
- package/eigen/Eigen/src/Core/DiagonalProduct.h +1 -1
- package/eigen/Eigen/src/Core/Dot.h +10 -10
- package/eigen/Eigen/src/Core/EigenBase.h +10 -9
- package/eigen/Eigen/src/Core/ForceAlignedAccess.h +8 -4
- package/eigen/Eigen/src/Core/Fuzzy.h +3 -3
- package/eigen/Eigen/src/Core/GeneralProduct.h +20 -10
- package/eigen/Eigen/src/Core/GenericPacketMath.h +597 -147
- package/eigen/Eigen/src/Core/GlobalFunctions.h +40 -33
- package/eigen/Eigen/src/Core/IO.h +40 -7
- package/eigen/Eigen/src/Core/IndexedView.h +237 -0
- package/eigen/Eigen/src/Core/Inverse.h +9 -10
- package/eigen/Eigen/src/Core/Map.h +7 -7
- package/eigen/Eigen/src/Core/MapBase.h +5 -3
- package/eigen/Eigen/src/Core/MathFunctions.h +756 -120
- package/eigen/Eigen/src/Core/MathFunctionsImpl.h +118 -19
- package/eigen/Eigen/src/Core/Matrix.h +131 -25
- package/eigen/Eigen/src/Core/MatrixBase.h +19 -2
- package/eigen/Eigen/src/Core/NestByValue.h +25 -50
- package/eigen/Eigen/src/Core/NoAlias.h +4 -3
- package/eigen/Eigen/src/Core/NumTraits.h +107 -20
- package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +232 -0
- package/eigen/Eigen/src/Core/PermutationMatrix.h +3 -3
- package/eigen/Eigen/src/Core/PlainObjectBase.h +145 -54
- package/eigen/Eigen/src/Core/Product.h +30 -25
- package/eigen/Eigen/src/Core/ProductEvaluators.h +183 -142
- package/eigen/Eigen/src/Core/Random.h +37 -1
- package/eigen/Eigen/src/Core/Redux.h +180 -170
- package/eigen/Eigen/src/Core/Ref.h +118 -21
- package/eigen/Eigen/src/Core/Replicate.h +8 -8
- package/eigen/Eigen/src/Core/Reshaped.h +454 -0
- package/eigen/Eigen/src/Core/ReturnByValue.h +7 -5
- package/eigen/Eigen/src/Core/Reverse.h +18 -12
- package/eigen/Eigen/src/Core/Select.h +8 -6
- package/eigen/Eigen/src/Core/SelfAdjointView.h +33 -20
- package/eigen/Eigen/src/Core/Solve.h +14 -14
- package/eigen/Eigen/src/Core/SolveTriangular.h +13 -13
- package/eigen/Eigen/src/Core/SolverBase.h +41 -3
- package/eigen/Eigen/src/Core/StableNorm.h +100 -70
- package/eigen/Eigen/src/Core/StlIterators.h +463 -0
- package/eigen/Eigen/src/Core/Stride.h +9 -4
- package/eigen/Eigen/src/Core/Swap.h +5 -4
- package/eigen/Eigen/src/Core/Transpose.h +86 -27
- package/eigen/Eigen/src/Core/Transpositions.h +26 -8
- package/eigen/Eigen/src/Core/TriangularMatrix.h +88 -72
- package/eigen/Eigen/src/Core/VectorBlock.h +5 -5
- package/eigen/Eigen/src/Core/VectorwiseOp.h +159 -70
- package/eigen/Eigen/src/Core/Visitor.h +137 -29
- package/eigen/Eigen/src/Core/arch/AVX/Complex.h +50 -129
- package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +126 -337
- package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +1092 -155
- package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +65 -1
- package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +422 -0
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +186 -213
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1250 -252
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +89 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +152 -165
- package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +19 -251
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2937 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +221 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +629 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2042 -392
- package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +235 -80
- package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +700 -0
- package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +102 -14
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1649 -0
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +110 -0
- package/eigen/Eigen/src/Core/arch/Default/Half.h +942 -0
- package/eigen/Eigen/src/Core/arch/Default/Settings.h +1 -1
- package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +120 -0
- package/eigen/Eigen/src/Core/arch/{CUDA → GPU}/MathFunctions.h +16 -4
- package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1685 -0
- package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +80 -0
- package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
- package/eigen/Eigen/src/Core/arch/MSA/Complex.h +648 -0
- package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +387 -0
- package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1233 -0
- package/eigen/Eigen/src/Core/arch/NEON/Complex.h +313 -219
- package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +183 -0
- package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +54 -70
- package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4376 -549
- package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1419 -0
- package/eigen/Eigen/src/Core/arch/SSE/Complex.h +59 -179
- package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +65 -428
- package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +893 -283
- package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +65 -0
- package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +44 -0
- package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +752 -0
- package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +49 -0
- package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +232 -0
- package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +301 -0
- package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +670 -0
- package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +694 -0
- package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +85 -0
- package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +212 -183
- package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +101 -5
- package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +510 -395
- package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +11 -2
- package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +112 -46
- package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +31 -30
- package/eigen/Eigen/src/Core/functors/StlFunctors.h +32 -2
- package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +354 -15
- package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1073 -585
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +29 -7
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +4 -4
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +1 -1
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +382 -483
- package/eigen/Eigen/src/Core/products/Parallelizer.h +23 -9
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +23 -6
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +8 -6
- package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +2 -2
- package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +5 -4
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +3 -3
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +5 -3
- package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +13 -10
- package/eigen/Eigen/src/Core/util/BlasUtil.h +208 -124
- package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +512 -0
- package/eigen/Eigen/src/Core/util/Constants.h +25 -9
- package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +14 -2
- package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +28 -4
- package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +186 -0
- package/eigen/Eigen/src/Core/util/IntegralConstant.h +272 -0
- package/eigen/Eigen/src/Core/util/MKL_support.h +8 -1
- package/eigen/Eigen/src/Core/util/Macros.h +661 -250
- package/eigen/Eigen/src/Core/util/Memory.h +222 -52
- package/eigen/Eigen/src/Core/util/Meta.h +349 -105
- package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
- package/eigen/Eigen/src/Core/util/StaticAssert.h +8 -5
- package/eigen/Eigen/src/Core/util/SymbolicIndex.h +293 -0
- package/eigen/Eigen/src/Core/util/XprHelper.h +48 -30
- package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +1 -1
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +1 -1
- package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +2 -2
- package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +1 -1
- package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +2 -2
- package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +2 -2
- package/eigen/Eigen/src/Eigenvalues/RealQZ.h +9 -6
- package/eigen/Eigen/src/Eigenvalues/RealSchur.h +10 -5
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +75 -42
- package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +20 -15
- package/eigen/Eigen/src/Geometry/AlignedBox.h +99 -5
- package/eigen/Eigen/src/Geometry/AngleAxis.h +4 -4
- package/eigen/Eigen/src/Geometry/EulerAngles.h +3 -3
- package/eigen/Eigen/src/Geometry/Homogeneous.h +15 -11
- package/eigen/Eigen/src/Geometry/Hyperplane.h +1 -1
- package/eigen/Eigen/src/Geometry/OrthoMethods.h +3 -2
- package/eigen/Eigen/src/Geometry/ParametrizedLine.h +39 -2
- package/eigen/Eigen/src/Geometry/Quaternion.h +52 -14
- package/eigen/Eigen/src/Geometry/Rotation2D.h +3 -3
- package/eigen/Eigen/src/Geometry/Scaling.h +22 -4
- package/eigen/Eigen/src/Geometry/Transform.h +86 -65
- package/eigen/Eigen/src/Geometry/Translation.h +6 -6
- package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +168 -0
- package/eigen/Eigen/src/Householder/BlockHouseholder.h +9 -2
- package/eigen/Eigen/src/Householder/Householder.h +8 -4
- package/eigen/Eigen/src/Householder/HouseholderSequence.h +123 -48
- package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +15 -15
- package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +7 -23
- package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +5 -22
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +41 -47
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +51 -60
- package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +70 -20
- package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +2 -20
- package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +11 -9
- package/eigen/Eigen/src/Jacobi/Jacobi.h +31 -10
- package/eigen/Eigen/src/KLUSupport/KLUSupport.h +358 -0
- package/eigen/Eigen/src/LU/Determinant.h +35 -19
- package/eigen/Eigen/src/LU/FullPivLU.h +29 -43
- package/eigen/Eigen/src/LU/InverseImpl.h +25 -8
- package/eigen/Eigen/src/LU/PartialPivLU.h +67 -57
- package/eigen/Eigen/src/LU/arch/InverseSize4.h +351 -0
- package/eigen/Eigen/src/OrderingMethods/Amd.h +7 -17
- package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +297 -277
- package/eigen/Eigen/src/OrderingMethods/Ordering.h +6 -10
- package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +1 -1
- package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +10 -9
- package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +41 -20
- package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +100 -27
- package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +59 -22
- package/eigen/Eigen/src/QR/HouseholderQR.h +48 -23
- package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +25 -3
- package/eigen/Eigen/src/SVD/BDCSVD.h +137 -48
- package/eigen/Eigen/src/SVD/JacobiSVD.h +22 -14
- package/eigen/Eigen/src/SVD/SVDBase.h +82 -21
- package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +3 -3
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +16 -8
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +11 -36
- package/eigen/Eigen/src/SparseCore/CompressedStorage.h +16 -0
- package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +6 -6
- package/eigen/Eigen/src/SparseCore/SparseAssign.h +81 -27
- package/eigen/Eigen/src/SparseCore/SparseBlock.h +25 -57
- package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +40 -11
- package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +11 -15
- package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +2 -2
- package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +30 -8
- package/eigen/Eigen/src/SparseCore/SparseMatrix.h +124 -10
- package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +5 -12
- package/eigen/Eigen/src/SparseCore/SparseProduct.h +13 -1
- package/eigen/Eigen/src/SparseCore/SparseRef.h +7 -7
- package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +3 -0
- package/eigen/Eigen/src/SparseCore/SparseUtil.h +8 -0
- package/eigen/Eigen/src/SparseCore/SparseVector.h +1 -1
- package/eigen/Eigen/src/SparseLU/SparseLU.h +160 -10
- package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +1 -1
- package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +76 -2
- package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +2 -2
- package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +1 -1
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +1 -1
- package/eigen/Eigen/src/SparseQR/SparseQR.h +19 -6
- package/eigen/Eigen/src/StlSupport/StdDeque.h +2 -14
- package/eigen/Eigen/src/StlSupport/StdList.h +2 -2
- package/eigen/Eigen/src/StlSupport/StdVector.h +2 -2
- package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +6 -8
- package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +175 -39
- package/eigen/Eigen/src/misc/lapacke.h +5 -4
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +27 -1
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +155 -11
- package/eigen/Eigen/src/plugins/BlockMethods.h +626 -242
- package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +14 -0
- package/eigen/Eigen/src/plugins/IndexedViewMethods.h +262 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +4 -4
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +10 -0
- package/eigen/Eigen/src/plugins/ReshapedMethods.h +149 -0
- package/eigen/README.md +5 -0
- package/lib/LibEigen.d.ts +4 -0
- package/lib/LibEigen.js +14 -0
- package/lib/index.d.ts +1 -1
- package/lib/index.js +7 -3
- package/package.json +2 -10
- package/eigen/Eigen/CMakeLists.txt +0 -19
- package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
- package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
- package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
- package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
- package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
- package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
- package/lib/eigen.d.ts +0 -2
- package/lib/eigen.js +0 -15
|
@@ -15,7 +15,13 @@ namespace Eigen {
|
|
|
15
15
|
|
|
16
16
|
namespace internal {
|
|
17
17
|
|
|
18
|
-
|
|
18
|
+
enum GEBPPacketSizeType {
|
|
19
|
+
GEBPPacketFull = 0,
|
|
20
|
+
GEBPPacketHalf,
|
|
21
|
+
GEBPPacketQuarter
|
|
22
|
+
};
|
|
23
|
+
|
|
24
|
+
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target, int _PacketSize=GEBPPacketFull>
|
|
19
25
|
class gebp_traits;
|
|
20
26
|
|
|
21
27
|
|
|
@@ -25,16 +31,42 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff
|
|
|
25
31
|
return a<=0 ? b : a;
|
|
26
32
|
}
|
|
27
33
|
|
|
34
|
+
#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
|
|
35
|
+
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
|
|
36
|
+
#else
|
|
37
|
+
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
|
|
38
|
+
#endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
|
|
39
|
+
|
|
40
|
+
#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
|
|
41
|
+
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
|
|
42
|
+
#else
|
|
43
|
+
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
|
|
44
|
+
#endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
|
|
45
|
+
|
|
46
|
+
#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
|
|
47
|
+
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
|
|
48
|
+
#else
|
|
49
|
+
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
|
|
50
|
+
#endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
|
|
51
|
+
|
|
28
52
|
#if EIGEN_ARCH_i386_OR_x86_64
|
|
29
|
-
const std::ptrdiff_t defaultL1CacheSize = 32*1024;
|
|
30
|
-
const std::ptrdiff_t defaultL2CacheSize = 256*1024;
|
|
31
|
-
const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
|
|
53
|
+
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024);
|
|
54
|
+
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024);
|
|
55
|
+
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024);
|
|
56
|
+
#elif EIGEN_ARCH_PPC
|
|
57
|
+
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024);
|
|
58
|
+
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
|
|
59
|
+
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024);
|
|
32
60
|
#else
|
|
33
|
-
const std::ptrdiff_t defaultL1CacheSize = 16*1024;
|
|
34
|
-
const std::ptrdiff_t defaultL2CacheSize = 512*1024;
|
|
35
|
-
const std::ptrdiff_t defaultL3CacheSize = 512*1024;
|
|
61
|
+
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024);
|
|
62
|
+
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
|
|
63
|
+
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024);
|
|
36
64
|
#endif
|
|
37
65
|
|
|
66
|
+
#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
|
|
67
|
+
#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
|
|
68
|
+
#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
|
|
69
|
+
|
|
38
70
|
/** \internal */
|
|
39
71
|
struct CacheSizes {
|
|
40
72
|
CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
|
|
@@ -50,7 +82,6 @@ struct CacheSizes {
|
|
|
50
82
|
std::ptrdiff_t m_l3;
|
|
51
83
|
};
|
|
52
84
|
|
|
53
|
-
|
|
54
85
|
/** \internal */
|
|
55
86
|
inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
|
|
56
87
|
{
|
|
@@ -101,6 +132,16 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
|
|
101
132
|
// at the register level. This small horizontal panel has to stay within L1 cache.
|
|
102
133
|
std::ptrdiff_t l1, l2, l3;
|
|
103
134
|
manage_caching_sizes(GetAction, &l1, &l2, &l3);
|
|
135
|
+
#ifdef EIGEN_VECTORIZE_AVX512
|
|
136
|
+
// We need to find a rationale for that, but without this adjustment,
|
|
137
|
+
// performance with AVX512 is pretty bad, like -20% slower.
|
|
138
|
+
// One reason is that with increasing packet-size, the blocking size k
|
|
139
|
+
// has to become pretty small if we want that 1 lhs panel fit within L1.
|
|
140
|
+
// For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are:
|
|
141
|
+
// k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144.
|
|
142
|
+
// This is quite small for a good reuse of the accumulation registers.
|
|
143
|
+
l1 *= 4;
|
|
144
|
+
#endif
|
|
104
145
|
|
|
105
146
|
if (num_threads > 1) {
|
|
106
147
|
typedef typename Traits::ResScalar ResScalar;
|
|
@@ -308,35 +349,60 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_
|
|
|
308
349
|
computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
|
|
309
350
|
}
|
|
310
351
|
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
|
|
319
|
-
{
|
|
320
|
-
c = cj.pmadd(a,b,c);
|
|
321
|
-
}
|
|
322
|
-
};
|
|
323
|
-
|
|
324
|
-
template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
|
|
325
|
-
EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
|
|
326
|
-
{
|
|
327
|
-
t = b; t = cj.pmul(a,t); c = padd(c,t);
|
|
328
|
-
}
|
|
329
|
-
};
|
|
352
|
+
template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
|
|
353
|
+
struct RhsPanelHelper {
|
|
354
|
+
private:
|
|
355
|
+
static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken;
|
|
356
|
+
public:
|
|
357
|
+
typedef typename conditional<remaining_registers>=4, RhsPacketx4, RhsPacket>::type type;
|
|
358
|
+
};
|
|
330
359
|
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
}
|
|
360
|
+
template <typename Packet>
|
|
361
|
+
struct QuadPacket
|
|
362
|
+
{
|
|
363
|
+
Packet B_0, B1, B2, B3;
|
|
364
|
+
const Packet& get(const FixedInt<0>&) const { return B_0; }
|
|
365
|
+
const Packet& get(const FixedInt<1>&) const { return B1; }
|
|
366
|
+
const Packet& get(const FixedInt<2>&) const { return B2; }
|
|
367
|
+
const Packet& get(const FixedInt<3>&) const { return B3; }
|
|
368
|
+
};
|
|
336
369
|
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
370
|
+
template <int N, typename T1, typename T2, typename T3>
|
|
371
|
+
struct packet_conditional { typedef T3 type; };
|
|
372
|
+
|
|
373
|
+
template <typename T1, typename T2, typename T3>
|
|
374
|
+
struct packet_conditional<GEBPPacketFull, T1, T2, T3> { typedef T1 type; };
|
|
375
|
+
|
|
376
|
+
template <typename T1, typename T2, typename T3>
|
|
377
|
+
struct packet_conditional<GEBPPacketHalf, T1, T2, T3> { typedef T2 type; };
|
|
378
|
+
|
|
379
|
+
#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \
|
|
380
|
+
typedef typename packet_conditional<packet_size, \
|
|
381
|
+
typename packet_traits<name ## Scalar>::type, \
|
|
382
|
+
typename packet_traits<name ## Scalar>::half, \
|
|
383
|
+
typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
|
|
384
|
+
prefix ## name ## Packet
|
|
385
|
+
|
|
386
|
+
#define PACKET_DECL_COND(name, packet_size) \
|
|
387
|
+
typedef typename packet_conditional<packet_size, \
|
|
388
|
+
typename packet_traits<name ## Scalar>::type, \
|
|
389
|
+
typename packet_traits<name ## Scalar>::half, \
|
|
390
|
+
typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
|
|
391
|
+
name ## Packet
|
|
392
|
+
|
|
393
|
+
#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \
|
|
394
|
+
typedef typename packet_conditional<packet_size, \
|
|
395
|
+
typename packet_traits<Scalar>::type, \
|
|
396
|
+
typename packet_traits<Scalar>::half, \
|
|
397
|
+
typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
|
|
398
|
+
prefix ## ScalarPacket
|
|
399
|
+
|
|
400
|
+
#define PACKET_DECL_COND_SCALAR(packet_size) \
|
|
401
|
+
typedef typename packet_conditional<packet_size, \
|
|
402
|
+
typename packet_traits<Scalar>::type, \
|
|
403
|
+
typename packet_traits<Scalar>::half, \
|
|
404
|
+
typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
|
|
405
|
+
ScalarPacket
|
|
340
406
|
|
|
341
407
|
/* Vectorization logic
|
|
342
408
|
* real*real: unpack rhs to constant packets, ...
|
|
@@ -348,7 +414,7 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_
|
|
|
348
414
|
* cplx*real : unpack rhs to constant packets, ...
|
|
349
415
|
* real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
|
|
350
416
|
*/
|
|
351
|
-
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
|
|
417
|
+
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
|
|
352
418
|
class gebp_traits
|
|
353
419
|
{
|
|
354
420
|
public:
|
|
@@ -356,13 +422,17 @@ public:
|
|
|
356
422
|
typedef _RhsScalar RhsScalar;
|
|
357
423
|
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
|
358
424
|
|
|
425
|
+
PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
|
|
426
|
+
PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
|
|
427
|
+
PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
|
|
428
|
+
|
|
359
429
|
enum {
|
|
360
430
|
ConjLhs = _ConjLhs,
|
|
361
431
|
ConjRhs = _ConjRhs,
|
|
362
|
-
Vectorizable =
|
|
363
|
-
LhsPacketSize = Vectorizable ?
|
|
364
|
-
RhsPacketSize = Vectorizable ?
|
|
365
|
-
ResPacketSize = Vectorizable ?
|
|
432
|
+
Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
|
|
433
|
+
LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
|
|
434
|
+
RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
|
|
435
|
+
ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
|
|
366
436
|
|
|
367
437
|
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
|
|
368
438
|
|
|
@@ -371,10 +441,12 @@ public:
|
|
|
371
441
|
|
|
372
442
|
// register block size along the M direction (currently, this one cannot be modified)
|
|
373
443
|
default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
|
|
374
|
-
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
|
|
375
|
-
|
|
444
|
+
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \
|
|
445
|
+
&& ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914))
|
|
446
|
+
// we assume 16 registers or more
|
|
376
447
|
// See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
|
|
377
448
|
// then using 3*LhsPacketSize triggers non-implemented paths in syrk.
|
|
449
|
+
// Bug 1515: MSVC prior to v19.14 yields to register spilling.
|
|
378
450
|
mr = Vectorizable ? 3*LhsPacketSize : default_mr,
|
|
379
451
|
#else
|
|
380
452
|
mr = default_mr,
|
|
@@ -384,37 +456,41 @@ public:
|
|
|
384
456
|
RhsProgress = 1
|
|
385
457
|
};
|
|
386
458
|
|
|
387
|
-
typedef typename packet_traits<LhsScalar>::type _LhsPacket;
|
|
388
|
-
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
|
|
389
|
-
typedef typename packet_traits<ResScalar>::type _ResPacket;
|
|
390
459
|
|
|
391
460
|
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
|
|
392
461
|
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
|
|
393
462
|
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
|
|
463
|
+
typedef LhsPacket LhsPacket4Packing;
|
|
394
464
|
|
|
465
|
+
typedef QuadPacket<RhsPacket> RhsPacketx4;
|
|
395
466
|
typedef ResPacket AccPacket;
|
|
396
467
|
|
|
397
468
|
EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
|
|
398
469
|
{
|
|
399
470
|
p = pset1<ResPacket>(ResScalar(0));
|
|
400
471
|
}
|
|
401
|
-
|
|
402
|
-
EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
|
|
403
|
-
{
|
|
404
|
-
pbroadcast4(b, b0, b1, b2, b3);
|
|
405
|
-
}
|
|
406
|
-
|
|
407
|
-
// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
|
|
408
|
-
// {
|
|
409
|
-
// pbroadcast2(b, b0, b1);
|
|
410
|
-
// }
|
|
411
|
-
|
|
472
|
+
|
|
412
473
|
template<typename RhsPacketType>
|
|
413
474
|
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
|
|
414
475
|
{
|
|
415
476
|
dest = pset1<RhsPacketType>(*b);
|
|
416
477
|
}
|
|
417
|
-
|
|
478
|
+
|
|
479
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
|
|
480
|
+
{
|
|
481
|
+
pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
|
|
482
|
+
}
|
|
483
|
+
|
|
484
|
+
template<typename RhsPacketType>
|
|
485
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
|
|
486
|
+
{
|
|
487
|
+
loadRhs(b, dest);
|
|
488
|
+
}
|
|
489
|
+
|
|
490
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
|
|
491
|
+
{
|
|
492
|
+
}
|
|
493
|
+
|
|
418
494
|
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
|
|
419
495
|
{
|
|
420
496
|
dest = ploadquad<RhsPacket>(b);
|
|
@@ -432,8 +508,8 @@ public:
|
|
|
432
508
|
dest = ploadu<LhsPacketType>(a);
|
|
433
509
|
}
|
|
434
510
|
|
|
435
|
-
template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
|
|
436
|
-
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c,
|
|
511
|
+
template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
512
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
|
|
437
513
|
{
|
|
438
514
|
conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
|
|
439
515
|
// It would be a lot cleaner to call pmadd all the time. Unfortunately if we
|
|
@@ -448,6 +524,12 @@ public:
|
|
|
448
524
|
#endif
|
|
449
525
|
}
|
|
450
526
|
|
|
527
|
+
template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
528
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
|
|
529
|
+
{
|
|
530
|
+
madd(a, b.get(lane), c, tmp, lane);
|
|
531
|
+
}
|
|
532
|
+
|
|
451
533
|
EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
|
|
452
534
|
{
|
|
453
535
|
r = pmadd(c,alpha,r);
|
|
@@ -461,21 +543,25 @@ public:
|
|
|
461
543
|
|
|
462
544
|
};
|
|
463
545
|
|
|
464
|
-
template<typename RealScalar, bool _ConjLhs>
|
|
465
|
-
class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
|
|
546
|
+
template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize>
|
|
547
|
+
class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize>
|
|
466
548
|
{
|
|
467
549
|
public:
|
|
468
550
|
typedef std::complex<RealScalar> LhsScalar;
|
|
469
551
|
typedef RealScalar RhsScalar;
|
|
470
552
|
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
|
471
553
|
|
|
554
|
+
PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
|
|
555
|
+
PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
|
|
556
|
+
PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
|
|
557
|
+
|
|
472
558
|
enum {
|
|
473
559
|
ConjLhs = _ConjLhs,
|
|
474
560
|
ConjRhs = false,
|
|
475
|
-
Vectorizable =
|
|
476
|
-
LhsPacketSize = Vectorizable ?
|
|
477
|
-
RhsPacketSize = Vectorizable ?
|
|
478
|
-
ResPacketSize = Vectorizable ?
|
|
561
|
+
Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
|
|
562
|
+
LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
|
|
563
|
+
RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
|
|
564
|
+
ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
|
|
479
565
|
|
|
480
566
|
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
|
|
481
567
|
nr = 4,
|
|
@@ -490,13 +576,12 @@ public:
|
|
|
490
576
|
RhsProgress = 1
|
|
491
577
|
};
|
|
492
578
|
|
|
493
|
-
typedef typename packet_traits<LhsScalar>::type _LhsPacket;
|
|
494
|
-
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
|
|
495
|
-
typedef typename packet_traits<ResScalar>::type _ResPacket;
|
|
496
|
-
|
|
497
579
|
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
|
|
498
580
|
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
|
|
499
581
|
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
|
|
582
|
+
typedef LhsPacket LhsPacket4Packing;
|
|
583
|
+
|
|
584
|
+
typedef QuadPacket<RhsPacket> RhsPacketx4;
|
|
500
585
|
|
|
501
586
|
typedef ResPacket AccPacket;
|
|
502
587
|
|
|
@@ -505,42 +590,64 @@ public:
|
|
|
505
590
|
p = pset1<ResPacket>(ResScalar(0));
|
|
506
591
|
}
|
|
507
592
|
|
|
508
|
-
|
|
593
|
+
template<typename RhsPacketType>
|
|
594
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
|
|
509
595
|
{
|
|
510
|
-
dest = pset1<
|
|
596
|
+
dest = pset1<RhsPacketType>(*b);
|
|
597
|
+
}
|
|
598
|
+
|
|
599
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
|
|
600
|
+
{
|
|
601
|
+
pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
|
|
511
602
|
}
|
|
603
|
+
|
|
604
|
+
template<typename RhsPacketType>
|
|
605
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
|
|
606
|
+
{
|
|
607
|
+
loadRhs(b, dest);
|
|
608
|
+
}
|
|
609
|
+
|
|
610
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
|
|
611
|
+
{}
|
|
512
612
|
|
|
513
613
|
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
|
|
514
614
|
{
|
|
515
|
-
dest
|
|
615
|
+
loadRhsQuad_impl(b,dest, typename conditional<RhsPacketSize==16,true_type,false_type>::type());
|
|
516
616
|
}
|
|
517
617
|
|
|
518
|
-
EIGEN_STRONG_INLINE void
|
|
618
|
+
EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const
|
|
519
619
|
{
|
|
520
|
-
|
|
620
|
+
// FIXME we can do better!
|
|
621
|
+
// what we want here is a ploadheight
|
|
622
|
+
RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
|
|
623
|
+
dest = ploadquad<RhsPacket>(tmp);
|
|
521
624
|
}
|
|
522
625
|
|
|
523
|
-
EIGEN_STRONG_INLINE void
|
|
626
|
+
EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const
|
|
524
627
|
{
|
|
525
|
-
|
|
628
|
+
eigen_internal_assert(RhsPacketSize<=8);
|
|
629
|
+
dest = pset1<RhsPacket>(*b);
|
|
526
630
|
}
|
|
527
631
|
|
|
528
|
-
EIGEN_STRONG_INLINE void
|
|
632
|
+
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
|
|
529
633
|
{
|
|
530
|
-
|
|
634
|
+
dest = pload<LhsPacket>(a);
|
|
531
635
|
}
|
|
532
|
-
|
|
533
|
-
// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
|
|
534
|
-
// {
|
|
535
|
-
// pbroadcast2(b, b0, b1);
|
|
536
|
-
// }
|
|
537
636
|
|
|
538
|
-
|
|
637
|
+
template<typename LhsPacketType>
|
|
638
|
+
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
|
|
639
|
+
{
|
|
640
|
+
dest = ploadu<LhsPacketType>(a);
|
|
641
|
+
}
|
|
642
|
+
|
|
643
|
+
template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
644
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
|
|
539
645
|
{
|
|
540
646
|
madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
|
|
541
647
|
}
|
|
542
648
|
|
|
543
|
-
|
|
649
|
+
template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
|
|
650
|
+
EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
|
|
544
651
|
{
|
|
545
652
|
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
|
546
653
|
EIGEN_UNUSED_VARIABLE(tmp);
|
|
@@ -555,13 +662,20 @@ public:
|
|
|
555
662
|
c += a * b;
|
|
556
663
|
}
|
|
557
664
|
|
|
558
|
-
|
|
665
|
+
template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
666
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
|
|
559
667
|
{
|
|
668
|
+
madd(a, b.get(lane), c, tmp, lane);
|
|
669
|
+
}
|
|
670
|
+
|
|
671
|
+
template <typename ResPacketType, typename AccPacketType>
|
|
672
|
+
EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
|
|
673
|
+
{
|
|
674
|
+
conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
|
|
560
675
|
r = cj.pmadd(c,alpha,r);
|
|
561
676
|
}
|
|
562
677
|
|
|
563
678
|
protected:
|
|
564
|
-
conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
|
|
565
679
|
};
|
|
566
680
|
|
|
567
681
|
template<typename Packet>
|
|
@@ -580,13 +694,57 @@ DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Pack
|
|
|
580
694
|
return res;
|
|
581
695
|
}
|
|
582
696
|
|
|
697
|
+
// note that for DoublePacket<RealPacket> the "4" in "downto4"
|
|
698
|
+
// corresponds to the number of complexes, so it means "8"
|
|
699
|
+
// in terms of real coefficients.
|
|
700
|
+
|
|
583
701
|
template<typename Packet>
|
|
584
|
-
const DoublePacket<Packet>&
|
|
702
|
+
const DoublePacket<Packet>&
|
|
703
|
+
predux_half_dowto4(const DoublePacket<Packet> &a,
|
|
704
|
+
typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0)
|
|
585
705
|
{
|
|
586
706
|
return a;
|
|
587
707
|
}
|
|
588
708
|
|
|
589
|
-
template<typename Packet>
|
|
709
|
+
template<typename Packet>
|
|
710
|
+
DoublePacket<typename unpacket_traits<Packet>::half>
|
|
711
|
+
predux_half_dowto4(const DoublePacket<Packet> &a,
|
|
712
|
+
typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0)
|
|
713
|
+
{
|
|
714
|
+
// yes, that's pretty hackish :(
|
|
715
|
+
DoublePacket<typename unpacket_traits<Packet>::half> res;
|
|
716
|
+
typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
|
|
717
|
+
typedef typename packet_traits<Cplx>::type CplxPacket;
|
|
718
|
+
res.first = predux_half_dowto4(CplxPacket(a.first)).v;
|
|
719
|
+
res.second = predux_half_dowto4(CplxPacket(a.second)).v;
|
|
720
|
+
return res;
|
|
721
|
+
}
|
|
722
|
+
|
|
723
|
+
// same here, "quad" actually means "8" in terms of real coefficients
|
|
724
|
+
template<typename Scalar, typename RealPacket>
|
|
725
|
+
void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
|
|
726
|
+
typename enable_if<unpacket_traits<RealPacket>::size<=8>::type* = 0)
|
|
727
|
+
{
|
|
728
|
+
dest.first = pset1<RealPacket>(numext::real(*b));
|
|
729
|
+
dest.second = pset1<RealPacket>(numext::imag(*b));
|
|
730
|
+
}
|
|
731
|
+
|
|
732
|
+
template<typename Scalar, typename RealPacket>
|
|
733
|
+
void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
|
|
734
|
+
typename enable_if<unpacket_traits<RealPacket>::size==16>::type* = 0)
|
|
735
|
+
{
|
|
736
|
+
// yes, that's pretty hackish too :(
|
|
737
|
+
typedef typename NumTraits<Scalar>::Real RealScalar;
|
|
738
|
+
RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
|
|
739
|
+
RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
|
|
740
|
+
dest.first = ploadquad<RealPacket>(r);
|
|
741
|
+
dest.second = ploadquad<RealPacket>(i);
|
|
742
|
+
}
|
|
743
|
+
|
|
744
|
+
|
|
745
|
+
template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > {
|
|
746
|
+
typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
|
|
747
|
+
};
|
|
590
748
|
// template<typename Packet>
|
|
591
749
|
// DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
|
|
592
750
|
// {
|
|
@@ -596,8 +754,8 @@ template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typede
|
|
|
596
754
|
// return res;
|
|
597
755
|
// }
|
|
598
756
|
|
|
599
|
-
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
|
|
600
|
-
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
|
|
757
|
+
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
|
|
758
|
+
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize >
|
|
601
759
|
{
|
|
602
760
|
public:
|
|
603
761
|
typedef std::complex<RealScalar> Scalar;
|
|
@@ -605,15 +763,21 @@ public:
|
|
|
605
763
|
typedef std::complex<RealScalar> RhsScalar;
|
|
606
764
|
typedef std::complex<RealScalar> ResScalar;
|
|
607
765
|
|
|
766
|
+
PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
|
|
767
|
+
PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
|
|
768
|
+
PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
|
|
769
|
+
PACKET_DECL_COND(Real, _PacketSize);
|
|
770
|
+
PACKET_DECL_COND_SCALAR(_PacketSize);
|
|
771
|
+
|
|
608
772
|
enum {
|
|
609
773
|
ConjLhs = _ConjLhs,
|
|
610
774
|
ConjRhs = _ConjRhs,
|
|
611
|
-
Vectorizable =
|
|
612
|
-
&&
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
775
|
+
Vectorizable = unpacket_traits<RealPacket>::vectorizable
|
|
776
|
+
&& unpacket_traits<ScalarPacket>::vectorizable,
|
|
777
|
+
ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
|
|
778
|
+
LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
|
|
779
|
+
RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
|
|
780
|
+
RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
|
|
617
781
|
|
|
618
782
|
// FIXME: should depend on NumberOfRegisters
|
|
619
783
|
nr = 4,
|
|
@@ -623,14 +787,16 @@ public:
|
|
|
623
787
|
RhsProgress = 1
|
|
624
788
|
};
|
|
625
789
|
|
|
626
|
-
typedef
|
|
627
|
-
typedef typename packet_traits<Scalar>::type ScalarPacket;
|
|
628
|
-
typedef DoublePacket<RealPacket> DoublePacketType;
|
|
790
|
+
typedef DoublePacket<RealPacket> DoublePacketType;
|
|
629
791
|
|
|
792
|
+
typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type LhsPacket4Packing;
|
|
630
793
|
typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket;
|
|
631
794
|
typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
|
|
632
795
|
typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
|
|
633
796
|
typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
|
|
797
|
+
|
|
798
|
+
// this actually holds 8 packets!
|
|
799
|
+
typedef QuadPacket<RhsPacket> RhsPacketx4;
|
|
634
800
|
|
|
635
801
|
EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
|
|
636
802
|
|
|
@@ -641,51 +807,49 @@ public:
|
|
|
641
807
|
}
|
|
642
808
|
|
|
643
809
|
// Scalar path
|
|
644
|
-
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b,
|
|
810
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const
|
|
645
811
|
{
|
|
646
|
-
dest = pset1<
|
|
812
|
+
dest = pset1<ScalarPacket>(*b);
|
|
647
813
|
}
|
|
648
814
|
|
|
649
815
|
// Vectorized path
|
|
650
|
-
|
|
816
|
+
template<typename RealPacketType>
|
|
817
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
|
|
651
818
|
{
|
|
652
|
-
dest.first = pset1<
|
|
653
|
-
dest.second = pset1<
|
|
819
|
+
dest.first = pset1<RealPacketType>(numext::real(*b));
|
|
820
|
+
dest.second = pset1<RealPacketType>(numext::imag(*b));
|
|
654
821
|
}
|
|
655
|
-
|
|
656
|
-
EIGEN_STRONG_INLINE void
|
|
822
|
+
|
|
823
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
|
|
657
824
|
{
|
|
658
|
-
loadRhs(b,dest);
|
|
825
|
+
loadRhs(b, dest.B_0);
|
|
826
|
+
loadRhs(b + 1, dest.B1);
|
|
827
|
+
loadRhs(b + 2, dest.B2);
|
|
828
|
+
loadRhs(b + 3, dest.B3);
|
|
659
829
|
}
|
|
660
|
-
|
|
830
|
+
|
|
831
|
+
// Scalar path
|
|
832
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const
|
|
661
833
|
{
|
|
662
|
-
|
|
663
|
-
loadRhs(b,dest);
|
|
834
|
+
loadRhs(b, dest);
|
|
664
835
|
}
|
|
665
|
-
|
|
666
|
-
|
|
836
|
+
|
|
837
|
+
// Vectorized path
|
|
838
|
+
template<typename RealPacketType>
|
|
839
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
|
|
667
840
|
{
|
|
668
|
-
|
|
669
|
-
loadRhs(b+0, b0);
|
|
670
|
-
loadRhs(b+1, b1);
|
|
671
|
-
loadRhs(b+2, b2);
|
|
672
|
-
loadRhs(b+3, b3);
|
|
841
|
+
loadRhs(b, dest);
|
|
673
842
|
}
|
|
843
|
+
|
|
844
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
|
|
674
845
|
|
|
675
|
-
|
|
676
|
-
EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
|
|
846
|
+
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
|
|
677
847
|
{
|
|
678
|
-
|
|
679
|
-
loadRhs(b+0, b0);
|
|
680
|
-
loadRhs(b+1, b1);
|
|
848
|
+
loadRhs(b,dest);
|
|
681
849
|
}
|
|
682
|
-
|
|
683
|
-
// Scalar path
|
|
684
|
-
EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
|
|
850
|
+
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
|
|
685
851
|
{
|
|
686
|
-
|
|
687
|
-
loadRhs(b+0, b0);
|
|
688
|
-
loadRhs(b+1, b1);
|
|
852
|
+
loadQuadToDoublePacket(b,dest);
|
|
689
853
|
}
|
|
690
854
|
|
|
691
855
|
// nothing special here
|
|
@@ -694,47 +858,59 @@ public:
|
|
|
694
858
|
dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
|
|
695
859
|
}
|
|
696
860
|
|
|
697
|
-
|
|
861
|
+
template<typename LhsPacketType>
|
|
862
|
+
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
|
|
698
863
|
{
|
|
699
|
-
dest = ploadu<
|
|
864
|
+
dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
|
|
700
865
|
}
|
|
701
866
|
|
|
702
|
-
|
|
867
|
+
template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType>
|
|
868
|
+
EIGEN_STRONG_INLINE
|
|
869
|
+
typename enable_if<!is_same<RhsPacketType,RhsPacketx4>::value>::type
|
|
870
|
+
madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const
|
|
703
871
|
{
|
|
704
872
|
c.first = padd(pmul(a,b.first), c.first);
|
|
705
873
|
c.second = padd(pmul(a,b.second),c.second);
|
|
706
874
|
}
|
|
707
875
|
|
|
708
|
-
|
|
876
|
+
template<typename LaneIdType>
|
|
877
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const
|
|
709
878
|
{
|
|
710
879
|
c = cj.pmadd(a,b,c);
|
|
711
880
|
}
|
|
881
|
+
|
|
882
|
+
template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
883
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
|
|
884
|
+
{
|
|
885
|
+
madd(a, b.get(lane), c, tmp, lane);
|
|
886
|
+
}
|
|
712
887
|
|
|
713
888
|
EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
|
|
714
889
|
|
|
715
|
-
|
|
890
|
+
template<typename RealPacketType, typename ResPacketType>
|
|
891
|
+
EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha, ResPacketType& r) const
|
|
716
892
|
{
|
|
717
893
|
// assemble c
|
|
718
|
-
|
|
894
|
+
ResPacketType tmp;
|
|
719
895
|
if((!ConjLhs)&&(!ConjRhs))
|
|
720
896
|
{
|
|
721
|
-
tmp = pcplxflip(pconj(
|
|
722
|
-
tmp = padd(
|
|
897
|
+
tmp = pcplxflip(pconj(ResPacketType(c.second)));
|
|
898
|
+
tmp = padd(ResPacketType(c.first),tmp);
|
|
723
899
|
}
|
|
724
900
|
else if((!ConjLhs)&&(ConjRhs))
|
|
725
901
|
{
|
|
726
|
-
tmp = pconj(pcplxflip(
|
|
727
|
-
tmp = padd(
|
|
902
|
+
tmp = pconj(pcplxflip(ResPacketType(c.second)));
|
|
903
|
+
tmp = padd(ResPacketType(c.first),tmp);
|
|
728
904
|
}
|
|
729
905
|
else if((ConjLhs)&&(!ConjRhs))
|
|
730
906
|
{
|
|
731
|
-
tmp = pcplxflip(
|
|
732
|
-
tmp = padd(pconj(
|
|
907
|
+
tmp = pcplxflip(ResPacketType(c.second));
|
|
908
|
+
tmp = padd(pconj(ResPacketType(c.first)),tmp);
|
|
733
909
|
}
|
|
734
910
|
else if((ConjLhs)&&(ConjRhs))
|
|
735
911
|
{
|
|
736
|
-
tmp = pcplxflip(
|
|
737
|
-
tmp = psub(pconj(
|
|
912
|
+
tmp = pcplxflip(ResPacketType(c.second));
|
|
913
|
+
tmp = psub(pconj(ResPacketType(c.first)),tmp);
|
|
738
914
|
}
|
|
739
915
|
|
|
740
916
|
r = pmadd(tmp,alpha,r);
|
|
@@ -744,8 +920,8 @@ protected:
|
|
|
744
920
|
conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
|
|
745
921
|
};
|
|
746
922
|
|
|
747
|
-
template<typename RealScalar, bool _ConjRhs>
|
|
748
|
-
class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
|
|
923
|
+
template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize>
|
|
924
|
+
class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize >
|
|
749
925
|
{
|
|
750
926
|
public:
|
|
751
927
|
typedef std::complex<RealScalar> Scalar;
|
|
@@ -753,14 +929,25 @@ public:
|
|
|
753
929
|
typedef Scalar RhsScalar;
|
|
754
930
|
typedef Scalar ResScalar;
|
|
755
931
|
|
|
932
|
+
PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
|
|
933
|
+
PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
|
|
934
|
+
PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
|
|
935
|
+
PACKET_DECL_COND_PREFIX(_, Real, _PacketSize);
|
|
936
|
+
PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize);
|
|
937
|
+
|
|
938
|
+
#undef PACKET_DECL_COND_SCALAR_PREFIX
|
|
939
|
+
#undef PACKET_DECL_COND_PREFIX
|
|
940
|
+
#undef PACKET_DECL_COND_SCALAR
|
|
941
|
+
#undef PACKET_DECL_COND
|
|
942
|
+
|
|
756
943
|
enum {
|
|
757
944
|
ConjLhs = false,
|
|
758
945
|
ConjRhs = _ConjRhs,
|
|
759
|
-
Vectorizable =
|
|
760
|
-
&&
|
|
761
|
-
LhsPacketSize = Vectorizable ?
|
|
762
|
-
RhsPacketSize = Vectorizable ?
|
|
763
|
-
ResPacketSize = Vectorizable ?
|
|
946
|
+
Vectorizable = unpacket_traits<_RealPacket>::vectorizable
|
|
947
|
+
&& unpacket_traits<_ScalarPacket>::vectorizable,
|
|
948
|
+
LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
|
|
949
|
+
RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
|
|
950
|
+
ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
|
|
764
951
|
|
|
765
952
|
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
|
|
766
953
|
// FIXME: should depend on NumberOfRegisters
|
|
@@ -771,14 +958,11 @@ public:
|
|
|
771
958
|
RhsProgress = 1
|
|
772
959
|
};
|
|
773
960
|
|
|
774
|
-
typedef typename packet_traits<LhsScalar>::type _LhsPacket;
|
|
775
|
-
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
|
|
776
|
-
typedef typename packet_traits<ResScalar>::type _ResPacket;
|
|
777
|
-
|
|
778
961
|
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
|
|
779
962
|
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
|
|
780
963
|
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
|
|
781
|
-
|
|
964
|
+
typedef LhsPacket LhsPacket4Packing;
|
|
965
|
+
typedef QuadPacket<RhsPacket> RhsPacketx4;
|
|
782
966
|
typedef ResPacket AccPacket;
|
|
783
967
|
|
|
784
968
|
EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
|
|
@@ -786,22 +970,25 @@ public:
|
|
|
786
970
|
p = pset1<ResPacket>(ResScalar(0));
|
|
787
971
|
}
|
|
788
972
|
|
|
789
|
-
|
|
973
|
+
template<typename RhsPacketType>
|
|
974
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
|
|
790
975
|
{
|
|
791
|
-
dest = pset1<
|
|
976
|
+
dest = pset1<RhsPacketType>(*b);
|
|
792
977
|
}
|
|
793
|
-
|
|
794
|
-
void
|
|
978
|
+
|
|
979
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
|
|
795
980
|
{
|
|
796
|
-
pbroadcast4(b,
|
|
981
|
+
pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
|
|
797
982
|
}
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
983
|
+
|
|
984
|
+
template<typename RhsPacketType>
|
|
985
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
|
|
986
|
+
{
|
|
987
|
+
loadRhs(b, dest);
|
|
988
|
+
}
|
|
989
|
+
|
|
990
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
|
|
991
|
+
{}
|
|
805
992
|
|
|
806
993
|
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
|
|
807
994
|
{
|
|
@@ -810,21 +997,23 @@ public:
|
|
|
810
997
|
|
|
811
998
|
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
|
|
812
999
|
{
|
|
813
|
-
|
|
814
|
-
loadRhs(b,dest);
|
|
1000
|
+
dest = ploadquad<RhsPacket>(b);
|
|
815
1001
|
}
|
|
816
1002
|
|
|
817
|
-
|
|
1003
|
+
template<typename LhsPacketType>
|
|
1004
|
+
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
|
|
818
1005
|
{
|
|
819
|
-
dest = ploaddup<
|
|
1006
|
+
dest = ploaddup<LhsPacketType>(a);
|
|
820
1007
|
}
|
|
821
1008
|
|
|
822
|
-
|
|
1009
|
+
template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
1010
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
|
|
823
1011
|
{
|
|
824
1012
|
madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
|
|
825
1013
|
}
|
|
826
1014
|
|
|
827
|
-
|
|
1015
|
+
template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
|
|
1016
|
+
EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
|
|
828
1017
|
{
|
|
829
1018
|
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
|
830
1019
|
EIGEN_UNUSED_VARIABLE(tmp);
|
|
@@ -840,16 +1029,24 @@ public:
|
|
|
840
1029
|
c += a * b;
|
|
841
1030
|
}
|
|
842
1031
|
|
|
843
|
-
|
|
1032
|
+
template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
1033
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
|
|
1034
|
+
{
|
|
1035
|
+
madd(a, b.get(lane), c, tmp, lane);
|
|
1036
|
+
}
|
|
1037
|
+
|
|
1038
|
+
template <typename ResPacketType, typename AccPacketType>
|
|
1039
|
+
EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
|
|
844
1040
|
{
|
|
1041
|
+
conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
|
|
845
1042
|
r = cj.pmadd(alpha,c,r);
|
|
846
1043
|
}
|
|
847
1044
|
|
|
848
1045
|
protected:
|
|
849
|
-
|
|
1046
|
+
|
|
850
1047
|
};
|
|
851
1048
|
|
|
852
|
-
/* optimized
|
|
1049
|
+
/* optimized General packed Block * packed Panel product kernel
|
|
853
1050
|
*
|
|
854
1051
|
* Mixing type logic: C += A * B
|
|
855
1052
|
* | A | B | comments
|
|
@@ -859,26 +1056,47 @@ protected:
|
|
|
859
1056
|
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
|
|
860
1057
|
struct gebp_kernel
|
|
861
1058
|
{
|
|
862
|
-
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
|
|
1059
|
+
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
|
|
1060
|
+
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketHalf> HalfTraits;
|
|
1061
|
+
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketQuarter> QuarterTraits;
|
|
1062
|
+
|
|
863
1063
|
typedef typename Traits::ResScalar ResScalar;
|
|
864
1064
|
typedef typename Traits::LhsPacket LhsPacket;
|
|
865
1065
|
typedef typename Traits::RhsPacket RhsPacket;
|
|
866
1066
|
typedef typename Traits::ResPacket ResPacket;
|
|
867
1067
|
typedef typename Traits::AccPacket AccPacket;
|
|
1068
|
+
typedef typename Traits::RhsPacketx4 RhsPacketx4;
|
|
1069
|
+
|
|
1070
|
+
typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
|
|
1071
|
+
|
|
1072
|
+
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
|
|
868
1073
|
|
|
869
|
-
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
|
|
870
1074
|
typedef typename SwappedTraits::ResScalar SResScalar;
|
|
871
1075
|
typedef typename SwappedTraits::LhsPacket SLhsPacket;
|
|
872
1076
|
typedef typename SwappedTraits::RhsPacket SRhsPacket;
|
|
873
1077
|
typedef typename SwappedTraits::ResPacket SResPacket;
|
|
874
1078
|
typedef typename SwappedTraits::AccPacket SAccPacket;
|
|
875
1079
|
|
|
1080
|
+
typedef typename HalfTraits::LhsPacket LhsPacketHalf;
|
|
1081
|
+
typedef typename HalfTraits::RhsPacket RhsPacketHalf;
|
|
1082
|
+
typedef typename HalfTraits::ResPacket ResPacketHalf;
|
|
1083
|
+
typedef typename HalfTraits::AccPacket AccPacketHalf;
|
|
1084
|
+
|
|
1085
|
+
typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
|
|
1086
|
+
typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
|
|
1087
|
+
typedef typename QuarterTraits::ResPacket ResPacketQuarter;
|
|
1088
|
+
typedef typename QuarterTraits::AccPacket AccPacketQuarter;
|
|
1089
|
+
|
|
876
1090
|
typedef typename DataMapper::LinearMapper LinearMapper;
|
|
877
1091
|
|
|
878
1092
|
enum {
|
|
879
1093
|
Vectorizable = Traits::Vectorizable,
|
|
880
1094
|
LhsProgress = Traits::LhsProgress,
|
|
1095
|
+
LhsProgressHalf = HalfTraits::LhsProgress,
|
|
1096
|
+
LhsProgressQuarter = QuarterTraits::LhsProgress,
|
|
881
1097
|
RhsProgress = Traits::RhsProgress,
|
|
1098
|
+
RhsProgressHalf = HalfTraits::RhsProgress,
|
|
1099
|
+
RhsProgressQuarter = QuarterTraits::RhsProgress,
|
|
882
1100
|
ResPacketSize = Traits::ResPacketSize
|
|
883
1101
|
};
|
|
884
1102
|
|
|
@@ -888,6 +1106,299 @@ struct gebp_kernel
|
|
|
888
1106
|
Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
|
|
889
1107
|
};
|
|
890
1108
|
|
|
1109
|
+
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs,
|
|
1110
|
+
int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress>
|
|
1111
|
+
struct last_row_process_16_packets
|
|
1112
|
+
{
|
|
1113
|
+
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
|
|
1114
|
+
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
|
|
1115
|
+
|
|
1116
|
+
typedef typename Traits::ResScalar ResScalar;
|
|
1117
|
+
typedef typename SwappedTraits::LhsPacket SLhsPacket;
|
|
1118
|
+
typedef typename SwappedTraits::RhsPacket SRhsPacket;
|
|
1119
|
+
typedef typename SwappedTraits::ResPacket SResPacket;
|
|
1120
|
+
typedef typename SwappedTraits::AccPacket SAccPacket;
|
|
1121
|
+
|
|
1122
|
+
EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
|
|
1123
|
+
const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
|
|
1124
|
+
ResScalar alpha, SAccPacket &C0)
|
|
1125
|
+
{
|
|
1126
|
+
EIGEN_UNUSED_VARIABLE(res);
|
|
1127
|
+
EIGEN_UNUSED_VARIABLE(straits);
|
|
1128
|
+
EIGEN_UNUSED_VARIABLE(blA);
|
|
1129
|
+
EIGEN_UNUSED_VARIABLE(blB);
|
|
1130
|
+
EIGEN_UNUSED_VARIABLE(depth);
|
|
1131
|
+
EIGEN_UNUSED_VARIABLE(endk);
|
|
1132
|
+
EIGEN_UNUSED_VARIABLE(i);
|
|
1133
|
+
EIGEN_UNUSED_VARIABLE(j2);
|
|
1134
|
+
EIGEN_UNUSED_VARIABLE(alpha);
|
|
1135
|
+
EIGEN_UNUSED_VARIABLE(C0);
|
|
1136
|
+
}
|
|
1137
|
+
};
|
|
1138
|
+
|
|
1139
|
+
|
|
1140
|
+
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
|
|
1141
|
+
struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> {
|
|
1142
|
+
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
|
|
1143
|
+
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
|
|
1144
|
+
|
|
1145
|
+
typedef typename Traits::ResScalar ResScalar;
|
|
1146
|
+
typedef typename SwappedTraits::LhsPacket SLhsPacket;
|
|
1147
|
+
typedef typename SwappedTraits::RhsPacket SRhsPacket;
|
|
1148
|
+
typedef typename SwappedTraits::ResPacket SResPacket;
|
|
1149
|
+
typedef typename SwappedTraits::AccPacket SAccPacket;
|
|
1150
|
+
|
|
1151
|
+
EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
|
|
1152
|
+
const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
|
|
1153
|
+
ResScalar alpha, SAccPacket &C0)
|
|
1154
|
+
{
|
|
1155
|
+
typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
|
|
1156
|
+
typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
|
|
1157
|
+
typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
|
|
1158
|
+
typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter;
|
|
1159
|
+
|
|
1160
|
+
SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
|
|
1161
|
+
SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);
|
|
1162
|
+
|
|
1163
|
+
if (depth - endk > 0)
|
|
1164
|
+
{
|
|
1165
|
+
// We have to handle the last row(s) of the rhs, which
|
|
1166
|
+
// correspond to a half-packet
|
|
1167
|
+
SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));
|
|
1168
|
+
|
|
1169
|
+
for (Index kk = endk; kk < depth; kk++)
|
|
1170
|
+
{
|
|
1171
|
+
SLhsPacketQuarter a0;
|
|
1172
|
+
SRhsPacketQuarter b0;
|
|
1173
|
+
straits.loadLhsUnaligned(blB, a0);
|
|
1174
|
+
straits.loadRhs(blA, b0);
|
|
1175
|
+
straits.madd(a0,b0,c0,b0, fix<0>);
|
|
1176
|
+
blB += SwappedTraits::LhsProgress/4;
|
|
1177
|
+
blA += 1;
|
|
1178
|
+
}
|
|
1179
|
+
straits.acc(c0, alphav, R);
|
|
1180
|
+
}
|
|
1181
|
+
else
|
|
1182
|
+
{
|
|
1183
|
+
straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
|
|
1184
|
+
}
|
|
1185
|
+
res.scatterPacket(i, j2, R);
|
|
1186
|
+
}
|
|
1187
|
+
};
|
|
1188
|
+
|
|
1189
|
+
// GEBP micro-kernel helper processing LhsProgress rows of the lhs at a time.
//
// For every row offset i in [peelStart, peelEnd) (step LhsProgress) operator():
//  1. walks the columns [0, packet_cols4) in steps of nr, accumulating into four
//     accumulators (plus four shadow accumulators) and updating the result
//     columns j2+0 .. j2+3;
//  2. walks the remaining columns [packet_cols4, cols) one at a time with a
//     single accumulator.
//
// Template parameters as used below:
//  nr                  - column step of the multi-column loop.
//  LhsProgress         - row step of the outer loop and per-depth-step stride in blA.
//  RhsProgress         - per-entry stride in blB.
//  GEBPTraits          - provides initAcc / loadLhs / loadLhsUnaligned / loadRhs /
//                        madd / acc and the RhsPacketx4 type.
//  LinearMapper        - type returned by res.getLinearMapper(i, j).
//  DataMapper          - type of the result mapper.
//
// NOTE(review): the multi-column path addresses j2+0 .. j2+3 explicitly while
// stepping j2 by nr, so it appears to expect nr == 4 whenever packet_cols4 > 0 -
// confirm against the caller.
template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
|
|
1190
|
+
struct lhs_process_one_packet
|
|
1191
|
+
{
|
|
1192
|
+
typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
|
|
1193
|
+
|
|
1194
|
+
// One depth step (index K inside the current unrolled group) of the 1x4 kernel:
// loads one lhs packet into *A0, loads the rhs panel for step K into *rhs_panel,
// then issues four madd calls, one per accumulator, selecting the panel entry
// with fix<0> .. fix<3>. *T0 is passed to madd as its temporary argument.
// traits is taken by value.
EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
|
|
1195
|
+
{
|
|
1196
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
|
|
1197
|
+
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
|
|
1198
|
+
traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
|
|
1199
|
+
traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
|
|
1200
|
+
traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
|
|
1201
|
+
traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
|
|
1202
|
+
traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
|
|
1203
|
+
traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
|
|
1204
|
+
// Empty asm statement listing *A0 as a read-write operand (SSE register or
// memory alternative); it emits no instructions.
// NOTE(review): presumably a GCC >= 6 code-generation workaround; the reason is
// not stated here - confirm.
#if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
|
|
1205
|
+
__asm__ ("" : "+x,m" (*A0));
|
|
1206
|
+
#endif
|
|
1207
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
|
|
1208
|
+
}
|
|
1209
|
+
|
|
1210
|
+
// Runs the kernel over the rows [peelStart, peelEnd).
//
//  res                  - result mapper; columns are accessed through getLinearMapper.
//  blockA, blockB       - lhs / rhs scalar buffers.
//  alpha                - scalar broadcast with pset1 and handed to traits.acc.
//  peelStart, peelEnd   - row range, advanced by LhsProgress.
//  strideA, offsetA     - blA starts at blockA[i*strideA + offsetA*LhsProgress].
//  strideB, offsetB     - blB starts at blockB[j2*strideB + offsetB*nr] in the
//                         multi-column path and blockB[j2*strideB + offsetB] in the
//                         single-column path.
//  prefetch_res_offset  - offset passed to the result prefetch calls.
//  peeled_kc, pk        - the unrolled depth loop covers [0, peeled_kc) in steps of pk.
//  cols, depth          - number of result columns / depth steps.
//  packet_cols4         - number of leading columns handled by the multi-column path.
//
// NOTE(review): both unrolled depth loops issue exactly 8 steps per iteration
// (K = 0 .. 7) while advancing the pointers by pk steps, so pk is expected to be 8.
EIGEN_STRONG_INLINE void operator()(
|
|
1211
|
+
const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
|
|
1212
|
+
Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB,
|
|
1213
|
+
int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
|
|
1214
|
+
{
|
|
1215
|
+
GEBPTraits traits;
|
|
1216
|
+
|
|
1217
|
+
// loops on each largest micro horizontal panel of lhs
|
|
1218
|
+
// (LhsProgress x depth)
|
|
1219
|
+
for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
|
|
1220
|
+
{
|
|
1221
|
+
// loops on each largest micro vertical panel of rhs (depth * nr)
|
|
1222
|
+
for(Index j2=0; j2<packet_cols4; j2+=nr)
|
|
1223
|
+
{
|
|
1224
|
+
// We select a LhsProgress x nr micro block of res
|
|
1225
|
+
// which is entirely stored into 1 x nr registers.
|
|
1226
|
+
|
|
1227
|
+
const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
|
|
1228
|
+
prefetch(&blA[0]);
|
|
1229
|
+
|
|
1230
|
+
// gets res block as register
|
|
1231
|
+
AccPacket C0, C1, C2, C3;
|
|
1232
|
+
traits.initAcc(C0);
|
|
1233
|
+
traits.initAcc(C1);
|
|
1234
|
+
traits.initAcc(C2);
|
|
1235
|
+
traits.initAcc(C3);
|
|
1236
|
+
// To improve instruction pipelining, let's double the accumulation registers:
|
|
1237
|
+
// even k will accumulate in C*, while odd k will accumulate in D*.
|
|
1238
|
+
// This trick is crucial to get good performance with FMA, otherwise it is
|
|
1239
|
+
// actually faster to perform separated MUL+ADD because of a naturally
|
|
1240
|
+
// better instruction-level parallelism.
|
|
1241
|
+
AccPacket D0, D1, D2, D3;
|
|
1242
|
+
traits.initAcc(D0);
|
|
1243
|
+
traits.initAcc(D1);
|
|
1244
|
+
traits.initAcc(D2);
|
|
1245
|
+
traits.initAcc(D3);
|
|
1246
|
+
|
|
1247
|
+
// One linear accessor per result column of the current 4-column group.
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
|
|
1248
|
+
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
|
|
1249
|
+
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
|
|
1250
|
+
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
|
|
1251
|
+
|
|
1252
|
+
r0.prefetch(prefetch_res_offset);
|
|
1253
|
+
r1.prefetch(prefetch_res_offset);
|
|
1254
|
+
r2.prefetch(prefetch_res_offset);
|
|
1255
|
+
r3.prefetch(prefetch_res_offset);
|
|
1256
|
+
|
|
1257
|
+
// performs "inner" products
|
|
1258
|
+
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
|
|
1259
|
+
prefetch(&blB[0]);
|
|
1260
|
+
// Two lhs registers so that even and odd steps do not share one.
LhsPacket A0, A1;
|
|
1261
|
+
|
|
1262
|
+
for(Index k=0; k<peeled_kc; k+=pk)
|
|
1263
|
+
{
|
|
1264
|
+
EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
|
|
1265
|
+
RhsPacketx4 rhs_panel;
|
|
1266
|
+
RhsPacket T0;
|
|
1267
|
+
|
|
1268
|
+
// Eight unrolled steps: even steps use A0 with C0..C3, odd steps use A1 with D0..D3.
internal::prefetch(blB+(48+0));
|
|
1269
|
+
peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
|
|
1270
|
+
peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
|
|
1271
|
+
peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
|
|
1272
|
+
peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
|
|
1273
|
+
internal::prefetch(blB+(48+16));
|
|
1274
|
+
peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
|
|
1275
|
+
peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
|
|
1276
|
+
peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
|
|
1277
|
+
peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
|
|
1278
|
+
|
|
1279
|
+
// Advance both cursors past the pk depth steps of this iteration.
blB += pk*4*RhsProgress;
|
|
1280
|
+
blA += pk*LhsProgress;
|
|
1281
|
+
|
|
1282
|
+
EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
|
|
1283
|
+
}
|
|
1284
|
+
// Fold the shadow accumulators back into the primary ones.
C0 = padd(C0,D0);
|
|
1285
|
+
C1 = padd(C1,D1);
|
|
1286
|
+
C2 = padd(C2,D2);
|
|
1287
|
+
C3 = padd(C3,D3);
|
|
1288
|
+
|
|
1289
|
+
// process remaining peeled loop
|
|
1290
|
+
for(Index k=peeled_kc; k<depth; k++)
|
|
1291
|
+
{
|
|
1292
|
+
RhsPacketx4 rhs_panel;
|
|
1293
|
+
RhsPacket T0;
|
|
1294
|
+
// Single step per iteration, always accumulating into C0..C3.
peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
|
|
1295
|
+
blB += 4*RhsProgress;
|
|
1296
|
+
blA += LhsProgress;
|
|
1297
|
+
}
|
|
1298
|
+
|
|
1299
|
+
// Write-back, two result columns at a time: load, combine via traits.acc, store.
ResPacket R0, R1;
|
|
1300
|
+
ResPacket alphav = pset1<ResPacket>(alpha);
|
|
1301
|
+
|
|
1302
|
+
R0 = r0.template loadPacket<ResPacket>(0);
|
|
1303
|
+
R1 = r1.template loadPacket<ResPacket>(0);
|
|
1304
|
+
traits.acc(C0, alphav, R0);
|
|
1305
|
+
traits.acc(C1, alphav, R1);
|
|
1306
|
+
r0.storePacket(0, R0);
|
|
1307
|
+
r1.storePacket(0, R1);
|
|
1308
|
+
|
|
1309
|
+
R0 = r2.template loadPacket<ResPacket>(0);
|
|
1310
|
+
R1 = r3.template loadPacket<ResPacket>(0);
|
|
1311
|
+
traits.acc(C2, alphav, R0);
|
|
1312
|
+
traits.acc(C3, alphav, R1);
|
|
1313
|
+
r2.storePacket(0, R0);
|
|
1314
|
+
r3.storePacket(0, R1);
|
|
1315
|
+
}
|
|
1316
|
+
|
|
1317
|
+
// Deal with remaining columns of the rhs
|
|
1318
|
+
for(Index j2=packet_cols4; j2<cols; j2++)
|
|
1319
|
+
{
|
|
1320
|
+
// One column at a time
|
|
1321
|
+
const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
|
|
1322
|
+
prefetch(&blA[0]);
|
|
1323
|
+
|
|
1324
|
+
// gets res block as register
|
|
1325
|
+
AccPacket C0;
|
|
1326
|
+
traits.initAcc(C0);
|
|
1327
|
+
|
|
1328
|
+
LinearMapper r0 = res.getLinearMapper(i, j2);
|
|
1329
|
+
|
|
1330
|
+
// performs "inner" products
|
|
1331
|
+
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
|
|
1332
|
+
LhsPacket A0;
|
|
1333
|
+
|
|
1334
|
+
for(Index k= 0; k<peeled_kc; k+=pk)
|
|
1335
|
+
{
|
|
1336
|
+
EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
|
|
1337
|
+
RhsPacket B_0;
|
|
1338
|
+
|
|
1339
|
+
// One depth step of the single-column kernel: unaligned lhs load, rhs load, one
// madd into C0 (B_0 doubles as madd's temporary argument). The macro stays
// defined for the remainder loop below and is removed by the #undef after it.
// The definition already ends in ';', so each use leaves an extra empty statement.
#define EIGEN_GEBGP_ONESTEP(K) \
|
|
1340
|
+
do { \
|
|
1341
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
|
|
1342
|
+
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
|
|
1343
|
+
/* FIXME: why unaligned???? */ \
|
|
1344
|
+
traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
|
|
1345
|
+
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
|
|
1346
|
+
traits.madd(A0, B_0, C0, B_0, fix<0>); \
|
|
1347
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
|
|
1348
|
+
} while(false);
|
|
1349
|
+
|
|
1350
|
+
EIGEN_GEBGP_ONESTEP(0);
|
|
1351
|
+
EIGEN_GEBGP_ONESTEP(1);
|
|
1352
|
+
EIGEN_GEBGP_ONESTEP(2);
|
|
1353
|
+
EIGEN_GEBGP_ONESTEP(3);
|
|
1354
|
+
EIGEN_GEBGP_ONESTEP(4);
|
|
1355
|
+
EIGEN_GEBGP_ONESTEP(5);
|
|
1356
|
+
EIGEN_GEBGP_ONESTEP(6);
|
|
1357
|
+
EIGEN_GEBGP_ONESTEP(7);
|
|
1358
|
+
|
|
1359
|
+
blB += pk*RhsProgress;
|
|
1360
|
+
blA += pk*LhsProgress;
|
|
1361
|
+
|
|
1362
|
+
EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
|
|
1363
|
+
}
|
|
1364
|
+
|
|
1365
|
+
// process remaining peeled loop
|
|
1366
|
+
for(Index k=peeled_kc; k<depth; k++)
|
|
1367
|
+
{
|
|
1368
|
+
RhsPacket B_0;
|
|
1369
|
+
EIGEN_GEBGP_ONESTEP(0);
|
|
1370
|
+
blB += RhsProgress;
|
|
1371
|
+
blA += LhsProgress;
|
|
1372
|
+
}
|
|
1373
|
+
#undef EIGEN_GEBGP_ONESTEP
|
|
1374
|
+
// Write-back of the single column: load, combine via traits.acc, store.
ResPacket R0;
|
|
1375
|
+
ResPacket alphav = pset1<ResPacket>(alpha);
|
|
1376
|
+
R0 = r0.template loadPacket<ResPacket>(0);
|
|
1377
|
+
traits.acc(C0, alphav, R0);
|
|
1378
|
+
r0.storePacket(0, R0);
|
|
1379
|
+
}
|
|
1380
|
+
}
|
|
1381
|
+
}
|
|
1382
|
+
};
|
|
1383
|
+
|
|
1384
|
+
// Variant of lhs_process_one_packet with the same template arguments; it inherits
// operator() from that base and declares its own peeled_kc_onestep with a
// different parameter list (four separate rhs packets instead of an rhs panel
// plus a temporary).
//
// NOTE(review): peeled_kc_onestep is not virtual and the base is not
// parameterised on the derived type, so the inherited operator() resolves its
// peeled_kc_onestep calls to the base-class version. Nothing in this view calls
// the overload declared here - confirm whether that is intended.
template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
|
|
1385
|
+
struct lhs_process_fraction_of_packet : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
|
|
1386
|
+
{
|
|
1387
|
+
|
|
1388
|
+
// One depth step (index K) of the 1x4 kernel: unaligned load of one lhs packet
// into *A0, broadcastRhs of the step's rhs entries into *B_0 .. *B3, then one
// madd per accumulator. Each rhs packet is also passed as madd's temporary
// argument. traits is taken by value.
EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
|
|
1389
|
+
{
|
|
1390
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
|
|
1391
|
+
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
|
|
1392
|
+
traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
|
|
1393
|
+
traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
|
|
1394
|
+
traits.madd(*A0, *B_0, *C0, *B_0);
|
|
1395
|
+
traits.madd(*A0, *B1, *C1, *B1);
|
|
1396
|
+
traits.madd(*A0, *B2, *C2, *B2);
|
|
1397
|
+
traits.madd(*A0, *B3, *C3, *B3);
|
|
1398
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
|
|
1399
|
+
}
|
|
1400
|
+
};
|
|
1401
|
+
|
|
891
1402
|
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
|
|
892
1403
|
EIGEN_DONT_INLINE
|
|
893
1404
|
void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
|
|
@@ -904,10 +1415,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
904
1415
|
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
|
|
905
1416
|
const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
|
|
906
1417
|
const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
|
|
907
|
-
const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
|
|
1418
|
+
const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
|
|
1419
|
+
const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
|
|
1420
|
+
const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
|
|
908
1421
|
enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
|
|
909
1422
|
const Index peeled_kc = depth & ~(pk-1);
|
|
910
|
-
const
|
|
1423
|
+
const int prefetch_res_offset = 32/sizeof(ResScalar);
|
|
911
1424
|
// const Index depth2 = depth & ~1;
|
|
912
1425
|
|
|
913
1426
|
//---------- Process 3 * LhsProgress rows at once ----------
|
|
@@ -965,36 +1478,48 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
965
1478
|
for(Index k=0; k<peeled_kc; k+=pk)
|
|
966
1479
|
{
|
|
967
1480
|
EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
|
|
968
|
-
|
|
1481
|
+
// 15 registers are taken (12 for acc, 2 for lhs).
|
|
1482
|
+
RhsPanel15 rhs_panel;
|
|
1483
|
+
RhsPacket T0;
|
|
969
1484
|
LhsPacket A2;
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
1485
|
+
#if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
|
|
1486
|
+
// see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
|
|
1487
|
+
// without this workaround A0, A1, and A2 are loaded in the same register,
|
|
1488
|
+
// which is not good for pipelining
|
|
1489
|
+
#define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
|
|
1490
|
+
#else
|
|
1491
|
+
#define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
|
|
1492
|
+
#endif
|
|
1493
|
+
#define EIGEN_GEBP_ONESTEP(K) \
|
|
1494
|
+
do { \
|
|
1495
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
|
|
974
1496
|
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
|
|
975
|
-
internal::prefetch(blA+(3*K+16)*LhsProgress);
|
|
976
|
-
if (EIGEN_ARCH_ARM
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
traits.loadLhs(&blA[(
|
|
980
|
-
traits.
|
|
981
|
-
traits.
|
|
982
|
-
|
|
983
|
-
traits.
|
|
984
|
-
traits.
|
|
985
|
-
traits.madd(
|
|
986
|
-
traits.madd(
|
|
987
|
-
traits.
|
|
988
|
-
traits.
|
|
989
|
-
traits.madd(
|
|
990
|
-
traits.madd(
|
|
991
|
-
traits.
|
|
992
|
-
traits.
|
|
993
|
-
traits.madd(
|
|
994
|
-
traits.madd(
|
|
995
|
-
traits.
|
|
996
|
-
|
|
997
|
-
|
|
1497
|
+
internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
|
|
1498
|
+
if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
|
|
1499
|
+
internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
|
|
1500
|
+
} /* Bug 953 */ \
|
|
1501
|
+
traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
|
|
1502
|
+
traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
|
|
1503
|
+
traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
|
|
1504
|
+
EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
|
|
1505
|
+
traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \
|
|
1506
|
+
traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
|
|
1507
|
+
traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
|
|
1508
|
+
traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
|
|
1509
|
+
traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \
|
|
1510
|
+
traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
|
|
1511
|
+
traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
|
|
1512
|
+
traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
|
|
1513
|
+
traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \
|
|
1514
|
+
traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
|
|
1515
|
+
traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
|
|
1516
|
+
traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
|
|
1517
|
+
traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \
|
|
1518
|
+
traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
|
|
1519
|
+
traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
|
|
1520
|
+
traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
|
|
1521
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
|
|
1522
|
+
} while (false)
|
|
998
1523
|
|
|
999
1524
|
internal::prefetch(blB);
|
|
1000
1525
|
EIGEN_GEBP_ONESTEP(0);
|
|
@@ -1014,7 +1539,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1014
1539
|
// process remaining peeled loop
|
|
1015
1540
|
for(Index k=peeled_kc; k<depth; k++)
|
|
1016
1541
|
{
|
|
1017
|
-
|
|
1542
|
+
RhsPanel15 rhs_panel;
|
|
1543
|
+
RhsPacket T0;
|
|
1018
1544
|
LhsPacket A2;
|
|
1019
1545
|
EIGEN_GEBP_ONESTEP(0);
|
|
1020
1546
|
blB += 4*RhsProgress;
|
|
@@ -1026,9 +1552,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1026
1552
|
ResPacket R0, R1, R2;
|
|
1027
1553
|
ResPacket alphav = pset1<ResPacket>(alpha);
|
|
1028
1554
|
|
|
1029
|
-
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
|
1030
|
-
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
|
|
1031
|
-
R2 = r0.loadPacket(2 * Traits::ResPacketSize);
|
|
1555
|
+
R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1556
|
+
R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1557
|
+
R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
|
|
1032
1558
|
traits.acc(C0, alphav, R0);
|
|
1033
1559
|
traits.acc(C4, alphav, R1);
|
|
1034
1560
|
traits.acc(C8, alphav, R2);
|
|
@@ -1036,9 +1562,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1036
1562
|
r0.storePacket(1 * Traits::ResPacketSize, R1);
|
|
1037
1563
|
r0.storePacket(2 * Traits::ResPacketSize, R2);
|
|
1038
1564
|
|
|
1039
|
-
R0 = r1.loadPacket(0 * Traits::ResPacketSize);
|
|
1040
|
-
R1 = r1.loadPacket(1 * Traits::ResPacketSize);
|
|
1041
|
-
R2 = r1.loadPacket(2 * Traits::ResPacketSize);
|
|
1565
|
+
R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1566
|
+
R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1567
|
+
R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
|
|
1042
1568
|
traits.acc(C1, alphav, R0);
|
|
1043
1569
|
traits.acc(C5, alphav, R1);
|
|
1044
1570
|
traits.acc(C9, alphav, R2);
|
|
@@ -1046,9 +1572,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1046
1572
|
r1.storePacket(1 * Traits::ResPacketSize, R1);
|
|
1047
1573
|
r1.storePacket(2 * Traits::ResPacketSize, R2);
|
|
1048
1574
|
|
|
1049
|
-
R0 = r2.loadPacket(0 * Traits::ResPacketSize);
|
|
1050
|
-
R1 = r2.loadPacket(1 * Traits::ResPacketSize);
|
|
1051
|
-
R2 = r2.loadPacket(2 * Traits::ResPacketSize);
|
|
1575
|
+
R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1576
|
+
R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1577
|
+
R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
|
|
1052
1578
|
traits.acc(C2, alphav, R0);
|
|
1053
1579
|
traits.acc(C6, alphav, R1);
|
|
1054
1580
|
traits.acc(C10, alphav, R2);
|
|
@@ -1056,9 +1582,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1056
1582
|
r2.storePacket(1 * Traits::ResPacketSize, R1);
|
|
1057
1583
|
r2.storePacket(2 * Traits::ResPacketSize, R2);
|
|
1058
1584
|
|
|
1059
|
-
R0 = r3.loadPacket(0 * Traits::ResPacketSize);
|
|
1060
|
-
R1 = r3.loadPacket(1 * Traits::ResPacketSize);
|
|
1061
|
-
R2 = r3.loadPacket(2 * Traits::ResPacketSize);
|
|
1585
|
+
R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1586
|
+
R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1587
|
+
R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
|
|
1062
1588
|
traits.acc(C3, alphav, R0);
|
|
1063
1589
|
traits.acc(C7, alphav, R1);
|
|
1064
1590
|
traits.acc(C11, alphav, R2);
|
|
@@ -1094,20 +1620,20 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1094
1620
|
{
|
|
1095
1621
|
EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
|
|
1096
1622
|
RhsPacket B_0;
|
|
1097
|
-
#define EIGEN_GEBGP_ONESTEP(K)
|
|
1098
|
-
do {
|
|
1099
|
-
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1");
|
|
1623
|
+
#define EIGEN_GEBGP_ONESTEP(K) \
|
|
1624
|
+
do { \
|
|
1625
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
|
|
1100
1626
|
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
|
|
1101
|
-
traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);
|
|
1102
|
-
traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);
|
|
1103
|
-
traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);
|
|
1104
|
-
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);
|
|
1105
|
-
traits.madd(A0, B_0, C0, B_0);
|
|
1106
|
-
traits.madd(A1, B_0, C4, B_0);
|
|
1107
|
-
traits.madd(A2, B_0, C8, B_0);
|
|
1108
|
-
EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1");
|
|
1109
|
-
} while(false)
|
|
1110
|
-
|
|
1627
|
+
traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
|
|
1628
|
+
traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
|
|
1629
|
+
traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
|
|
1630
|
+
traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
|
|
1631
|
+
traits.madd(A0, B_0, C0, B_0, fix<0>); \
|
|
1632
|
+
traits.madd(A1, B_0, C4, B_0, fix<0>); \
|
|
1633
|
+
traits.madd(A2, B_0, C8, B_0, fix<0>); \
|
|
1634
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
|
|
1635
|
+
} while (false)
|
|
1636
|
+
|
|
1111
1637
|
EIGEN_GEBGP_ONESTEP(0);
|
|
1112
1638
|
EIGEN_GEBGP_ONESTEP(1);
|
|
1113
1639
|
EIGEN_GEBGP_ONESTEP(2);
|
|
@@ -1117,8 +1643,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1117
1643
|
EIGEN_GEBGP_ONESTEP(6);
|
|
1118
1644
|
EIGEN_GEBGP_ONESTEP(7);
|
|
1119
1645
|
|
|
1120
|
-
blB += pk*RhsProgress;
|
|
1121
|
-
blA += pk*3*Traits::LhsProgress;
|
|
1646
|
+
blB += int(pk) * int(RhsProgress);
|
|
1647
|
+
blA += int(pk) * 3 * int(Traits::LhsProgress);
|
|
1122
1648
|
|
|
1123
1649
|
EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
|
|
1124
1650
|
}
|
|
@@ -1135,9 +1661,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1135
1661
|
ResPacket R0, R1, R2;
|
|
1136
1662
|
ResPacket alphav = pset1<ResPacket>(alpha);
|
|
1137
1663
|
|
|
1138
|
-
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
|
1139
|
-
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
|
|
1140
|
-
R2 = r0.loadPacket(2 * Traits::ResPacketSize);
|
|
1664
|
+
R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1665
|
+
R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1666
|
+
R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
|
|
1141
1667
|
traits.acc(C0, alphav, R0);
|
|
1142
1668
|
traits.acc(C4, alphav, R1);
|
|
1143
1669
|
traits.acc(C8, alphav, R2);
|
|
@@ -1196,7 +1722,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1196
1722
|
for(Index k=0; k<peeled_kc; k+=pk)
|
|
1197
1723
|
{
|
|
1198
1724
|
EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
|
|
1199
|
-
|
|
1725
|
+
RhsPacketx4 rhs_panel;
|
|
1726
|
+
RhsPacket T0;
|
|
1200
1727
|
|
|
1201
1728
|
// NOTE: the begin/end asm comments below work around bug 935!
|
|
1202
1729
|
// but they are not enough for gcc>=6 without FMA (bug 1637)
|
|
@@ -1205,24 +1732,24 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1205
1732
|
#else
|
|
1206
1733
|
#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
|
|
1207
1734
|
#endif
|
|
1208
|
-
|
|
1209
|
-
do {
|
|
1210
|
-
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4");
|
|
1211
|
-
traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0);
|
|
1212
|
-
traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1);
|
|
1213
|
-
traits.
|
|
1214
|
-
traits.madd(A0,
|
|
1215
|
-
traits.madd(A1,
|
|
1216
|
-
traits.madd(A0,
|
|
1217
|
-
traits.madd(A1,
|
|
1218
|
-
traits.madd(A0,
|
|
1219
|
-
traits.madd(A1,
|
|
1220
|
-
traits.madd(A0,
|
|
1221
|
-
traits.madd(A1,
|
|
1222
|
-
EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
|
|
1223
|
-
EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4");
|
|
1224
|
-
} while(false)
|
|
1225
|
-
|
|
1735
|
+
#define EIGEN_GEBGP_ONESTEP(K) \
|
|
1736
|
+
do { \
|
|
1737
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
|
|
1738
|
+
traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
|
|
1739
|
+
traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
|
|
1740
|
+
traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
|
|
1741
|
+
traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
|
|
1742
|
+
traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
|
|
1743
|
+
traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
|
|
1744
|
+
traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
|
|
1745
|
+
traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
|
|
1746
|
+
traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
|
|
1747
|
+
traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
|
|
1748
|
+
traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
|
|
1749
|
+
EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
|
|
1750
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
|
|
1751
|
+
} while (false)
|
|
1752
|
+
|
|
1226
1753
|
internal::prefetch(blB+(48+0));
|
|
1227
1754
|
EIGEN_GEBGP_ONESTEP(0);
|
|
1228
1755
|
EIGEN_GEBGP_ONESTEP(1);
|
|
@@ -1242,7 +1769,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1242
1769
|
// process remaining peeled loop
|
|
1243
1770
|
for(Index k=peeled_kc; k<depth; k++)
|
|
1244
1771
|
{
|
|
1245
|
-
|
|
1772
|
+
RhsPacketx4 rhs_panel;
|
|
1773
|
+
RhsPacket T0;
|
|
1246
1774
|
EIGEN_GEBGP_ONESTEP(0);
|
|
1247
1775
|
blB += 4*RhsProgress;
|
|
1248
1776
|
blA += 2*Traits::LhsProgress;
|
|
@@ -1252,10 +1780,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1252
1780
|
ResPacket R0, R1, R2, R3;
|
|
1253
1781
|
ResPacket alphav = pset1<ResPacket>(alpha);
|
|
1254
1782
|
|
|
1255
|
-
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
|
1256
|
-
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
|
|
1257
|
-
R2 = r1.loadPacket(0 * Traits::ResPacketSize);
|
|
1258
|
-
R3 = r1.loadPacket(1 * Traits::ResPacketSize);
|
|
1783
|
+
R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1784
|
+
R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1785
|
+
R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1786
|
+
R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1259
1787
|
traits.acc(C0, alphav, R0);
|
|
1260
1788
|
traits.acc(C4, alphav, R1);
|
|
1261
1789
|
traits.acc(C1, alphav, R2);
|
|
@@ -1265,10 +1793,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1265
1793
|
r1.storePacket(0 * Traits::ResPacketSize, R2);
|
|
1266
1794
|
r1.storePacket(1 * Traits::ResPacketSize, R3);
|
|
1267
1795
|
|
|
1268
|
-
R0 = r2.loadPacket(0 * Traits::ResPacketSize);
|
|
1269
|
-
R1 = r2.loadPacket(1 * Traits::ResPacketSize);
|
|
1270
|
-
R2 = r3.loadPacket(0 * Traits::ResPacketSize);
|
|
1271
|
-
R3 = r3.loadPacket(1 * Traits::ResPacketSize);
|
|
1796
|
+
R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1797
|
+
R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1798
|
+
R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1799
|
+
R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1272
1800
|
traits.acc(C2, alphav, R0);
|
|
1273
1801
|
traits.acc(C6, alphav, R1);
|
|
1274
1802
|
traits.acc(C3, alphav, R2);
|
|
@@ -1313,8 +1841,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1313
1841
|
traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
|
|
1314
1842
|
traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
|
|
1315
1843
|
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
|
|
1316
|
-
traits.madd(A0, B_0, C0, B1);
|
|
1317
|
-
traits.madd(A1, B_0, C4, B_0);
|
|
1844
|
+
traits.madd(A0, B_0, C0, B1, fix<0>); \
|
|
1845
|
+
traits.madd(A1, B_0, C4, B_0, fix<0>); \
|
|
1318
1846
|
EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
|
|
1319
1847
|
} while(false)
|
|
1320
1848
|
|
|
@@ -1327,8 +1855,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1327
1855
|
EIGEN_GEBGP_ONESTEP(6);
|
|
1328
1856
|
EIGEN_GEBGP_ONESTEP(7);
|
|
1329
1857
|
|
|
1330
|
-
blB += pk*RhsProgress;
|
|
1331
|
-
blA += pk*2*Traits::LhsProgress;
|
|
1858
|
+
blB += int(pk) * int(RhsProgress);
|
|
1859
|
+
blA += int(pk) * 2 * int(Traits::LhsProgress);
|
|
1332
1860
|
|
|
1333
1861
|
EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
|
|
1334
1862
|
}
|
|
@@ -1345,8 +1873,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1345
1873
|
ResPacket R0, R1;
|
|
1346
1874
|
ResPacket alphav = pset1<ResPacket>(alpha);
|
|
1347
1875
|
|
|
1348
|
-
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
|
1349
|
-
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
|
|
1876
|
+
R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1877
|
+
R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1350
1878
|
traits.acc(C0, alphav, R0);
|
|
1351
1879
|
traits.acc(C4, alphav, R1);
|
|
1352
1880
|
r0.storePacket(0 * Traits::ResPacketSize, R0);
|
|
@@ -1358,186 +1886,43 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1358
1886
|
//---------- Process 1 * LhsProgress rows at once ----------
|
|
1359
1887
|
if(mr>=1*Traits::LhsProgress)
|
|
1360
1888
|
{
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
|
|
1375
|
-
traits.initAcc(C0);
|
|
1376
|
-
traits.initAcc(C1);
|
|
1377
|
-
traits.initAcc(C2);
|
|
1378
|
-
traits.initAcc(C3);
|
|
1379
|
-
|
|
1380
|
-
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
|
|
1381
|
-
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
|
|
1382
|
-
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
|
|
1383
|
-
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
|
|
1384
|
-
|
|
1385
|
-
r0.prefetch(prefetch_res_offset);
|
|
1386
|
-
r1.prefetch(prefetch_res_offset);
|
|
1387
|
-
r2.prefetch(prefetch_res_offset);
|
|
1388
|
-
r3.prefetch(prefetch_res_offset);
|
|
1389
|
-
|
|
1390
|
-
// performs "inner" products
|
|
1391
|
-
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
|
|
1392
|
-
prefetch(&blB[0]);
|
|
1393
|
-
LhsPacket A0;
|
|
1394
|
-
|
|
1395
|
-
for(Index k=0; k<peeled_kc; k+=pk)
|
|
1396
|
-
{
|
|
1397
|
-
EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
|
|
1398
|
-
RhsPacket B_0, B1, B2, B3;
|
|
1399
|
-
|
|
1400
|
-
#define EIGEN_GEBGP_ONESTEP(K) \
|
|
1401
|
-
do { \
|
|
1402
|
-
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
|
|
1403
|
-
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
|
|
1404
|
-
traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
|
|
1405
|
-
traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
|
|
1406
|
-
traits.madd(A0, B_0, C0, B_0); \
|
|
1407
|
-
traits.madd(A0, B1, C1, B1); \
|
|
1408
|
-
traits.madd(A0, B2, C2, B2); \
|
|
1409
|
-
traits.madd(A0, B3, C3, B3); \
|
|
1410
|
-
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
|
|
1411
|
-
} while(false)
|
|
1412
|
-
|
|
1413
|
-
internal::prefetch(blB+(48+0));
|
|
1414
|
-
EIGEN_GEBGP_ONESTEP(0);
|
|
1415
|
-
EIGEN_GEBGP_ONESTEP(1);
|
|
1416
|
-
EIGEN_GEBGP_ONESTEP(2);
|
|
1417
|
-
EIGEN_GEBGP_ONESTEP(3);
|
|
1418
|
-
internal::prefetch(blB+(48+16));
|
|
1419
|
-
EIGEN_GEBGP_ONESTEP(4);
|
|
1420
|
-
EIGEN_GEBGP_ONESTEP(5);
|
|
1421
|
-
EIGEN_GEBGP_ONESTEP(6);
|
|
1422
|
-
EIGEN_GEBGP_ONESTEP(7);
|
|
1423
|
-
|
|
1424
|
-
blB += pk*4*RhsProgress;
|
|
1425
|
-
blA += pk*1*LhsProgress;
|
|
1426
|
-
|
|
1427
|
-
EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
|
|
1428
|
-
}
|
|
1429
|
-
// process remaining peeled loop
|
|
1430
|
-
for(Index k=peeled_kc; k<depth; k++)
|
|
1431
|
-
{
|
|
1432
|
-
RhsPacket B_0, B1, B2, B3;
|
|
1433
|
-
EIGEN_GEBGP_ONESTEP(0);
|
|
1434
|
-
blB += 4*RhsProgress;
|
|
1435
|
-
blA += 1*LhsProgress;
|
|
1436
|
-
}
|
|
1437
|
-
#undef EIGEN_GEBGP_ONESTEP
|
|
1438
|
-
|
|
1439
|
-
ResPacket R0, R1;
|
|
1440
|
-
ResPacket alphav = pset1<ResPacket>(alpha);
|
|
1441
|
-
|
|
1442
|
-
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
|
1443
|
-
R1 = r1.loadPacket(0 * Traits::ResPacketSize);
|
|
1444
|
-
traits.acc(C0, alphav, R0);
|
|
1445
|
-
traits.acc(C1, alphav, R1);
|
|
1446
|
-
r0.storePacket(0 * Traits::ResPacketSize, R0);
|
|
1447
|
-
r1.storePacket(0 * Traits::ResPacketSize, R1);
|
|
1448
|
-
|
|
1449
|
-
R0 = r2.loadPacket(0 * Traits::ResPacketSize);
|
|
1450
|
-
R1 = r3.loadPacket(0 * Traits::ResPacketSize);
|
|
1451
|
-
traits.acc(C2, alphav, R0);
|
|
1452
|
-
traits.acc(C3, alphav, R1);
|
|
1453
|
-
r2.storePacket(0 * Traits::ResPacketSize, R0);
|
|
1454
|
-
r3.storePacket(0 * Traits::ResPacketSize, R1);
|
|
1455
|
-
}
|
|
1456
|
-
|
|
1457
|
-
// Deal with remaining columns of the rhs
|
|
1458
|
-
for(Index j2=packet_cols4; j2<cols; j2++)
|
|
1459
|
-
{
|
|
1460
|
-
// One column at a time
|
|
1461
|
-
const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
|
|
1462
|
-
prefetch(&blA[0]);
|
|
1463
|
-
|
|
1464
|
-
// gets res block as register
|
|
1465
|
-
AccPacket C0;
|
|
1466
|
-
traits.initAcc(C0);
|
|
1467
|
-
|
|
1468
|
-
LinearMapper r0 = res.getLinearMapper(i, j2);
|
|
1469
|
-
|
|
1470
|
-
// performs "inner" products
|
|
1471
|
-
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
|
|
1472
|
-
LhsPacket A0;
|
|
1473
|
-
|
|
1474
|
-
for(Index k=0; k<peeled_kc; k+=pk)
|
|
1475
|
-
{
|
|
1476
|
-
EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
|
|
1477
|
-
RhsPacket B_0;
|
|
1478
|
-
|
|
1479
|
-
#define EIGEN_GEBGP_ONESTEP(K) \
|
|
1480
|
-
do { \
|
|
1481
|
-
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
|
|
1482
|
-
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
|
|
1483
|
-
traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
|
|
1484
|
-
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
|
|
1485
|
-
traits.madd(A0, B_0, C0, B_0); \
|
|
1486
|
-
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
|
|
1487
|
-
} while(false);
|
|
1488
|
-
|
|
1489
|
-
EIGEN_GEBGP_ONESTEP(0);
|
|
1490
|
-
EIGEN_GEBGP_ONESTEP(1);
|
|
1491
|
-
EIGEN_GEBGP_ONESTEP(2);
|
|
1492
|
-
EIGEN_GEBGP_ONESTEP(3);
|
|
1493
|
-
EIGEN_GEBGP_ONESTEP(4);
|
|
1494
|
-
EIGEN_GEBGP_ONESTEP(5);
|
|
1495
|
-
EIGEN_GEBGP_ONESTEP(6);
|
|
1496
|
-
EIGEN_GEBGP_ONESTEP(7);
|
|
1497
|
-
|
|
1498
|
-
blB += pk*RhsProgress;
|
|
1499
|
-
blA += pk*1*Traits::LhsProgress;
|
|
1500
|
-
|
|
1501
|
-
EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
|
|
1502
|
-
}
|
|
1503
|
-
|
|
1504
|
-
// process remaining peeled loop
|
|
1505
|
-
for(Index k=peeled_kc; k<depth; k++)
|
|
1506
|
-
{
|
|
1507
|
-
RhsPacket B_0;
|
|
1508
|
-
EIGEN_GEBGP_ONESTEP(0);
|
|
1509
|
-
blB += RhsProgress;
|
|
1510
|
-
blA += 1*Traits::LhsProgress;
|
|
1511
|
-
}
|
|
1512
|
-
#undef EIGEN_GEBGP_ONESTEP
|
|
1513
|
-
ResPacket R0;
|
|
1514
|
-
ResPacket alphav = pset1<ResPacket>(alpha);
|
|
1515
|
-
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
|
|
1516
|
-
traits.acc(C0, alphav, R0);
|
|
1517
|
-
r0.storePacket(0 * Traits::ResPacketSize, R0);
|
|
1518
|
-
}
|
|
1519
|
-
}
|
|
1889
|
+
lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
|
|
1890
|
+
p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
|
|
1891
|
+
}
|
|
1892
|
+
//---------- Process LhsProgressHalf rows at once ----------
|
|
1893
|
+
if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
|
|
1894
|
+
{
|
|
1895
|
+
lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
|
|
1896
|
+
p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
|
|
1897
|
+
}
|
|
1898
|
+
//---------- Process LhsProgressQuarter rows at once ----------
|
|
1899
|
+
if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
|
|
1900
|
+
{
|
|
1901
|
+
lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
|
|
1902
|
+
p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
|
|
1520
1903
|
}
|
|
1521
1904
|
//---------- Process remaining rows, 1 at once ----------
|
|
1522
|
-
if(
|
|
1905
|
+
if(peeled_mc_quarter<rows)
|
|
1523
1906
|
{
|
|
1524
1907
|
// loop on each panel of the rhs
|
|
1525
1908
|
for(Index j2=0; j2<packet_cols4; j2+=nr)
|
|
1526
1909
|
{
|
|
1527
1910
|
// loop on each row of the lhs (1*LhsProgress x depth)
|
|
1528
|
-
for(Index i=
|
|
1911
|
+
for(Index i=peeled_mc_quarter; i<rows; i+=1)
|
|
1529
1912
|
{
|
|
1530
1913
|
const LhsScalar* blA = &blockA[i*strideA+offsetA];
|
|
1531
1914
|
prefetch(&blA[0]);
|
|
1532
1915
|
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
|
|
1533
1916
|
|
|
1534
|
-
//
|
|
1535
|
-
//
|
|
1536
|
-
//
|
|
1917
|
+
// If LhsProgress is 8 or 16, it assumes that there is a
|
|
1918
|
+
// half or quarter packet, respectively, of the same size as
|
|
1919
|
+
// nr (which is currently 4) for the return type.
|
|
1537
1920
|
const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
|
|
1921
|
+
const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
|
|
1538
1922
|
if ((SwappedTraits::LhsProgress % 4) == 0 &&
|
|
1539
|
-
(SwappedTraits::LhsProgress
|
|
1540
|
-
(SwappedTraits::LhsProgress!=8
|
|
1923
|
+
(SwappedTraits::LhsProgress<=16) &&
|
|
1924
|
+
(SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) &&
|
|
1925
|
+
(SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
|
|
1541
1926
|
{
|
|
1542
1927
|
SAccPacket C0, C1, C2, C3;
|
|
1543
1928
|
straits.initAcc(C0);
|
|
@@ -1560,15 +1945,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1560
1945
|
|
|
1561
1946
|
straits.loadRhsQuad(blA+0*spk, B_0);
|
|
1562
1947
|
straits.loadRhsQuad(blA+1*spk, B_1);
|
|
1563
|
-
straits.madd(A0,B_0,C0,B_0);
|
|
1564
|
-
straits.madd(A1,B_1,C1,B_1);
|
|
1948
|
+
straits.madd(A0,B_0,C0,B_0, fix<0>);
|
|
1949
|
+
straits.madd(A1,B_1,C1,B_1, fix<0>);
|
|
1565
1950
|
|
|
1566
1951
|
straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
|
|
1567
1952
|
straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
|
|
1568
1953
|
straits.loadRhsQuad(blA+2*spk, B_0);
|
|
1569
1954
|
straits.loadRhsQuad(blA+3*spk, B_1);
|
|
1570
|
-
straits.madd(A0,B_0,C2,B_0);
|
|
1571
|
-
straits.madd(A1,B_1,C3,B_1);
|
|
1955
|
+
straits.madd(A0,B_0,C2,B_0, fix<0>);
|
|
1956
|
+
straits.madd(A1,B_1,C3,B_1, fix<0>);
|
|
1572
1957
|
|
|
1573
1958
|
blB += 4*SwappedTraits::LhsProgress;
|
|
1574
1959
|
blA += 4*spk;
|
|
@@ -1581,7 +1966,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1581
1966
|
|
|
1582
1967
|
straits.loadLhsUnaligned(blB, A0);
|
|
1583
1968
|
straits.loadRhsQuad(blA, B_0);
|
|
1584
|
-
straits.madd(A0,B_0,C0,B_0);
|
|
1969
|
+
straits.madd(A0,B_0,C0,B_0, fix<0>);
|
|
1585
1970
|
|
|
1586
1971
|
blB += SwappedTraits::LhsProgress;
|
|
1587
1972
|
blA += spk;
|
|
@@ -1591,7 +1976,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1591
1976
|
// Special case where we have to first reduce the accumulation register C0
|
|
1592
1977
|
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
|
|
1593
1978
|
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
|
|
1594
|
-
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<
|
|
1979
|
+
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
|
|
1595
1980
|
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
|
|
1596
1981
|
|
|
1597
1982
|
SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
|
|
@@ -1604,16 +1989,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1604
1989
|
SRhsPacketHalf b0;
|
|
1605
1990
|
straits.loadLhsUnaligned(blB, a0);
|
|
1606
1991
|
straits.loadRhs(blA, b0);
|
|
1607
|
-
SAccPacketHalf c0 =
|
|
1608
|
-
straits.madd(a0,b0,c0,b0);
|
|
1992
|
+
SAccPacketHalf c0 = predux_half_dowto4(C0);
|
|
1993
|
+
straits.madd(a0,b0,c0,b0, fix<0>);
|
|
1609
1994
|
straits.acc(c0, alphav, R);
|
|
1610
1995
|
}
|
|
1611
1996
|
else
|
|
1612
1997
|
{
|
|
1613
|
-
straits.acc(
|
|
1998
|
+
straits.acc(predux_half_dowto4(C0), alphav, R);
|
|
1614
1999
|
}
|
|
1615
2000
|
res.scatterPacket(i, j2, R);
|
|
1616
2001
|
}
|
|
2002
|
+
else if (SwappedTraits::LhsProgress==16)
|
|
2003
|
+
{
|
|
2004
|
+
// Special case where we have to first reduce the
|
|
2005
|
+
// accumulation register C0. We specialize the block in
|
|
2006
|
+
// template form, so that LhsProgress < 16 paths don't
|
|
2007
|
+
// fail to compile
|
|
2008
|
+
last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
|
|
2009
|
+
p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0);
|
|
2010
|
+
}
|
|
1617
2011
|
else
|
|
1618
2012
|
{
|
|
1619
2013
|
SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
|
|
@@ -1636,14 +2030,14 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1636
2030
|
|
|
1637
2031
|
B_0 = blB[0];
|
|
1638
2032
|
B_1 = blB[1];
|
|
1639
|
-
|
|
1640
|
-
|
|
1641
|
-
|
|
2033
|
+
C0 = cj.pmadd(A0,B_0,C0);
|
|
2034
|
+
C1 = cj.pmadd(A0,B_1,C1);
|
|
2035
|
+
|
|
1642
2036
|
B_0 = blB[2];
|
|
1643
2037
|
B_1 = blB[3];
|
|
1644
|
-
|
|
1645
|
-
|
|
1646
|
-
|
|
2038
|
+
C2 = cj.pmadd(A0,B_0,C2);
|
|
2039
|
+
C3 = cj.pmadd(A0,B_1,C3);
|
|
2040
|
+
|
|
1647
2041
|
blB += 4;
|
|
1648
2042
|
}
|
|
1649
2043
|
res(i, j2 + 0) += alpha * C0;
|
|
@@ -1657,7 +2051,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1657
2051
|
for(Index j2=packet_cols4; j2<cols; j2++)
|
|
1658
2052
|
{
|
|
1659
2053
|
// loop on each row of the lhs (1*LhsProgress x depth)
|
|
1660
|
-
for(Index i=
|
|
2054
|
+
for(Index i=peeled_mc_quarter; i<rows; i+=1)
|
|
1661
2055
|
{
|
|
1662
2056
|
const LhsScalar* blA = &blockA[i*strideA+offsetA];
|
|
1663
2057
|
prefetch(&blA[0]);
|
|
@@ -1668,7 +2062,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1668
2062
|
{
|
|
1669
2063
|
LhsScalar A0 = blA[k];
|
|
1670
2064
|
RhsScalar B_0 = blB[k];
|
|
1671
|
-
|
|
2065
|
+
C0 = cj.pmadd(A0, B_0, C0);
|
|
1672
2066
|
}
|
|
1673
2067
|
res(i, j2) += alpha * C0;
|
|
1674
2068
|
}
|
|
@@ -1677,8 +2071,6 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1677
2071
|
}
|
|
1678
2072
|
|
|
1679
2073
|
|
|
1680
|
-
#undef CJMADD
|
|
1681
|
-
|
|
1682
2074
|
// pack a block of the lhs
|
|
1683
2075
|
// The traversal is as follow (mr==4):
|
|
1684
2076
|
// 0 4 8 12 ...
|
|
@@ -1693,19 +2085,24 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1693
2085
|
//
|
|
1694
2086
|
// 32 33 34 35 ...
|
|
1695
2087
|
// 36 36 38 39 ...
|
|
1696
|
-
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
|
|
1697
|
-
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
|
|
2088
|
+
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
|
|
2089
|
+
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
|
|
1698
2090
|
{
|
|
1699
2091
|
typedef typename DataMapper::LinearMapper LinearMapper;
|
|
1700
2092
|
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
|
|
1701
2093
|
};
|
|
1702
2094
|
|
|
1703
|
-
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
|
|
1704
|
-
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
|
|
2095
|
+
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
|
|
2096
|
+
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
|
|
1705
2097
|
::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
|
|
1706
2098
|
{
|
|
1707
|
-
typedef typename
|
|
1708
|
-
|
|
2099
|
+
typedef typename unpacket_traits<Packet>::half HalfPacket;
|
|
2100
|
+
typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
|
|
2101
|
+
enum { PacketSize = unpacket_traits<Packet>::size,
|
|
2102
|
+
HalfPacketSize = unpacket_traits<HalfPacket>::size,
|
|
2103
|
+
QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
|
|
2104
|
+
HasHalf = (int)HalfPacketSize < (int)PacketSize,
|
|
2105
|
+
HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
|
|
1709
2106
|
|
|
1710
2107
|
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
|
|
1711
2108
|
EIGEN_UNUSED_VARIABLE(stride);
|
|
@@ -1717,9 +2114,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
|
|
|
1717
2114
|
|
|
1718
2115
|
const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
|
|
1719
2116
|
const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
|
|
1720
|
-
const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
|
|
1721
|
-
const Index
|
|
1722
|
-
|
|
2117
|
+
const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
|
|
2118
|
+
const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
|
|
2119
|
+
const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
|
|
2120
|
+
const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
|
|
2121
|
+
const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
|
|
2122
|
+
: Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;
|
|
1723
2123
|
|
|
1724
2124
|
Index i=0;
|
|
1725
2125
|
|
|
@@ -1733,9 +2133,9 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
|
|
|
1733
2133
|
for(Index k=0; k<depth; k++)
|
|
1734
2134
|
{
|
|
1735
2135
|
Packet A, B, C;
|
|
1736
|
-
A = lhs.loadPacket(i+0*PacketSize, k);
|
|
1737
|
-
B = lhs.loadPacket(i+1*PacketSize, k);
|
|
1738
|
-
C = lhs.loadPacket(i+2*PacketSize, k);
|
|
2136
|
+
A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
|
|
2137
|
+
B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
|
|
2138
|
+
C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
|
|
1739
2139
|
pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
|
|
1740
2140
|
pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
|
|
1741
2141
|
pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
|
|
@@ -1753,8 +2153,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
|
|
|
1753
2153
|
for(Index k=0; k<depth; k++)
|
|
1754
2154
|
{
|
|
1755
2155
|
Packet A, B;
|
|
1756
|
-
A = lhs.loadPacket(i+0*PacketSize, k);
|
|
1757
|
-
B = lhs.loadPacket(i+1*PacketSize, k);
|
|
2156
|
+
A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
|
|
2157
|
+
B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
|
|
1758
2158
|
pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
|
|
1759
2159
|
pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
|
|
1760
2160
|
}
|
|
@@ -1771,27 +2171,67 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
|
|
|
1771
2171
|
for(Index k=0; k<depth; k++)
|
|
1772
2172
|
{
|
|
1773
2173
|
Packet A;
|
|
1774
|
-
A = lhs.loadPacket(i+0*PacketSize, k);
|
|
2174
|
+
A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
|
|
1775
2175
|
pstore(blockA+count, cj.pconj(A));
|
|
1776
2176
|
count+=PacketSize;
|
|
1777
2177
|
}
|
|
1778
2178
|
if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
|
|
1779
2179
|
}
|
|
1780
2180
|
}
|
|
1781
|
-
// Pack
|
|
2181
|
+
// Pack half packets
|
|
2182
|
+
if(HasHalf && Pack1>=HalfPacketSize)
|
|
2183
|
+
{
|
|
2184
|
+
for(; i<peeled_mc_half; i+=HalfPacketSize)
|
|
2185
|
+
{
|
|
2186
|
+
if(PanelMode) count += (HalfPacketSize) * offset;
|
|
2187
|
+
|
|
2188
|
+
for(Index k=0; k<depth; k++)
|
|
2189
|
+
{
|
|
2190
|
+
HalfPacket A;
|
|
2191
|
+
A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
|
|
2192
|
+
pstoreu(blockA+count, cj.pconj(A));
|
|
2193
|
+
count+=HalfPacketSize;
|
|
2194
|
+
}
|
|
2195
|
+
if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
|
|
2196
|
+
}
|
|
2197
|
+
}
|
|
2198
|
+
// Pack quarter packets
|
|
2199
|
+
if(HasQuarter && Pack1>=QuarterPacketSize)
|
|
2200
|
+
{
|
|
2201
|
+
for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
|
|
2202
|
+
{
|
|
2203
|
+
if(PanelMode) count += (QuarterPacketSize) * offset;
|
|
2204
|
+
|
|
2205
|
+
for(Index k=0; k<depth; k++)
|
|
2206
|
+
{
|
|
2207
|
+
QuarterPacket A;
|
|
2208
|
+
A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
|
|
2209
|
+
pstoreu(blockA+count, cj.pconj(A));
|
|
2210
|
+
count+=QuarterPacketSize;
|
|
2211
|
+
}
|
|
2212
|
+
if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
|
|
2213
|
+
}
|
|
2214
|
+
}
|
|
2215
|
+
// Pack2 may be *smaller* than PacketSize—that happens for
|
|
2216
|
+
// products like real * complex, where we have to go half the
|
|
2217
|
+
// progress on the lhs in order to duplicate those operands to
|
|
2218
|
+
// address both real & imaginary parts on the rhs. This portion will
|
|
2219
|
+
// pack those half ones until they match the number expected on the
|
|
2220
|
+
// last peeling loop at this point (for the rhs).
|
|
1782
2221
|
if(Pack2<PacketSize && Pack2>1)
|
|
1783
2222
|
{
|
|
1784
|
-
for(; i<peeled_mc0; i+=
|
|
2223
|
+
for(; i<peeled_mc0; i+=last_lhs_progress)
|
|
1785
2224
|
{
|
|
1786
|
-
if(PanelMode) count +=
|
|
2225
|
+
if(PanelMode) count += last_lhs_progress * offset;
|
|
1787
2226
|
|
|
1788
2227
|
for(Index k=0; k<depth; k++)
|
|
1789
|
-
for(Index w=0; w<
|
|
2228
|
+
for(Index w=0; w<last_lhs_progress; w++)
|
|
1790
2229
|
blockA[count++] = cj(lhs(i+w, k));
|
|
1791
2230
|
|
|
1792
|
-
if(PanelMode) count +=
|
|
2231
|
+
if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
|
|
1793
2232
|
}
|
|
1794
2233
|
}
|
|
2234
|
+
// Pack scalars
|
|
1795
2235
|
for(; i<rows; i++)
|
|
1796
2236
|
{
|
|
1797
2237
|
if(PanelMode) count += offset;
|
|
@@ -1801,19 +2241,24 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
|
|
|
1801
2241
|
}
|
|
1802
2242
|
}
|
|
1803
2243
|
|
|
1804
|
-
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
|
|
1805
|
-
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
|
|
2244
|
+
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
|
|
2245
|
+
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
|
|
1806
2246
|
{
|
|
1807
2247
|
typedef typename DataMapper::LinearMapper LinearMapper;
|
|
1808
2248
|
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
|
|
1809
2249
|
};
|
|
1810
2250
|
|
|
1811
|
-
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
|
|
1812
|
-
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
|
|
2251
|
+
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
|
|
2252
|
+
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
|
|
1813
2253
|
::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
|
|
1814
2254
|
{
|
|
1815
|
-
typedef typename
|
|
1816
|
-
|
|
2255
|
+
typedef typename unpacket_traits<Packet>::half HalfPacket;
|
|
2256
|
+
typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
|
|
2257
|
+
enum { PacketSize = unpacket_traits<Packet>::size,
|
|
2258
|
+
HalfPacketSize = unpacket_traits<HalfPacket>::size,
|
|
2259
|
+
QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
|
|
2260
|
+
HasHalf = (int)HalfPacketSize < (int)PacketSize,
|
|
2261
|
+
HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
|
|
1817
2262
|
|
|
1818
2263
|
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
|
|
1819
2264
|
EIGEN_UNUSED_VARIABLE(stride);
|
|
@@ -1821,37 +2266,51 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Ro
|
|
|
1821
2266
|
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
|
|
1822
2267
|
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
|
|
1823
2268
|
Index count = 0;
|
|
2269
|
+
bool gone_half = false, gone_quarter = false, gone_last = false;
|
|
1824
2270
|
|
|
1825
|
-
// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
|
|
1826
|
-
// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
|
|
1827
|
-
// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
|
|
1828
|
-
|
|
1829
|
-
int pack = Pack1;
|
|
1830
2271
|
Index i = 0;
|
|
2272
|
+
int pack = Pack1;
|
|
2273
|
+
int psize = PacketSize;
|
|
1831
2274
|
while(pack>0)
|
|
1832
2275
|
{
|
|
1833
2276
|
Index remaining_rows = rows-i;
|
|
1834
|
-
Index peeled_mc = i+(remaining_rows/pack)*pack;
|
|
2277
|
+
Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
|
|
2278
|
+
Index starting_pos = i;
|
|
1835
2279
|
for(; i<peeled_mc; i+=pack)
|
|
1836
2280
|
{
|
|
1837
2281
|
if(PanelMode) count += pack * offset;
|
|
1838
2282
|
|
|
1839
|
-
const Index peeled_k = (depth/PacketSize)*PacketSize;
|
|
1840
2283
|
Index k=0;
|
|
1841
|
-
if(pack>=
|
|
2284
|
+
if(pack>=psize && psize >= QuarterPacketSize)
|
|
1842
2285
|
{
|
|
1843
|
-
|
|
2286
|
+
const Index peeled_k = (depth/psize)*psize;
|
|
2287
|
+
for(; k<peeled_k; k+=psize)
|
|
1844
2288
|
{
|
|
1845
|
-
for (Index m = 0; m < pack; m +=
|
|
2289
|
+
for (Index m = 0; m < pack; m += psize)
|
|
1846
2290
|
{
|
|
1847
|
-
|
|
1848
|
-
|
|
1849
|
-
|
|
1850
|
-
|
|
2291
|
+
if (psize == PacketSize) {
|
|
2292
|
+
PacketBlock<Packet> kernel;
|
|
2293
|
+
for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
|
|
2294
|
+
ptranspose(kernel);
|
|
2295
|
+
for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
|
|
2296
|
+
} else if (HasHalf && psize == HalfPacketSize) {
|
|
2297
|
+
gone_half = true;
|
|
2298
|
+
PacketBlock<HalfPacket> kernel_half;
|
|
2299
|
+
for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
|
|
2300
|
+
ptranspose(kernel_half);
|
|
2301
|
+
for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
|
|
2302
|
+
} else if (HasQuarter && psize == QuarterPacketSize) {
|
|
2303
|
+
gone_quarter = true;
|
|
2304
|
+
PacketBlock<QuarterPacket> kernel_quarter;
|
|
2305
|
+
for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
|
|
2306
|
+
ptranspose(kernel_quarter);
|
|
2307
|
+
for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
|
|
2308
|
+
}
|
|
1851
2309
|
}
|
|
1852
|
-
count +=
|
|
2310
|
+
count += psize*pack;
|
|
1853
2311
|
}
|
|
1854
2312
|
}
|
|
2313
|
+
|
|
1855
2314
|
for(; k<depth; k++)
|
|
1856
2315
|
{
|
|
1857
2316
|
Index w=0;
|
|
@@ -1874,9 +2333,28 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Ro
|
|
|
1874
2333
|
if(PanelMode) count += pack * (stride-offset-depth);
|
|
1875
2334
|
}
|
|
1876
2335
|
|
|
1877
|
-
pack -=
|
|
1878
|
-
|
|
1879
|
-
|
|
2336
|
+
pack -= psize;
|
|
2337
|
+
Index left = rows - i;
|
|
2338
|
+
if (pack <= 0) {
|
|
2339
|
+
if (!gone_last &&
|
|
2340
|
+
(starting_pos == i || left >= psize/2 || left >= psize/4) &&
|
|
2341
|
+
((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
|
|
2342
|
+
(psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
|
|
2343
|
+
psize /= 2;
|
|
2344
|
+
pack = psize;
|
|
2345
|
+
continue;
|
|
2346
|
+
}
|
|
2347
|
+
// Pack2 may be *smaller* than PacketSize—that happens for
|
|
2348
|
+
// products like real * complex, where we have to go half the
|
|
2349
|
+
// progress on the lhs in order to duplicate those operands to
|
|
2350
|
+
// address both real & imaginary parts on the rhs. This portion will
|
|
2351
|
+
// pack those half ones until they match the number expected on the
|
|
2352
|
+
// last peeling loop at this point (for the rhs).
|
|
2353
|
+
if (Pack2 < PacketSize && !gone_last) {
|
|
2354
|
+
gone_last = true;
|
|
2355
|
+
psize = pack = left & ~1;
|
|
2356
|
+
}
|
|
2357
|
+
}
|
|
1880
2358
|
}
|
|
1881
2359
|
|
|
1882
2360
|
for(; i<rows; i++)
|
|
@@ -1932,7 +2410,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
|
|
|
1932
2410
|
// const Scalar* b6 = &rhs[(j2+6)*rhsStride];
|
|
1933
2411
|
// const Scalar* b7 = &rhs[(j2+7)*rhsStride];
|
|
1934
2412
|
// Index k=0;
|
|
1935
|
-
// if(PacketSize==8) // TODO
|
|
2413
|
+
// if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
|
|
1936
2414
|
// {
|
|
1937
2415
|
// for(; k<peeled_k; k+=PacketSize) {
|
|
1938
2416
|
// PacketBlock<Packet> kernel;
|
|
@@ -1979,10 +2457,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
|
|
|
1979
2457
|
{
|
|
1980
2458
|
for(; k<peeled_k; k+=PacketSize) {
|
|
1981
2459
|
PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
|
|
1982
|
-
kernel.packet[0] = dm0.loadPacket(k);
|
|
1983
|
-
kernel.packet[1%PacketSize] = dm1.loadPacket(k);
|
|
1984
|
-
kernel.packet[2%PacketSize] = dm2.loadPacket(k);
|
|
1985
|
-
kernel.packet[3%PacketSize] = dm3.loadPacket(k);
|
|
2460
|
+
kernel.packet[0 ] = dm0.template loadPacket<Packet>(k);
|
|
2461
|
+
kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
|
|
2462
|
+
kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
|
|
2463
|
+
kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
|
|
1986
2464
|
ptranspose(kernel);
|
|
1987
2465
|
pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
|
|
1988
2466
|
pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
|
|
@@ -2023,94 +2501,104 @@ template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conj
|
|
|
2023
2501
|
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
|
|
2024
2502
|
{
|
|
2025
2503
|
typedef typename packet_traits<Scalar>::type Packet;
|
|
2504
|
+
typedef typename unpacket_traits<Packet>::half HalfPacket;
|
|
2505
|
+
typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
|
|
2026
2506
|
typedef typename DataMapper::LinearMapper LinearMapper;
|
|
2027
|
-
enum { PacketSize = packet_traits<Scalar>::size
|
|
2028
|
-
|
|
2029
|
-
};
|
|
2030
|
-
|
|
2031
|
-
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
|
|
2032
|
-
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
|
|
2033
|
-
::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
|
|
2034
|
-
{
|
|
2035
|
-
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
|
|
2036
|
-
EIGEN_UNUSED_VARIABLE(stride);
|
|
2037
|
-
EIGEN_UNUSED_VARIABLE(offset);
|
|
2038
|
-
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
|
|
2039
|
-
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
|
|
2040
|
-
Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
|
|
2041
|
-
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
|
|
2042
|
-
Index count = 0;
|
|
2043
|
-
|
|
2044
|
-
// if(nr>=8)
|
|
2045
|
-
// {
|
|
2046
|
-
// for(Index j2=0; j2<packet_cols8; j2+=8)
|
|
2047
|
-
// {
|
|
2048
|
-
// // skip what we have before
|
|
2049
|
-
// if(PanelMode) count += 8 * offset;
|
|
2050
|
-
// for(Index k=0; k<depth; k++)
|
|
2051
|
-
// {
|
|
2052
|
-
// if (PacketSize==8) {
|
|
2053
|
-
// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
|
|
2054
|
-
// pstoreu(blockB+count, cj.pconj(A));
|
|
2055
|
-
// } else if (PacketSize==4) {
|
|
2056
|
-
// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
|
|
2057
|
-
// Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
|
|
2058
|
-
// pstoreu(blockB+count, cj.pconj(A));
|
|
2059
|
-
// pstoreu(blockB+count+PacketSize, cj.pconj(B));
|
|
2060
|
-
// } else {
|
|
2061
|
-
// const Scalar* b0 = &rhs[k*rhsStride + j2];
|
|
2062
|
-
// blockB[count+0] = cj(b0[0]);
|
|
2063
|
-
// blockB[count+1] = cj(b0[1]);
|
|
2064
|
-
// blockB[count+2] = cj(b0[2]);
|
|
2065
|
-
// blockB[count+3] = cj(b0[3]);
|
|
2066
|
-
// blockB[count+4] = cj(b0[4]);
|
|
2067
|
-
// blockB[count+5] = cj(b0[5]);
|
|
2068
|
-
// blockB[count+6] = cj(b0[6]);
|
|
2069
|
-
// blockB[count+7] = cj(b0[7]);
|
|
2070
|
-
// }
|
|
2071
|
-
// count += 8;
|
|
2072
|
-
// }
|
|
2073
|
-
// // skip what we have after
|
|
2074
|
-
// if(PanelMode) count += 8 * (stride-offset-depth);
|
|
2075
|
-
// }
|
|
2076
|
-
// }
|
|
2077
|
-
if(nr>=4)
|
|
2507
|
+
enum { PacketSize = packet_traits<Scalar>::size,
|
|
2508
|
+
HalfPacketSize = unpacket_traits<HalfPacket>::size,
|
|
2509
|
+
QuarterPacketSize = unpacket_traits<QuarterPacket>::size};
|
|
2510
|
+
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0)
|
|
2078
2511
|
{
|
|
2079
|
-
|
|
2512
|
+
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
|
|
2513
|
+
EIGEN_UNUSED_VARIABLE(stride);
|
|
2514
|
+
EIGEN_UNUSED_VARIABLE(offset);
|
|
2515
|
+
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
|
|
2516
|
+
const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
|
|
2517
|
+
const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
|
|
2518
|
+
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
|
|
2519
|
+
Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
|
|
2520
|
+
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
|
|
2521
|
+
Index count = 0;
|
|
2522
|
+
|
|
2523
|
+
// if(nr>=8)
|
|
2524
|
+
// {
|
|
2525
|
+
// for(Index j2=0; j2<packet_cols8; j2+=8)
|
|
2526
|
+
// {
|
|
2527
|
+
// // skip what we have before
|
|
2528
|
+
// if(PanelMode) count += 8 * offset;
|
|
2529
|
+
// for(Index k=0; k<depth; k++)
|
|
2530
|
+
// {
|
|
2531
|
+
// if (PacketSize==8) {
|
|
2532
|
+
// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
|
|
2533
|
+
// pstoreu(blockB+count, cj.pconj(A));
|
|
2534
|
+
// } else if (PacketSize==4) {
|
|
2535
|
+
// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
|
|
2536
|
+
// Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
|
|
2537
|
+
// pstoreu(blockB+count, cj.pconj(A));
|
|
2538
|
+
// pstoreu(blockB+count+PacketSize, cj.pconj(B));
|
|
2539
|
+
// } else {
|
|
2540
|
+
// const Scalar* b0 = &rhs[k*rhsStride + j2];
|
|
2541
|
+
// blockB[count+0] = cj(b0[0]);
|
|
2542
|
+
// blockB[count+1] = cj(b0[1]);
|
|
2543
|
+
// blockB[count+2] = cj(b0[2]);
|
|
2544
|
+
// blockB[count+3] = cj(b0[3]);
|
|
2545
|
+
// blockB[count+4] = cj(b0[4]);
|
|
2546
|
+
// blockB[count+5] = cj(b0[5]);
|
|
2547
|
+
// blockB[count+6] = cj(b0[6]);
|
|
2548
|
+
// blockB[count+7] = cj(b0[7]);
|
|
2549
|
+
// }
|
|
2550
|
+
// count += 8;
|
|
2551
|
+
// }
|
|
2552
|
+
// // skip what we have after
|
|
2553
|
+
// if(PanelMode) count += 8 * (stride-offset-depth);
|
|
2554
|
+
// }
|
|
2555
|
+
// }
|
|
2556
|
+
if(nr>=4)
|
|
2080
2557
|
{
|
|
2081
|
-
|
|
2082
|
-
if(PanelMode) count += 4 * offset;
|
|
2083
|
-
for(Index k=0; k<depth; k++)
|
|
2558
|
+
for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
|
|
2084
2559
|
{
|
|
2085
|
-
|
|
2086
|
-
|
|
2087
|
-
|
|
2088
|
-
|
|
2089
|
-
|
|
2090
|
-
|
|
2091
|
-
|
|
2092
|
-
|
|
2093
|
-
|
|
2094
|
-
|
|
2095
|
-
|
|
2560
|
+
// skip what we have before
|
|
2561
|
+
if(PanelMode) count += 4 * offset;
|
|
2562
|
+
for(Index k=0; k<depth; k++)
|
|
2563
|
+
{
|
|
2564
|
+
if (PacketSize==4) {
|
|
2565
|
+
Packet A = rhs.template loadPacket<Packet>(k, j2);
|
|
2566
|
+
pstoreu(blockB+count, cj.pconj(A));
|
|
2567
|
+
count += PacketSize;
|
|
2568
|
+
} else if (HasHalf && HalfPacketSize==4) {
|
|
2569
|
+
HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
|
|
2570
|
+
pstoreu(blockB+count, cj.pconj(A));
|
|
2571
|
+
count += HalfPacketSize;
|
|
2572
|
+
} else if (HasQuarter && QuarterPacketSize==4) {
|
|
2573
|
+
QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
|
|
2574
|
+
pstoreu(blockB+count, cj.pconj(A));
|
|
2575
|
+
count += QuarterPacketSize;
|
|
2576
|
+
} else {
|
|
2577
|
+
const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
|
|
2578
|
+
blockB[count+0] = cj(dm0(0));
|
|
2579
|
+
blockB[count+1] = cj(dm0(1));
|
|
2580
|
+
blockB[count+2] = cj(dm0(2));
|
|
2581
|
+
blockB[count+3] = cj(dm0(3));
|
|
2582
|
+
count += 4;
|
|
2583
|
+
}
|
|
2096
2584
|
}
|
|
2585
|
+
// skip what we have after
|
|
2586
|
+
if(PanelMode) count += 4 * (stride-offset-depth);
|
|
2097
2587
|
}
|
|
2098
|
-
// skip what we have after
|
|
2099
|
-
if(PanelMode) count += 4 * (stride-offset-depth);
|
|
2100
2588
|
}
|
|
2101
|
-
|
|
2102
|
-
|
|
2103
|
-
for(Index j2=packet_cols4; j2<cols; ++j2)
|
|
2104
|
-
{
|
|
2105
|
-
if(PanelMode) count += offset;
|
|
2106
|
-
for(Index k=0; k<depth; k++)
|
|
2589
|
+
// copy the remaining columns one at a time (nr==1)
|
|
2590
|
+
for(Index j2=packet_cols4; j2<cols; ++j2)
|
|
2107
2591
|
{
|
|
2108
|
-
|
|
2109
|
-
|
|
2592
|
+
if(PanelMode) count += offset;
|
|
2593
|
+
for(Index k=0; k<depth; k++)
|
|
2594
|
+
{
|
|
2595
|
+
blockB[count] = cj(rhs(k, j2));
|
|
2596
|
+
count += 1;
|
|
2597
|
+
}
|
|
2598
|
+
if(PanelMode) count += stride-offset-depth;
|
|
2110
2599
|
}
|
|
2111
|
-
if(PanelMode) count += stride-offset-depth;
|
|
2112
2600
|
}
|
|
2113
|
-
}
|
|
2601
|
+
};
|
|
2114
2602
|
|
|
2115
2603
|
} // end namespace internal
|
|
2116
2604
|
|