@smake/eigen 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/eigen/COPYING.APACHE +203 -0
- package/eigen/COPYING.BSD +1 -1
- package/eigen/COPYING.MINPACK +51 -52
- package/eigen/Eigen/Cholesky +0 -1
- package/eigen/Eigen/Core +108 -266
- package/eigen/Eigen/Eigenvalues +0 -1
- package/eigen/Eigen/Geometry +3 -6
- package/eigen/Eigen/Householder +0 -1
- package/eigen/Eigen/Jacobi +0 -1
- package/eigen/Eigen/KLUSupport +41 -0
- package/eigen/Eigen/LU +2 -5
- package/eigen/Eigen/OrderingMethods +0 -3
- package/eigen/Eigen/PaStiXSupport +1 -0
- package/eigen/Eigen/PardisoSupport +0 -0
- package/eigen/Eigen/QR +0 -1
- package/eigen/Eigen/QtAlignedMalloc +0 -1
- package/eigen/Eigen/SVD +0 -1
- package/eigen/Eigen/Sparse +0 -2
- package/eigen/Eigen/SparseCholesky +0 -8
- package/eigen/Eigen/SparseLU +4 -0
- package/eigen/Eigen/src/Cholesky/LDLT.h +42 -27
- package/eigen/Eigen/src/Cholesky/LLT.h +39 -23
- package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +90 -47
- package/eigen/Eigen/src/Core/ArithmeticSequence.h +413 -0
- package/eigen/Eigen/src/Core/Array.h +99 -11
- package/eigen/Eigen/src/Core/ArrayBase.h +1 -1
- package/eigen/Eigen/src/Core/ArrayWrapper.h +21 -21
- package/eigen/Eigen/src/Core/Assign.h +1 -1
- package/eigen/Eigen/src/Core/AssignEvaluator.h +125 -50
- package/eigen/Eigen/src/Core/Assign_MKL.h +10 -10
- package/eigen/Eigen/src/Core/BandMatrix.h +16 -16
- package/eigen/Eigen/src/Core/Block.h +56 -60
- package/eigen/Eigen/src/Core/BooleanRedux.h +29 -31
- package/eigen/Eigen/src/Core/CommaInitializer.h +7 -3
- package/eigen/Eigen/src/Core/CoreEvaluators.h +325 -272
- package/eigen/Eigen/src/Core/CoreIterators.h +5 -0
- package/eigen/Eigen/src/Core/CwiseBinaryOp.h +21 -22
- package/eigen/Eigen/src/Core/CwiseNullaryOp.h +153 -18
- package/eigen/Eigen/src/Core/CwiseUnaryOp.h +6 -6
- package/eigen/Eigen/src/Core/CwiseUnaryView.h +12 -10
- package/eigen/Eigen/src/Core/DenseBase.h +128 -39
- package/eigen/Eigen/src/Core/DenseCoeffsBase.h +25 -21
- package/eigen/Eigen/src/Core/DenseStorage.h +150 -68
- package/eigen/Eigen/src/Core/Diagonal.h +21 -23
- package/eigen/Eigen/src/Core/DiagonalMatrix.h +50 -2
- package/eigen/Eigen/src/Core/DiagonalProduct.h +1 -1
- package/eigen/Eigen/src/Core/Dot.h +10 -10
- package/eigen/Eigen/src/Core/EigenBase.h +10 -9
- package/eigen/Eigen/src/Core/ForceAlignedAccess.h +8 -4
- package/eigen/Eigen/src/Core/Fuzzy.h +3 -3
- package/eigen/Eigen/src/Core/GeneralProduct.h +20 -10
- package/eigen/Eigen/src/Core/GenericPacketMath.h +597 -147
- package/eigen/Eigen/src/Core/GlobalFunctions.h +40 -33
- package/eigen/Eigen/src/Core/IO.h +40 -7
- package/eigen/Eigen/src/Core/IndexedView.h +237 -0
- package/eigen/Eigen/src/Core/Inverse.h +9 -10
- package/eigen/Eigen/src/Core/Map.h +7 -7
- package/eigen/Eigen/src/Core/MapBase.h +5 -3
- package/eigen/Eigen/src/Core/MathFunctions.h +756 -120
- package/eigen/Eigen/src/Core/MathFunctionsImpl.h +118 -19
- package/eigen/Eigen/src/Core/Matrix.h +131 -25
- package/eigen/Eigen/src/Core/MatrixBase.h +19 -2
- package/eigen/Eigen/src/Core/NestByValue.h +25 -50
- package/eigen/Eigen/src/Core/NoAlias.h +4 -3
- package/eigen/Eigen/src/Core/NumTraits.h +107 -20
- package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +232 -0
- package/eigen/Eigen/src/Core/PermutationMatrix.h +3 -3
- package/eigen/Eigen/src/Core/PlainObjectBase.h +145 -54
- package/eigen/Eigen/src/Core/Product.h +30 -25
- package/eigen/Eigen/src/Core/ProductEvaluators.h +183 -142
- package/eigen/Eigen/src/Core/Random.h +37 -1
- package/eigen/Eigen/src/Core/Redux.h +180 -170
- package/eigen/Eigen/src/Core/Ref.h +118 -21
- package/eigen/Eigen/src/Core/Replicate.h +8 -8
- package/eigen/Eigen/src/Core/Reshaped.h +454 -0
- package/eigen/Eigen/src/Core/ReturnByValue.h +7 -5
- package/eigen/Eigen/src/Core/Reverse.h +18 -12
- package/eigen/Eigen/src/Core/Select.h +8 -6
- package/eigen/Eigen/src/Core/SelfAdjointView.h +33 -20
- package/eigen/Eigen/src/Core/Solve.h +14 -14
- package/eigen/Eigen/src/Core/SolveTriangular.h +13 -13
- package/eigen/Eigen/src/Core/SolverBase.h +41 -3
- package/eigen/Eigen/src/Core/StableNorm.h +100 -70
- package/eigen/Eigen/src/Core/StlIterators.h +463 -0
- package/eigen/Eigen/src/Core/Stride.h +9 -4
- package/eigen/Eigen/src/Core/Swap.h +5 -4
- package/eigen/Eigen/src/Core/Transpose.h +86 -27
- package/eigen/Eigen/src/Core/Transpositions.h +26 -8
- package/eigen/Eigen/src/Core/TriangularMatrix.h +88 -72
- package/eigen/Eigen/src/Core/VectorBlock.h +5 -5
- package/eigen/Eigen/src/Core/VectorwiseOp.h +159 -70
- package/eigen/Eigen/src/Core/Visitor.h +137 -29
- package/eigen/Eigen/src/Core/arch/AVX/Complex.h +50 -129
- package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +126 -337
- package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +1092 -155
- package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +65 -1
- package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +422 -0
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +186 -213
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1250 -252
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +89 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +152 -165
- package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +19 -251
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2937 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +221 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +629 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2042 -392
- package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +235 -80
- package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +700 -0
- package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +102 -14
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1649 -0
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +110 -0
- package/eigen/Eigen/src/Core/arch/Default/Half.h +942 -0
- package/eigen/Eigen/src/Core/arch/Default/Settings.h +1 -1
- package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +120 -0
- package/eigen/Eigen/src/Core/arch/{CUDA → GPU}/MathFunctions.h +16 -4
- package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1685 -0
- package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +80 -0
- package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
- package/eigen/Eigen/src/Core/arch/MSA/Complex.h +648 -0
- package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +387 -0
- package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1233 -0
- package/eigen/Eigen/src/Core/arch/NEON/Complex.h +313 -219
- package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +183 -0
- package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +54 -70
- package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4376 -549
- package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1419 -0
- package/eigen/Eigen/src/Core/arch/SSE/Complex.h +59 -179
- package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +65 -428
- package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +893 -283
- package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +65 -0
- package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +44 -0
- package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +752 -0
- package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +49 -0
- package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +232 -0
- package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +301 -0
- package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +670 -0
- package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +694 -0
- package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +85 -0
- package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +212 -183
- package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +101 -5
- package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +510 -395
- package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +11 -2
- package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +112 -46
- package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +31 -30
- package/eigen/Eigen/src/Core/functors/StlFunctors.h +32 -2
- package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +354 -15
- package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1073 -585
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +29 -7
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +4 -4
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +1 -1
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +382 -483
- package/eigen/Eigen/src/Core/products/Parallelizer.h +23 -9
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +23 -6
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +8 -6
- package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +2 -2
- package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +5 -4
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +3 -3
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +5 -3
- package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +13 -10
- package/eigen/Eigen/src/Core/util/BlasUtil.h +208 -124
- package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +512 -0
- package/eigen/Eigen/src/Core/util/Constants.h +25 -9
- package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +14 -2
- package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +28 -4
- package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +186 -0
- package/eigen/Eigen/src/Core/util/IntegralConstant.h +272 -0
- package/eigen/Eigen/src/Core/util/MKL_support.h +8 -1
- package/eigen/Eigen/src/Core/util/Macros.h +661 -250
- package/eigen/Eigen/src/Core/util/Memory.h +222 -52
- package/eigen/Eigen/src/Core/util/Meta.h +349 -105
- package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
- package/eigen/Eigen/src/Core/util/StaticAssert.h +8 -5
- package/eigen/Eigen/src/Core/util/SymbolicIndex.h +293 -0
- package/eigen/Eigen/src/Core/util/XprHelper.h +48 -30
- package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +1 -1
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +1 -1
- package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +2 -2
- package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +1 -1
- package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +2 -2
- package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +2 -2
- package/eigen/Eigen/src/Eigenvalues/RealQZ.h +9 -6
- package/eigen/Eigen/src/Eigenvalues/RealSchur.h +10 -5
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +75 -42
- package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +20 -15
- package/eigen/Eigen/src/Geometry/AlignedBox.h +99 -5
- package/eigen/Eigen/src/Geometry/AngleAxis.h +4 -4
- package/eigen/Eigen/src/Geometry/EulerAngles.h +3 -3
- package/eigen/Eigen/src/Geometry/Homogeneous.h +15 -11
- package/eigen/Eigen/src/Geometry/Hyperplane.h +1 -1
- package/eigen/Eigen/src/Geometry/OrthoMethods.h +3 -2
- package/eigen/Eigen/src/Geometry/ParametrizedLine.h +39 -2
- package/eigen/Eigen/src/Geometry/Quaternion.h +52 -14
- package/eigen/Eigen/src/Geometry/Rotation2D.h +3 -3
- package/eigen/Eigen/src/Geometry/Scaling.h +22 -4
- package/eigen/Eigen/src/Geometry/Transform.h +86 -65
- package/eigen/Eigen/src/Geometry/Translation.h +6 -6
- package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +168 -0
- package/eigen/Eigen/src/Householder/BlockHouseholder.h +9 -2
- package/eigen/Eigen/src/Householder/Householder.h +8 -4
- package/eigen/Eigen/src/Householder/HouseholderSequence.h +123 -48
- package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +15 -15
- package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +7 -23
- package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +5 -22
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +41 -47
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +51 -60
- package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +70 -20
- package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +2 -20
- package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +11 -9
- package/eigen/Eigen/src/Jacobi/Jacobi.h +31 -10
- package/eigen/Eigen/src/KLUSupport/KLUSupport.h +358 -0
- package/eigen/Eigen/src/LU/Determinant.h +35 -19
- package/eigen/Eigen/src/LU/FullPivLU.h +29 -43
- package/eigen/Eigen/src/LU/InverseImpl.h +25 -8
- package/eigen/Eigen/src/LU/PartialPivLU.h +67 -57
- package/eigen/Eigen/src/LU/arch/InverseSize4.h +351 -0
- package/eigen/Eigen/src/OrderingMethods/Amd.h +7 -17
- package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +297 -277
- package/eigen/Eigen/src/OrderingMethods/Ordering.h +6 -10
- package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +1 -1
- package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +10 -9
- package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +41 -20
- package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +100 -27
- package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +59 -22
- package/eigen/Eigen/src/QR/HouseholderQR.h +48 -23
- package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +25 -3
- package/eigen/Eigen/src/SVD/BDCSVD.h +137 -48
- package/eigen/Eigen/src/SVD/JacobiSVD.h +22 -14
- package/eigen/Eigen/src/SVD/SVDBase.h +82 -21
- package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +3 -3
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +16 -8
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +11 -36
- package/eigen/Eigen/src/SparseCore/CompressedStorage.h +16 -0
- package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +6 -6
- package/eigen/Eigen/src/SparseCore/SparseAssign.h +81 -27
- package/eigen/Eigen/src/SparseCore/SparseBlock.h +25 -57
- package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +40 -11
- package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +11 -15
- package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +2 -2
- package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +30 -8
- package/eigen/Eigen/src/SparseCore/SparseMatrix.h +124 -10
- package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +5 -12
- package/eigen/Eigen/src/SparseCore/SparseProduct.h +13 -1
- package/eigen/Eigen/src/SparseCore/SparseRef.h +7 -7
- package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +3 -0
- package/eigen/Eigen/src/SparseCore/SparseUtil.h +8 -0
- package/eigen/Eigen/src/SparseCore/SparseVector.h +1 -1
- package/eigen/Eigen/src/SparseLU/SparseLU.h +160 -10
- package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +1 -1
- package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +76 -2
- package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +2 -2
- package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +1 -1
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +1 -1
- package/eigen/Eigen/src/SparseQR/SparseQR.h +19 -6
- package/eigen/Eigen/src/StlSupport/StdDeque.h +2 -14
- package/eigen/Eigen/src/StlSupport/StdList.h +2 -2
- package/eigen/Eigen/src/StlSupport/StdVector.h +2 -2
- package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +6 -8
- package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +175 -39
- package/eigen/Eigen/src/misc/lapacke.h +5 -4
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +27 -1
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +155 -11
- package/eigen/Eigen/src/plugins/BlockMethods.h +626 -242
- package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +14 -0
- package/eigen/Eigen/src/plugins/IndexedViewMethods.h +262 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +4 -4
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +10 -0
- package/eigen/Eigen/src/plugins/ReshapedMethods.h +149 -0
- package/eigen/README.md +2 -0
- package/lib/LibEigen.d.ts +4 -0
- package/lib/LibEigen.js +14 -0
- package/lib/index.d.ts +1 -1
- package/lib/index.js +7 -3
- package/package.json +2 -10
- package/eigen/Eigen/CMakeLists.txt +0 -19
- package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
- package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
- package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
- package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
- package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
- package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
- package/lib/eigen.d.ts +0 -2
- package/lib/eigen.js +0 -15
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
// This file is part of Eigen, a lightweight C++ template library
|
|
2
2
|
// for linear algebra.
|
|
3
3
|
//
|
|
4
|
-
// Copyright (C) 2008-
|
|
4
|
+
// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
|
|
5
5
|
//
|
|
6
6
|
// This Source Code Form is subject to the terms of the Mozilla
|
|
7
7
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
|
@@ -14,11 +14,57 @@ namespace Eigen {
|
|
|
14
14
|
|
|
15
15
|
namespace internal {
|
|
16
16
|
|
|
17
|
+
enum GEMVPacketSizeType {
|
|
18
|
+
GEMVPacketFull = 0,
|
|
19
|
+
GEMVPacketHalf,
|
|
20
|
+
GEMVPacketQuarter
|
|
21
|
+
};
|
|
22
|
+
|
|
23
|
+
template <int N, typename T1, typename T2, typename T3>
|
|
24
|
+
struct gemv_packet_cond { typedef T3 type; };
|
|
25
|
+
|
|
26
|
+
template <typename T1, typename T2, typename T3>
|
|
27
|
+
struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> { typedef T1 type; };
|
|
28
|
+
|
|
29
|
+
template <typename T1, typename T2, typename T3>
|
|
30
|
+
struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> { typedef T2 type; };
|
|
31
|
+
|
|
32
|
+
template<typename LhsScalar, typename RhsScalar, int _PacketSize=GEMVPacketFull>
|
|
33
|
+
class gemv_traits
|
|
34
|
+
{
|
|
35
|
+
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
|
36
|
+
|
|
37
|
+
#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \
|
|
38
|
+
typedef typename gemv_packet_cond<packet_size, \
|
|
39
|
+
typename packet_traits<name ## Scalar>::type, \
|
|
40
|
+
typename packet_traits<name ## Scalar>::half, \
|
|
41
|
+
typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
|
|
42
|
+
prefix ## name ## Packet
|
|
43
|
+
|
|
44
|
+
PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
|
|
45
|
+
PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
|
|
46
|
+
PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
|
|
47
|
+
#undef PACKET_DECL_COND_PREFIX
|
|
48
|
+
|
|
49
|
+
public:
|
|
50
|
+
enum {
|
|
51
|
+
Vectorizable = unpacket_traits<_LhsPacket>::vectorizable &&
|
|
52
|
+
unpacket_traits<_RhsPacket>::vectorizable &&
|
|
53
|
+
int(unpacket_traits<_LhsPacket>::size)==int(unpacket_traits<_RhsPacket>::size),
|
|
54
|
+
LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
|
|
55
|
+
RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
|
|
56
|
+
ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1
|
|
57
|
+
};
|
|
58
|
+
|
|
59
|
+
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
|
|
60
|
+
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
|
|
61
|
+
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
|
|
62
|
+
};
|
|
63
|
+
|
|
64
|
+
|
|
17
65
|
/* Optimized col-major matrix * vector product:
|
|
18
|
-
* This algorithm processes
|
|
19
|
-
*
|
|
20
|
-
* the instruction dependency. Moreover, we know that all bands have the
|
|
21
|
-
* same alignment pattern.
|
|
66
|
+
* This algorithm processes the matrix per vertical panels,
|
|
67
|
+
* which are then processed horizontally per chunk of 8*PacketSize x 1 vertical segments.
|
|
22
68
|
*
|
|
23
69
|
* Mixing type logic: C += alpha * A * B
|
|
24
70
|
* | A | B |alpha| comments
|
|
@@ -27,56 +73,30 @@ namespace internal {
|
|
|
27
73
|
* |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
|
|
28
74
|
* |cplx |real |real | optimal case, vectorization possible via real-cplx mul
|
|
29
75
|
*
|
|
30
|
-
* Accesses to the matrix coefficients follow the following logic:
|
|
31
|
-
*
|
|
32
|
-
* - if all columns have the same alignment then
|
|
33
|
-
* - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
|
|
34
|
-
* - otherwise perform unaligned loads only (-> NoneAligned case)
|
|
35
|
-
* - otherwise
|
|
36
|
-
* - if even columns have the same alignment then
|
|
37
|
-
* // odd columns are guaranteed to have the same alignment too
|
|
38
|
-
* - if even or odd columns have the same alignment as the result, then
|
|
39
|
-
* // for a register size of 2 scalars, this is guarantee to be the case (e.g., SSE with double)
|
|
40
|
-
* - perform half aligned and half unaligned loads (-> EvenAligned case)
|
|
41
|
-
* - otherwise perform unaligned loads only (-> NoneAligned case)
|
|
42
|
-
* - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
|
|
43
|
-
* - one over 4 consecutive columns is guaranteed to be aligned with the result vector,
|
|
44
|
-
* perform simple aligned loads for this column and aligned loads plus re-alignment for the other. (-> FirstAligned case)
|
|
45
|
-
* // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
|
|
46
|
-
* - otherwise,
|
|
47
|
-
* // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
|
|
48
|
-
* // we currently fall back to the NoneAligned case
|
|
49
|
-
*
|
|
50
76
|
* The same reasoning applies to the transposed case.
|
|
51
|
-
*
|
|
52
|
-
* The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
|
|
53
|
-
* One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
|
|
54
|
-
* strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow
|
|
55
|
-
* compared to unaligned loads on a 4 byte boundary.
|
|
56
|
-
*
|
|
57
77
|
*/
|
|
58
78
|
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
|
|
59
79
|
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
|
|
60
80
|
{
|
|
81
|
+
typedef gemv_traits<LhsScalar,RhsScalar> Traits;
|
|
82
|
+
typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
|
|
83
|
+
typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;
|
|
84
|
+
|
|
61
85
|
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
|
62
86
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
|
|
67
|
-
RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
|
|
68
|
-
ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
|
|
69
|
-
};
|
|
87
|
+
typedef typename Traits::LhsPacket LhsPacket;
|
|
88
|
+
typedef typename Traits::RhsPacket RhsPacket;
|
|
89
|
+
typedef typename Traits::ResPacket ResPacket;
|
|
70
90
|
|
|
71
|
-
typedef typename
|
|
72
|
-
typedef typename
|
|
73
|
-
typedef typename
|
|
91
|
+
typedef typename HalfTraits::LhsPacket LhsPacketHalf;
|
|
92
|
+
typedef typename HalfTraits::RhsPacket RhsPacketHalf;
|
|
93
|
+
typedef typename HalfTraits::ResPacket ResPacketHalf;
|
|
74
94
|
|
|
75
|
-
typedef typename
|
|
76
|
-
typedef typename
|
|
77
|
-
typedef typename
|
|
95
|
+
typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
|
|
96
|
+
typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
|
|
97
|
+
typedef typename QuarterTraits::ResPacket ResPacketQuarter;
|
|
78
98
|
|
|
79
|
-
EIGEN_DONT_INLINE static void run(
|
|
99
|
+
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
|
|
80
100
|
Index rows, Index cols,
|
|
81
101
|
const LhsMapper& lhs,
|
|
82
102
|
const RhsMapper& rhs,
|
|
@@ -85,244 +105,187 @@ EIGEN_DONT_INLINE static void run(
|
|
|
85
105
|
};
|
|
86
106
|
|
|
87
107
|
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
|
|
88
|
-
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
|
|
108
|
+
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
|
|
89
109
|
Index rows, Index cols,
|
|
90
|
-
const LhsMapper&
|
|
110
|
+
const LhsMapper& alhs,
|
|
91
111
|
const RhsMapper& rhs,
|
|
92
112
|
ResScalar* res, Index resIncr,
|
|
93
113
|
RhsScalar alpha)
|
|
94
114
|
{
|
|
95
115
|
EIGEN_UNUSED_VARIABLE(resIncr);
|
|
96
116
|
eigen_internal_assert(resIncr==1);
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
pstore(&res[j], \
|
|
102
|
-
padd(pload<ResPacket>(&res[j]), \
|
|
103
|
-
padd( \
|
|
104
|
-
padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j), ptmp0), \
|
|
105
|
-
pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j), ptmp1)), \
|
|
106
|
-
padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j), ptmp2), \
|
|
107
|
-
pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j), ptmp3)) )))
|
|
108
|
-
|
|
109
|
-
typedef typename LhsMapper::VectorMapper LhsScalars;
|
|
117
|
+
|
|
118
|
+
// The following copy tells the compiler that lhs's attributes are not modified outside this function
|
|
119
|
+
// This helps GCC to generate proper code.
|
|
120
|
+
LhsMapper lhs(alhs);
|
|
110
121
|
|
|
111
122
|
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
|
|
112
123
|
conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
|
|
117
|
-
const Index columnsAtOnce = 4;
|
|
118
|
-
const Index peels = 2;
|
|
119
|
-
const Index LhsPacketAlignedMask = LhsPacketSize-1;
|
|
120
|
-
const Index ResPacketAlignedMask = ResPacketSize-1;
|
|
121
|
-
// const Index PeelAlignedMask = ResPacketSize*peels-1;
|
|
122
|
-
const Index size = rows;
|
|
124
|
+
conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
|
|
125
|
+
conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
|
|
123
126
|
|
|
124
127
|
const Index lhsStride = lhs.stride();
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
const Index
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
Index
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
{
|
|
151
|
-
// TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
|
|
152
|
-
// Currently, it seems to be better to perform unaligned loads anyway
|
|
153
|
-
alignmentPattern = NoneAligned;
|
|
154
|
-
}
|
|
155
|
-
else if (LhsPacketSize>1)
|
|
128
|
+
// TODO: for padded aligned inputs, we could enable aligned reads
|
|
129
|
+
enum { LhsAlignment = Unaligned,
|
|
130
|
+
ResPacketSize = Traits::ResPacketSize,
|
|
131
|
+
ResPacketSizeHalf = HalfTraits::ResPacketSize,
|
|
132
|
+
ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
|
|
133
|
+
LhsPacketSize = Traits::LhsPacketSize,
|
|
134
|
+
HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
|
|
135
|
+
HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
|
|
136
|
+
};
|
|
137
|
+
|
|
138
|
+
const Index n8 = rows-8*ResPacketSize+1;
|
|
139
|
+
const Index n4 = rows-4*ResPacketSize+1;
|
|
140
|
+
const Index n3 = rows-3*ResPacketSize+1;
|
|
141
|
+
const Index n2 = rows-2*ResPacketSize+1;
|
|
142
|
+
const Index n1 = rows-1*ResPacketSize+1;
|
|
143
|
+
const Index n_half = rows-1*ResPacketSizeHalf+1;
|
|
144
|
+
const Index n_quarter = rows-1*ResPacketSizeQuarter+1;
|
|
145
|
+
|
|
146
|
+
// TODO: improve the following heuristic:
|
|
147
|
+
const Index block_cols = cols<128 ? cols : (lhsStride*sizeof(LhsScalar)<32000?16:4);
|
|
148
|
+
ResPacket palpha = pset1<ResPacket>(alpha);
|
|
149
|
+
ResPacketHalf palpha_half = pset1<ResPacketHalf>(alpha);
|
|
150
|
+
ResPacketQuarter palpha_quarter = pset1<ResPacketQuarter>(alpha);
|
|
151
|
+
|
|
152
|
+
for(Index j2=0; j2<cols; j2+=block_cols)
|
|
156
153
|
{
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
|
|
161
|
-
++skipColumns;
|
|
162
|
-
if (skipColumns==LhsPacketSize)
|
|
154
|
+
Index jend = numext::mini(j2+block_cols,cols);
|
|
155
|
+
Index i=0;
|
|
156
|
+
for(; i<n8; i+=ResPacketSize*8)
|
|
163
157
|
{
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
158
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
|
|
159
|
+
c1 = pset1<ResPacket>(ResScalar(0)),
|
|
160
|
+
c2 = pset1<ResPacket>(ResScalar(0)),
|
|
161
|
+
c3 = pset1<ResPacket>(ResScalar(0)),
|
|
162
|
+
c4 = pset1<ResPacket>(ResScalar(0)),
|
|
163
|
+
c5 = pset1<ResPacket>(ResScalar(0)),
|
|
164
|
+
c6 = pset1<ResPacket>(ResScalar(0)),
|
|
165
|
+
c7 = pset1<ResPacket>(ResScalar(0));
|
|
166
|
+
|
|
167
|
+
for(Index j=j2; j<jend; j+=1)
|
|
168
|
+
{
|
|
169
|
+
RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
|
|
170
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
|
|
171
|
+
c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
|
|
172
|
+
c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
|
|
173
|
+
c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
|
|
174
|
+
c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*4,j),b0,c4);
|
|
175
|
+
c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*5,j),b0,c5);
|
|
176
|
+
c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*6,j),b0,c6);
|
|
177
|
+
c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*7,j),b0,c7);
|
|
178
|
+
}
|
|
179
|
+
pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
|
|
180
|
+
pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
|
|
181
|
+
pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
|
|
182
|
+
pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
|
|
183
|
+
pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu<ResPacket>(res+i+ResPacketSize*4)));
|
|
184
|
+
pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu<ResPacket>(res+i+ResPacketSize*5)));
|
|
185
|
+
pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu<ResPacket>(res+i+ResPacketSize*6)));
|
|
186
|
+
pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu<ResPacket>(res+i+ResPacketSize*7)));
|
|
167
187
|
}
|
|
168
|
-
|
|
188
|
+
if(i<n4)
|
|
169
189
|
{
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
190
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
|
|
191
|
+
c1 = pset1<ResPacket>(ResScalar(0)),
|
|
192
|
+
c2 = pset1<ResPacket>(ResScalar(0)),
|
|
193
|
+
c3 = pset1<ResPacket>(ResScalar(0));
|
|
173
194
|
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
|
|
187
|
-
const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;
|
|
195
|
+
for(Index j=j2; j<jend; j+=1)
|
|
196
|
+
{
|
|
197
|
+
RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
|
|
198
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
|
|
199
|
+
c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
|
|
200
|
+
c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
|
|
201
|
+
c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
|
|
202
|
+
}
|
|
203
|
+
pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
|
|
204
|
+
pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
|
|
205
|
+
pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
|
|
206
|
+
pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
|
|
188
207
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
208
|
+
i+=ResPacketSize*4;
|
|
209
|
+
}
|
|
210
|
+
if(i<n3)
|
|
211
|
+
{
|
|
212
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
|
|
213
|
+
c1 = pset1<ResPacket>(ResScalar(0)),
|
|
214
|
+
c2 = pset1<ResPacket>(ResScalar(0));
|
|
196
215
|
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
216
|
+
for(Index j=j2; j<jend; j+=1)
|
|
217
|
+
{
|
|
218
|
+
RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
|
|
219
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
|
|
220
|
+
c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
|
|
221
|
+
c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
|
|
222
|
+
}
|
|
223
|
+
pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
|
|
224
|
+
pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
|
|
225
|
+
pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
|
|
200
226
|
|
|
201
|
-
|
|
227
|
+
i+=ResPacketSize*3;
|
|
228
|
+
}
|
|
229
|
+
if(i<n2)
|
|
202
230
|
{
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
231
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
|
|
232
|
+
c1 = pset1<ResPacket>(ResScalar(0));
|
|
233
|
+
|
|
234
|
+
for(Index j=j2; j<jend; j+=1)
|
|
206
235
|
{
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
|
|
236
|
+
RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
|
|
237
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
|
|
238
|
+
c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
|
|
211
239
|
}
|
|
212
|
-
|
|
213
|
-
|
|
240
|
+
pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
|
|
241
|
+
pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
|
|
242
|
+
i+=ResPacketSize*2;
|
|
243
|
+
}
|
|
244
|
+
if(i<n1)
|
|
245
|
+
{
|
|
246
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0));
|
|
247
|
+
for(Index j=j2; j<jend; j+=1)
|
|
214
248
|
{
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
case AllAligned:
|
|
218
|
-
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
|
|
219
|
-
_EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
|
|
220
|
-
break;
|
|
221
|
-
case EvenAligned:
|
|
222
|
-
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
|
|
223
|
-
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
|
|
224
|
-
break;
|
|
225
|
-
case FirstAligned:
|
|
226
|
-
{
|
|
227
|
-
Index j = alignedStart;
|
|
228
|
-
if(peels>1)
|
|
229
|
-
{
|
|
230
|
-
LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
|
|
231
|
-
ResPacket T0, T1;
|
|
232
|
-
|
|
233
|
-
A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
|
|
234
|
-
A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
|
|
235
|
-
A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
|
|
236
|
-
|
|
237
|
-
for (; j<peeledSize; j+=peels*ResPacketSize)
|
|
238
|
-
{
|
|
239
|
-
A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
|
|
240
|
-
A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
|
|
241
|
-
A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
|
|
242
|
-
|
|
243
|
-
A00 = lhs0.template load<LhsPacket, Aligned>(j);
|
|
244
|
-
A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
|
|
245
|
-
T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
|
|
246
|
-
T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
|
|
247
|
-
|
|
248
|
-
T0 = pcj.pmadd(A01, ptmp1, T0);
|
|
249
|
-
A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
|
|
250
|
-
T0 = pcj.pmadd(A02, ptmp2, T0);
|
|
251
|
-
A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
|
|
252
|
-
T0 = pcj.pmadd(A03, ptmp3, T0);
|
|
253
|
-
pstore(&res[j],T0);
|
|
254
|
-
A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
|
|
255
|
-
T1 = pcj.pmadd(A11, ptmp1, T1);
|
|
256
|
-
T1 = pcj.pmadd(A12, ptmp2, T1);
|
|
257
|
-
T1 = pcj.pmadd(A13, ptmp3, T1);
|
|
258
|
-
pstore(&res[j+ResPacketSize],T1);
|
|
259
|
-
}
|
|
260
|
-
}
|
|
261
|
-
for (; j<alignedSize; j+=ResPacketSize)
|
|
262
|
-
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
|
|
263
|
-
break;
|
|
264
|
-
}
|
|
265
|
-
default:
|
|
266
|
-
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
|
|
267
|
-
_EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
|
|
268
|
-
break;
|
|
269
|
-
}
|
|
249
|
+
RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
|
|
250
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
|
|
270
251
|
}
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
252
|
+
pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
|
|
253
|
+
i+=ResPacketSize;
|
|
254
|
+
}
|
|
255
|
+
if(HasHalf && i<n_half)
|
|
275
256
|
{
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
257
|
+
ResPacketHalf c0 = pset1<ResPacketHalf>(ResScalar(0));
|
|
258
|
+
for(Index j=j2; j<jend; j+=1)
|
|
259
|
+
{
|
|
260
|
+
RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j,0));
|
|
261
|
+
c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i+0,j),b0,c0);
|
|
262
|
+
}
|
|
263
|
+
pstoreu(res+i+ResPacketSizeHalf*0, pmadd(c0,palpha_half,ploadu<ResPacketHalf>(res+i+ResPacketSizeHalf*0)));
|
|
264
|
+
i+=ResPacketSizeHalf;
|
|
280
265
|
}
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
// process remaining first and last columns (at most columnsAtOnce-1)
|
|
284
|
-
Index end = cols;
|
|
285
|
-
Index start = columnBound;
|
|
286
|
-
do
|
|
287
|
-
{
|
|
288
|
-
for (Index k=start; k<end; ++k)
|
|
266
|
+
if(HasQuarter && i<n_quarter)
|
|
289
267
|
{
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
if (Vectorizable)
|
|
268
|
+
ResPacketQuarter c0 = pset1<ResPacketQuarter>(ResScalar(0));
|
|
269
|
+
for(Index j=j2; j<jend; j+=1)
|
|
294
270
|
{
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
for (Index j=0; j<alignedStart; ++j)
|
|
298
|
-
res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
|
|
299
|
-
// process aligned result's coeffs
|
|
300
|
-
if (lhs0.template aligned<LhsPacket>(alignedStart))
|
|
301
|
-
for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
|
|
302
|
-
pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
|
|
303
|
-
else
|
|
304
|
-
for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
|
|
305
|
-
pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
|
|
271
|
+
RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j,0));
|
|
272
|
+
c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i+0,j),b0,c0);
|
|
306
273
|
}
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
for (Index i=alignedSize; i<size; ++i)
|
|
310
|
-
res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
|
|
274
|
+
pstoreu(res+i+ResPacketSizeQuarter*0, pmadd(c0,palpha_quarter,ploadu<ResPacketQuarter>(res+i+ResPacketSizeQuarter*0)));
|
|
275
|
+
i+=ResPacketSizeQuarter;
|
|
311
276
|
}
|
|
312
|
-
|
|
277
|
+
for(;i<rows;++i)
|
|
313
278
|
{
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
279
|
+
ResScalar c0(0);
|
|
280
|
+
for(Index j=j2; j<jend; j+=1)
|
|
281
|
+
c0 += cj.pmul(lhs(i,j), rhs(j,0));
|
|
282
|
+
res[i] += alpha*c0;
|
|
317
283
|
}
|
|
318
|
-
|
|
319
|
-
break;
|
|
320
|
-
} while(Vectorizable);
|
|
321
|
-
#undef _EIGEN_ACCUMULATE_PACKETS
|
|
284
|
+
}
|
|
322
285
|
}
|
|
323
286
|
|
|
324
287
|
/* Optimized row-major matrix * vector product:
|
|
325
|
-
* This algorithm processes 4 rows at
|
|
288
|
+
* This algorithm processes 4 rows at once, which allows it both to reduce
|
|
326
289
|
* the number of load/stores of the result by a factor 4 and to reduce
|
|
327
290
|
* the instruction dependency. Moreover, we know that all bands have the
|
|
328
291
|
* same alignment pattern.
|
|
@@ -334,25 +297,25 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C
|
|
|
334
297
|
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
|
|
335
298
|
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
|
|
336
299
|
{
|
|
337
|
-
typedef
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
300
|
+
typedef gemv_traits<LhsScalar,RhsScalar> Traits;
|
|
301
|
+
typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
|
|
302
|
+
typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;
|
|
303
|
+
|
|
304
|
+
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
|
305
|
+
|
|
306
|
+
typedef typename Traits::LhsPacket LhsPacket;
|
|
307
|
+
typedef typename Traits::RhsPacket RhsPacket;
|
|
308
|
+
typedef typename Traits::ResPacket ResPacket;
|
|
346
309
|
|
|
347
|
-
typedef typename
|
|
348
|
-
typedef typename
|
|
349
|
-
typedef typename
|
|
310
|
+
typedef typename HalfTraits::LhsPacket LhsPacketHalf;
|
|
311
|
+
typedef typename HalfTraits::RhsPacket RhsPacketHalf;
|
|
312
|
+
typedef typename HalfTraits::ResPacket ResPacketHalf;
|
|
350
313
|
|
|
351
|
-
typedef typename
|
|
352
|
-
typedef typename
|
|
353
|
-
typedef typename
|
|
314
|
+
typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
|
|
315
|
+
typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
|
|
316
|
+
typedef typename QuarterTraits::ResPacket ResPacketQuarter;
|
|
354
317
|
|
|
355
|
-
EIGEN_DONT_INLINE static void run(
|
|
318
|
+
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
|
|
356
319
|
Index rows, Index cols,
|
|
357
320
|
const LhsMapper& lhs,
|
|
358
321
|
const RhsMapper& rhs,
|
|
@@ -361,255 +324,191 @@ EIGEN_DONT_INLINE static void run(
|
|
|
361
324
|
};
|
|
362
325
|
|
|
363
326
|
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
|
|
364
|
-
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
|
|
327
|
+
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
|
|
365
328
|
Index rows, Index cols,
|
|
366
|
-
const LhsMapper&
|
|
329
|
+
const LhsMapper& alhs,
|
|
367
330
|
const RhsMapper& rhs,
|
|
368
331
|
ResScalar* res, Index resIncr,
|
|
369
332
|
ResScalar alpha)
|
|
370
333
|
{
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
|
|
375
|
-
#endif
|
|
376
|
-
|
|
377
|
-
#define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
|
|
378
|
-
RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); \
|
|
379
|
-
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
|
|
380
|
-
ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
|
|
381
|
-
ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
|
|
382
|
-
ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }
|
|
334
|
+
// The following copy tells the compiler that lhs's attributes are not modified outside this function
|
|
335
|
+
// This helps GCC to generate proper code.
|
|
336
|
+
LhsMapper lhs(alhs);
|
|
383
337
|
|
|
338
|
+
eigen_internal_assert(rhs.stride()==1);
|
|
384
339
|
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
|
|
385
340
|
conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
const Index
|
|
392
|
-
const Index
|
|
393
|
-
const Index
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
// we cannot assume the first element is aligned because of sub-matrices
|
|
410
|
-
const Index lhsAlignmentOffset = lhs.firstAligned(depth);
|
|
411
|
-
const Index rhsAlignmentOffset = rhs.firstAligned(rows);
|
|
412
|
-
|
|
413
|
-
// find how many rows do we have to skip to be aligned with rhs (if possible)
|
|
414
|
-
Index skipRows = 0;
|
|
415
|
-
// if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
|
|
416
|
-
if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) ||
|
|
417
|
-
(lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) ||
|
|
418
|
-
(rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) )
|
|
419
|
-
{
|
|
420
|
-
alignedSize = 0;
|
|
421
|
-
alignedStart = 0;
|
|
422
|
-
alignmentPattern = NoneAligned;
|
|
423
|
-
}
|
|
424
|
-
else if(LhsPacketSize > 4)
|
|
425
|
-
{
|
|
426
|
-
// TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
|
|
427
|
-
alignmentPattern = NoneAligned;
|
|
428
|
-
}
|
|
429
|
-
else if (LhsPacketSize>1)
|
|
341
|
+
conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
|
|
342
|
+
conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
|
|
343
|
+
|
|
344
|
+
// TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
|
|
345
|
+
// processing 8 rows at once might be counter productive wrt cache.
|
|
346
|
+
const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7;
|
|
347
|
+
const Index n4 = rows-3;
|
|
348
|
+
const Index n2 = rows-1;
|
|
349
|
+
|
|
350
|
+
// TODO: for padded aligned inputs, we could enable aligned reads
|
|
351
|
+
enum { LhsAlignment = Unaligned,
|
|
352
|
+
ResPacketSize = Traits::ResPacketSize,
|
|
353
|
+
ResPacketSizeHalf = HalfTraits::ResPacketSize,
|
|
354
|
+
ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
|
|
355
|
+
LhsPacketSize = Traits::LhsPacketSize,
|
|
356
|
+
LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
|
|
357
|
+
LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
|
|
358
|
+
HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
|
|
359
|
+
HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
|
|
360
|
+
};
|
|
361
|
+
|
|
362
|
+
Index i=0;
|
|
363
|
+
for(; i<n8; i+=8)
|
|
430
364
|
{
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
365
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
|
|
366
|
+
c1 = pset1<ResPacket>(ResScalar(0)),
|
|
367
|
+
c2 = pset1<ResPacket>(ResScalar(0)),
|
|
368
|
+
c3 = pset1<ResPacket>(ResScalar(0)),
|
|
369
|
+
c4 = pset1<ResPacket>(ResScalar(0)),
|
|
370
|
+
c5 = pset1<ResPacket>(ResScalar(0)),
|
|
371
|
+
c6 = pset1<ResPacket>(ResScalar(0)),
|
|
372
|
+
c7 = pset1<ResPacket>(ResScalar(0));
|
|
373
|
+
|
|
374
|
+
Index j=0;
|
|
375
|
+
for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
|
|
437
376
|
{
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
377
|
+
RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
|
|
378
|
+
|
|
379
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
|
|
380
|
+
c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
|
|
381
|
+
c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
|
|
382
|
+
c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
|
|
383
|
+
c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+4,j),b0,c4);
|
|
384
|
+
c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+5,j),b0,c5);
|
|
385
|
+
c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+6,j),b0,c6);
|
|
386
|
+
c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+7,j),b0,c7);
|
|
441
387
|
}
|
|
442
|
-
|
|
388
|
+
ResScalar cc0 = predux(c0);
|
|
389
|
+
ResScalar cc1 = predux(c1);
|
|
390
|
+
ResScalar cc2 = predux(c2);
|
|
391
|
+
ResScalar cc3 = predux(c3);
|
|
392
|
+
ResScalar cc4 = predux(c4);
|
|
393
|
+
ResScalar cc5 = predux(c5);
|
|
394
|
+
ResScalar cc6 = predux(c6);
|
|
395
|
+
ResScalar cc7 = predux(c7);
|
|
396
|
+
for(; j<cols; ++j)
|
|
443
397
|
{
|
|
444
|
-
|
|
445
|
-
|
|
398
|
+
RhsScalar b0 = rhs(j,0);
|
|
399
|
+
|
|
400
|
+
cc0 += cj.pmul(lhs(i+0,j), b0);
|
|
401
|
+
cc1 += cj.pmul(lhs(i+1,j), b0);
|
|
402
|
+
cc2 += cj.pmul(lhs(i+2,j), b0);
|
|
403
|
+
cc3 += cj.pmul(lhs(i+3,j), b0);
|
|
404
|
+
cc4 += cj.pmul(lhs(i+4,j), b0);
|
|
405
|
+
cc5 += cj.pmul(lhs(i+5,j), b0);
|
|
406
|
+
cc6 += cj.pmul(lhs(i+6,j), b0);
|
|
407
|
+
cc7 += cj.pmul(lhs(i+7,j), b0);
|
|
446
408
|
}
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
409
|
+
res[(i+0)*resIncr] += alpha*cc0;
|
|
410
|
+
res[(i+1)*resIncr] += alpha*cc1;
|
|
411
|
+
res[(i+2)*resIncr] += alpha*cc2;
|
|
412
|
+
res[(i+3)*resIncr] += alpha*cc3;
|
|
413
|
+
res[(i+4)*resIncr] += alpha*cc4;
|
|
414
|
+
res[(i+5)*resIncr] += alpha*cc5;
|
|
415
|
+
res[(i+6)*resIncr] += alpha*cc6;
|
|
416
|
+
res[(i+7)*resIncr] += alpha*cc7;
|
|
452
417
|
}
|
|
453
|
-
|
|
418
|
+
for(; i<n4; i+=4)
|
|
454
419
|
{
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
|
|
461
|
-
const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;
|
|
420
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
|
|
421
|
+
c1 = pset1<ResPacket>(ResScalar(0)),
|
|
422
|
+
c2 = pset1<ResPacket>(ResScalar(0)),
|
|
423
|
+
c3 = pset1<ResPacket>(ResScalar(0));
|
|
462
424
|
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
|
|
468
|
-
ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
|
|
469
|
-
|
|
470
|
-
// this helps the compiler generating good binary code
|
|
471
|
-
const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0), lhs1 = lhs.getVectorMapper(i+offset1, 0),
|
|
472
|
-
lhs2 = lhs.getVectorMapper(i+2, 0), lhs3 = lhs.getVectorMapper(i+offset3, 0);
|
|
425
|
+
Index j=0;
|
|
426
|
+
for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
|
|
427
|
+
{
|
|
428
|
+
RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
|
|
473
429
|
|
|
474
|
-
|
|
430
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
|
|
431
|
+
c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
|
|
432
|
+
c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
|
|
433
|
+
c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
|
|
434
|
+
}
|
|
435
|
+
ResScalar cc0 = predux(c0);
|
|
436
|
+
ResScalar cc1 = predux(c1);
|
|
437
|
+
ResScalar cc2 = predux(c2);
|
|
438
|
+
ResScalar cc3 = predux(c3);
|
|
439
|
+
for(; j<cols; ++j)
|
|
475
440
|
{
|
|
476
|
-
|
|
477
|
-
ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
|
|
478
|
-
ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));
|
|
441
|
+
RhsScalar b0 = rhs(j,0);
|
|
479
442
|
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
443
|
+
cc0 += cj.pmul(lhs(i+0,j), b0);
|
|
444
|
+
cc1 += cj.pmul(lhs(i+1,j), b0);
|
|
445
|
+
cc2 += cj.pmul(lhs(i+2,j), b0);
|
|
446
|
+
cc3 += cj.pmul(lhs(i+3,j), b0);
|
|
447
|
+
}
|
|
448
|
+
res[(i+0)*resIncr] += alpha*cc0;
|
|
449
|
+
res[(i+1)*resIncr] += alpha*cc1;
|
|
450
|
+
res[(i+2)*resIncr] += alpha*cc2;
|
|
451
|
+
res[(i+3)*resIncr] += alpha*cc3;
|
|
452
|
+
}
|
|
453
|
+
for(; i<n2; i+=2)
|
|
454
|
+
{
|
|
455
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
|
|
456
|
+
c1 = pset1<ResPacket>(ResScalar(0));
|
|
488
457
|
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
case AllAligned:
|
|
494
|
-
for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
|
|
495
|
-
_EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
|
|
496
|
-
break;
|
|
497
|
-
case EvenAligned:
|
|
498
|
-
for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
|
|
499
|
-
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
|
|
500
|
-
break;
|
|
501
|
-
case FirstAligned:
|
|
502
|
-
{
|
|
503
|
-
Index j = alignedStart;
|
|
504
|
-
if (peels>1)
|
|
505
|
-
{
|
|
506
|
-
/* Here we process 4 rows with two peeled iterations to hide
|
|
507
|
-
* the overhead of unaligned loads. Moreover unaligned loads are handled
|
|
508
|
-
* using special shift/move operations between the two aligned packets
|
|
509
|
-
* overlapping the desired unaligned packet. This is *much* more efficient
|
|
510
|
-
* than basic unaligned loads.
|
|
511
|
-
*/
|
|
512
|
-
LhsPacket A01, A02, A03, A11, A12, A13;
|
|
513
|
-
A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
|
|
514
|
-
A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
|
|
515
|
-
A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
|
|
516
|
-
|
|
517
|
-
for (; j<peeledSize; j+=peels*RhsPacketSize)
|
|
518
|
-
{
|
|
519
|
-
RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);
|
|
520
|
-
A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
|
|
521
|
-
A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
|
|
522
|
-
A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
|
|
523
|
-
|
|
524
|
-
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0);
|
|
525
|
-
ptmp1 = pcj.pmadd(A01, b, ptmp1);
|
|
526
|
-
A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
|
|
527
|
-
ptmp2 = pcj.pmadd(A02, b, ptmp2);
|
|
528
|
-
A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
|
|
529
|
-
ptmp3 = pcj.pmadd(A03, b, ptmp3);
|
|
530
|
-
A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
|
|
531
|
-
|
|
532
|
-
b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0);
|
|
533
|
-
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0);
|
|
534
|
-
ptmp1 = pcj.pmadd(A11, b, ptmp1);
|
|
535
|
-
ptmp2 = pcj.pmadd(A12, b, ptmp2);
|
|
536
|
-
ptmp3 = pcj.pmadd(A13, b, ptmp3);
|
|
537
|
-
}
|
|
538
|
-
}
|
|
539
|
-
for (; j<alignedSize; j+=RhsPacketSize)
|
|
540
|
-
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
|
|
541
|
-
break;
|
|
542
|
-
}
|
|
543
|
-
default:
|
|
544
|
-
for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
|
|
545
|
-
_EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
|
|
546
|
-
break;
|
|
547
|
-
}
|
|
548
|
-
tmp0 += predux(ptmp0);
|
|
549
|
-
tmp1 += predux(ptmp1);
|
|
550
|
-
tmp2 += predux(ptmp2);
|
|
551
|
-
tmp3 += predux(ptmp3);
|
|
552
|
-
}
|
|
553
|
-
} // end explicit vectorization
|
|
458
|
+
Index j=0;
|
|
459
|
+
for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
|
|
460
|
+
{
|
|
461
|
+
RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
|
|
554
462
|
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
463
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
|
|
464
|
+
c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
|
|
465
|
+
}
|
|
466
|
+
ResScalar cc0 = predux(c0);
|
|
467
|
+
ResScalar cc1 = predux(c1);
|
|
468
|
+
for(; j<cols; ++j)
|
|
558
469
|
{
|
|
559
|
-
RhsScalar
|
|
560
|
-
|
|
561
|
-
|
|
470
|
+
RhsScalar b0 = rhs(j,0);
|
|
471
|
+
|
|
472
|
+
cc0 += cj.pmul(lhs(i+0,j), b0);
|
|
473
|
+
cc1 += cj.pmul(lhs(i+1,j), b0);
|
|
562
474
|
}
|
|
563
|
-
res[i*resIncr]
|
|
564
|
-
res[(i+
|
|
565
|
-
res[(i+2)*resIncr] += alpha*tmp2;
|
|
566
|
-
res[(i+offset3)*resIncr] += alpha*tmp3;
|
|
475
|
+
res[(i+0)*resIncr] += alpha*cc0;
|
|
476
|
+
res[(i+1)*resIncr] += alpha*cc1;
|
|
567
477
|
}
|
|
568
|
-
|
|
569
|
-
// process remaining first and last rows (at most columnsAtOnce-1)
|
|
570
|
-
Index end = rows;
|
|
571
|
-
Index start = rowBound;
|
|
572
|
-
do
|
|
478
|
+
for(; i<rows; ++i)
|
|
573
479
|
{
|
|
574
|
-
|
|
480
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0));
|
|
481
|
+
ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
|
|
482
|
+
ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
|
|
483
|
+
Index j=0;
|
|
484
|
+
for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
|
|
575
485
|
{
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
|
|
579
|
-
// process first unaligned result's coeffs
|
|
580
|
-
// FIXME this loop get vectorized by the compiler !
|
|
581
|
-
for (Index j=0; j<alignedStart; ++j)
|
|
582
|
-
tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
|
|
583
|
-
|
|
584
|
-
if (alignedSize>alignedStart)
|
|
585
|
-
{
|
|
586
|
-
// process aligned rhs coeffs
|
|
587
|
-
if (lhs0.template aligned<LhsPacket>(alignedStart))
|
|
588
|
-
for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
|
|
589
|
-
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
|
|
590
|
-
else
|
|
591
|
-
for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
|
|
592
|
-
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
|
|
593
|
-
tmp0 += predux(ptmp0);
|
|
594
|
-
}
|
|
595
|
-
|
|
596
|
-
// process remaining scalars
|
|
597
|
-
// FIXME this loop get vectorized by the compiler !
|
|
598
|
-
for (Index j=alignedSize; j<depth; ++j)
|
|
599
|
-
tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
|
|
600
|
-
res[i*resIncr] += alpha*tmp0;
|
|
486
|
+
RhsPacket b0 = rhs.template load<RhsPacket,Unaligned>(j,0);
|
|
487
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i,j),b0,c0);
|
|
601
488
|
}
|
|
602
|
-
|
|
489
|
+
ResScalar cc0 = predux(c0);
|
|
490
|
+
if (HasHalf) {
|
|
491
|
+
for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf)
|
|
492
|
+
{
|
|
493
|
+
RhsPacketHalf b0 = rhs.template load<RhsPacketHalf,Unaligned>(j,0);
|
|
494
|
+
c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i,j),b0,c0_h);
|
|
495
|
+
}
|
|
496
|
+
cc0 += predux(c0_h);
|
|
497
|
+
}
|
|
498
|
+
if (HasQuarter) {
|
|
499
|
+
for(; j+LhsPacketSizeQuarter<=cols; j+=LhsPacketSizeQuarter)
|
|
500
|
+
{
|
|
501
|
+
RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter,Unaligned>(j,0);
|
|
502
|
+
c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i,j),b0,c0_q);
|
|
503
|
+
}
|
|
504
|
+
cc0 += predux(c0_q);
|
|
505
|
+
}
|
|
506
|
+
for(; j<cols; ++j)
|
|
603
507
|
{
|
|
604
|
-
|
|
605
|
-
end = skipRows;
|
|
606
|
-
skipRows = 0;
|
|
508
|
+
cc0 += cj.pmul(lhs(i,j), rhs(j,0));
|
|
607
509
|
}
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
} while(Vectorizable);
|
|
611
|
-
|
|
612
|
-
#undef _EIGEN_ACCUMULATE_PACKETS
|
|
510
|
+
res[i*resIncr] += alpha*cc0;
|
|
511
|
+
}
|
|
613
512
|
}
|
|
614
513
|
|
|
615
514
|
} // end namespace internal
|