@smake/eigen 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/eigen/Eigen/AccelerateSupport +52 -0
- package/eigen/Eigen/Cholesky +18 -20
- package/eigen/Eigen/CholmodSupport +28 -28
- package/eigen/Eigen/Core +187 -120
- package/eigen/Eigen/Eigenvalues +16 -13
- package/eigen/Eigen/Geometry +18 -18
- package/eigen/Eigen/Householder +9 -7
- package/eigen/Eigen/IterativeLinearSolvers +8 -4
- package/eigen/Eigen/Jacobi +14 -13
- package/eigen/Eigen/KLUSupport +23 -21
- package/eigen/Eigen/LU +15 -16
- package/eigen/Eigen/MetisSupport +12 -12
- package/eigen/Eigen/OrderingMethods +54 -51
- package/eigen/Eigen/PaStiXSupport +23 -21
- package/eigen/Eigen/PardisoSupport +17 -14
- package/eigen/Eigen/QR +18 -20
- package/eigen/Eigen/QtAlignedMalloc +5 -12
- package/eigen/Eigen/SPQRSupport +21 -14
- package/eigen/Eigen/SVD +23 -17
- package/eigen/Eigen/Sparse +1 -2
- package/eigen/Eigen/SparseCholesky +18 -15
- package/eigen/Eigen/SparseCore +18 -17
- package/eigen/Eigen/SparseLU +9 -9
- package/eigen/Eigen/SparseQR +16 -14
- package/eigen/Eigen/StdDeque +5 -2
- package/eigen/Eigen/StdList +5 -2
- package/eigen/Eigen/StdVector +5 -2
- package/eigen/Eigen/SuperLUSupport +30 -24
- package/eigen/Eigen/ThreadPool +80 -0
- package/eigen/Eigen/UmfPackSupport +19 -17
- package/eigen/Eigen/Version +14 -0
- package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
- package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/LDLT.h +366 -405
- package/eigen/Eigen/src/Cholesky/LLT.h +323 -367
- package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
- package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +585 -529
- package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/ArithmeticSequence.h +143 -317
- package/eigen/Eigen/src/Core/Array.h +329 -370
- package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
- package/eigen/Eigen/src/Core/ArrayWrapper.h +126 -170
- package/eigen/Eigen/src/Core/Assign.h +30 -40
- package/eigen/Eigen/src/Core/AssignEvaluator.h +651 -604
- package/eigen/Eigen/src/Core/Assign_MKL.h +125 -120
- package/eigen/Eigen/src/Core/BandMatrix.h +267 -282
- package/eigen/Eigen/src/Core/Block.h +371 -390
- package/eigen/Eigen/src/Core/CommaInitializer.h +85 -100
- package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
- package/eigen/Eigen/src/Core/CoreEvaluators.h +1214 -937
- package/eigen/Eigen/src/Core/CoreIterators.h +72 -63
- package/eigen/Eigen/src/Core/CwiseBinaryOp.h +112 -129
- package/eigen/Eigen/src/Core/CwiseNullaryOp.h +676 -702
- package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
- package/eigen/Eigen/src/Core/CwiseUnaryOp.h +55 -67
- package/eigen/Eigen/src/Core/CwiseUnaryView.h +127 -92
- package/eigen/Eigen/src/Core/DenseBase.h +630 -658
- package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -628
- package/eigen/Eigen/src/Core/DenseStorage.h +511 -590
- package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
- package/eigen/Eigen/src/Core/Diagonal.h +168 -207
- package/eigen/Eigen/src/Core/DiagonalMatrix.h +346 -317
- package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
- package/eigen/Eigen/src/Core/Dot.h +167 -217
- package/eigen/Eigen/src/Core/EigenBase.h +74 -85
- package/eigen/Eigen/src/Core/Fill.h +138 -0
- package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
- package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -113
- package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
- package/eigen/Eigen/src/Core/GeneralProduct.h +315 -261
- package/eigen/Eigen/src/Core/GenericPacketMath.h +1182 -520
- package/eigen/Eigen/src/Core/GlobalFunctions.h +193 -157
- package/eigen/Eigen/src/Core/IO.h +131 -156
- package/eigen/Eigen/src/Core/IndexedView.h +209 -125
- package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
- package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/Inverse.h +50 -59
- package/eigen/Eigen/src/Core/Map.h +123 -141
- package/eigen/Eigen/src/Core/MapBase.h +255 -282
- package/eigen/Eigen/src/Core/MathFunctions.h +1247 -1201
- package/eigen/Eigen/src/Core/MathFunctionsImpl.h +162 -99
- package/eigen/Eigen/src/Core/Matrix.h +463 -494
- package/eigen/Eigen/src/Core/MatrixBase.h +468 -470
- package/eigen/Eigen/src/Core/NestByValue.h +58 -52
- package/eigen/Eigen/src/Core/NoAlias.h +79 -86
- package/eigen/Eigen/src/Core/NumTraits.h +206 -206
- package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +163 -142
- package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
- package/eigen/Eigen/src/Core/PlainObjectBase.h +858 -972
- package/eigen/Eigen/src/Core/Product.h +246 -130
- package/eigen/Eigen/src/Core/ProductEvaluators.h +779 -671
- package/eigen/Eigen/src/Core/Random.h +153 -164
- package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
- package/eigen/Eigen/src/Core/RealView.h +250 -0
- package/eigen/Eigen/src/Core/Redux.h +334 -314
- package/eigen/Eigen/src/Core/Ref.h +259 -257
- package/eigen/Eigen/src/Core/Replicate.h +92 -104
- package/eigen/Eigen/src/Core/Reshaped.h +215 -271
- package/eigen/Eigen/src/Core/ReturnByValue.h +47 -55
- package/eigen/Eigen/src/Core/Reverse.h +133 -148
- package/eigen/Eigen/src/Core/Select.h +68 -140
- package/eigen/Eigen/src/Core/SelfAdjointView.h +254 -290
- package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
- package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
- package/eigen/Eigen/src/Core/Solve.h +88 -102
- package/eigen/Eigen/src/Core/SolveTriangular.h +126 -124
- package/eigen/Eigen/src/Core/SolverBase.h +132 -133
- package/eigen/Eigen/src/Core/StableNorm.h +113 -147
- package/eigen/Eigen/src/Core/StlIterators.h +404 -248
- package/eigen/Eigen/src/Core/Stride.h +90 -92
- package/eigen/Eigen/src/Core/Swap.h +70 -39
- package/eigen/Eigen/src/Core/Transpose.h +258 -295
- package/eigen/Eigen/src/Core/Transpositions.h +270 -333
- package/eigen/Eigen/src/Core/TriangularMatrix.h +642 -743
- package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
- package/eigen/Eigen/src/Core/VectorwiseOp.h +653 -704
- package/eigen/Eigen/src/Core/Visitor.h +464 -308
- package/eigen/Eigen/src/Core/arch/AVX/Complex.h +380 -187
- package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +65 -163
- package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2145 -638
- package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
- package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +253 -60
- package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +278 -228
- package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +48 -269
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1597 -754
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
- package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +229 -41
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +420 -184
- package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +40 -49
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2962 -2213
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +196 -212
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +713 -441
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2380 -1362
- package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
- package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +390 -224
- package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +78 -67
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1784 -799
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +167 -50
- package/eigen/Eigen/src/Core/arch/Default/Half.h +528 -379
- package/eigen/Eigen/src/Core/arch/Default/Settings.h +10 -12
- package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
- package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +41 -40
- package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +550 -523
- package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
- package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +27 -30
- package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +8 -8
- package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
- package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
- package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
- package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
- package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
- package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
- package/eigen/Eigen/src/Core/arch/MSA/Complex.h +54 -82
- package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +84 -92
- package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +51 -47
- package/eigen/Eigen/src/Core/arch/NEON/Complex.h +454 -306
- package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +175 -115
- package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +23 -30
- package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4366 -2857
- package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +616 -393
- package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
- package/eigen/Eigen/src/Core/arch/SSE/Complex.h +350 -198
- package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +38 -149
- package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +1791 -912
- package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
- package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +128 -40
- package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +10 -6
- package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +156 -234
- package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +6 -3
- package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +27 -32
- package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +119 -117
- package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +325 -419
- package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +15 -17
- package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +325 -181
- package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +94 -83
- package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +811 -458
- package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +121 -124
- package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +576 -370
- package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +194 -109
- package/eigen/Eigen/src/Core/functors/StlFunctors.h +95 -112
- package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
- package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1038 -749
- package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1883 -1375
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +312 -370
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +189 -176
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +84 -81
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +292 -337
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
- package/eigen/Eigen/src/Core/products/Parallelizer.h +207 -105
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +327 -388
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +138 -147
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
- package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
- package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -47
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -277
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
- package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +68 -94
- package/eigen/Eigen/src/Core/util/Assert.h +158 -0
- package/eigen/Eigen/src/Core/util/BlasUtil.h +342 -303
- package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +348 -317
- package/eigen/Eigen/src/Core/util/Constants.h +297 -262
- package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -90
- package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
- package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +449 -247
- package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
- package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
- package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +417 -116
- package/eigen/Eigen/src/Core/util/IntegralConstant.h +211 -204
- package/eigen/Eigen/src/Core/util/MKL_support.h +39 -37
- package/eigen/Eigen/src/Core/util/Macros.h +655 -773
- package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
- package/eigen/Eigen/src/Core/util/Memory.h +970 -748
- package/eigen/Eigen/src/Core/util/Meta.h +581 -633
- package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
- package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
- package/eigen/Eigen/src/Core/util/ReshapedHelper.h +17 -17
- package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
- package/eigen/Eigen/src/Core/util/StaticAssert.h +50 -166
- package/eigen/Eigen/src/Core/util/SymbolicIndex.h +377 -225
- package/eigen/Eigen/src/Core/util/XprHelper.h +784 -547
- package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
- package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
- package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
- package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
- package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
- package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +89 -105
- package/eigen/Eigen/src/Eigenvalues/RealQZ.h +537 -607
- package/eigen/Eigen/src/Eigenvalues/RealSchur.h +342 -381
- package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +541 -595
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
- package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +430 -462
- package/eigen/Eigen/src/Geometry/AlignedBox.h +226 -227
- package/eigen/Eigen/src/Geometry/AngleAxis.h +131 -133
- package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
- package/eigen/Eigen/src/Geometry/Homogeneous.h +285 -333
- package/eigen/Eigen/src/Geometry/Hyperplane.h +151 -160
- package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -146
- package/eigen/Eigen/src/Geometry/ParametrizedLine.h +127 -127
- package/eigen/Eigen/src/Geometry/Quaternion.h +566 -506
- package/eigen/Eigen/src/Geometry/Rotation2D.h +107 -105
- package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
- package/eigen/Eigen/src/Geometry/Scaling.h +113 -106
- package/eigen/Eigen/src/Geometry/Transform.h +858 -936
- package/eigen/Eigen/src/Geometry/Translation.h +94 -92
- package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
- package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +90 -104
- package/eigen/Eigen/src/Householder/BlockHouseholder.h +51 -46
- package/eigen/Eigen/src/Householder/Householder.h +102 -124
- package/eigen/Eigen/src/Householder/HouseholderSequence.h +412 -453
- package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +149 -162
- package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +124 -119
- package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +92 -104
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +251 -243
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +224 -228
- package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +178 -227
- package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +79 -84
- package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +54 -60
- package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Jacobi/Jacobi.h +252 -308
- package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/KLUSupport/KLUSupport.h +208 -227
- package/eigen/Eigen/src/LU/Determinant.h +50 -69
- package/eigen/Eigen/src/LU/FullPivLU.h +545 -596
- package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/LU/InverseImpl.h +206 -285
- package/eigen/Eigen/src/LU/PartialPivLU.h +390 -428
- package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
- package/eigen/Eigen/src/LU/arch/InverseSize4.h +72 -70
- package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
- package/eigen/Eigen/src/OrderingMethods/Amd.h +243 -265
- package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +831 -1004
- package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/OrderingMethods/Ordering.h +112 -119
- package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
- package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -430
- package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +479 -479
- package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
- package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +166 -153
- package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +495 -475
- package/eigen/Eigen/src/QR/HouseholderQR.h +394 -285
- package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
- package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +244 -264
- package/eigen/Eigen/src/SVD/BDCSVD.h +817 -713
- package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
- package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SVD/JacobiSVD.h +577 -543
- package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
- package/eigen/Eigen/src/SVD/SVDBase.h +242 -182
- package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +200 -235
- package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +765 -594
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +308 -94
- package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
- package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -252
- package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +134 -178
- package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCore/SparseAssign.h +149 -140
- package/eigen/Eigen/src/SparseCore/SparseBlock.h +403 -440
- package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
- package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +525 -303
- package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +555 -339
- package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
- package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +169 -197
- package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
- package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
- package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
- package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
- package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1603 -1245
- package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -350
- package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
- package/eigen/Eigen/src/SparseCore/SparseProduct.h +94 -97
- package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
- package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
- package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +370 -416
- package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
- package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
- package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
- package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
- package/eigen/Eigen/src/SparseCore/SparseUtil.h +138 -115
- package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
- package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
- package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
- package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseLU/SparseLU.h +756 -710
- package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
- package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
- package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
- package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +245 -301
- package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
- package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
- package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +89 -100
- package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
- package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
- package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +124 -132
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
- package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
- package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
- package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
- package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseQR/SparseQR.h +450 -502
- package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -93
- package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
- package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
- package/eigen/Eigen/src/StlSupport/details.h +48 -50
- package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -730
- package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
- package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
- package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
- package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
- package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
- package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
- package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
- package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
- package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
- package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
- package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
- package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
- package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +428 -464
- package/eigen/Eigen/src/misc/Image.h +41 -43
- package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/misc/Kernel.h +39 -41
- package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
- package/eigen/Eigen/src/misc/blas.h +83 -426
- package/eigen/Eigen/src/misc/lapacke.h +9972 -16179
- package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
- package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
- package/eigen/Eigen/src/plugins/{BlockMethods.h → BlockMethods.inc} +434 -506
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
- package/eigen/Eigen/src/plugins/{CommonCwiseUnaryOps.h → CommonCwiseUnaryOps.inc} +58 -68
- package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
- package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
- package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
- package/package.json +1 -1
- package/eigen/COPYING.APACHE +0 -203
- package/eigen/COPYING.BSD +0 -26
- package/eigen/COPYING.GPL +0 -674
- package/eigen/COPYING.LGPL +0 -502
- package/eigen/COPYING.MINPACK +0 -51
- package/eigen/COPYING.MPL2 +0 -373
- package/eigen/COPYING.README +0 -18
- package/eigen/Eigen/src/Core/BooleanRedux.h +0 -162
- package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -258
- package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +0 -120
- package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +0 -694
- package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
- package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
- package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
- package/eigen/Eigen/src/misc/lapack.h +0 -152
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -358
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -696
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
- package/eigen/Eigen/src/plugins/IndexedViewMethods.h +0 -262
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -95
- package/eigen/Eigen/src/plugins/ReshapedMethods.h +0 -149
- package/eigen/README.md +0 -5
|
@@ -10,57 +10,57 @@
|
|
|
10
10
|
#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
|
|
11
11
|
#define EIGEN_GENERAL_BLOCK_PANEL_H
|
|
12
12
|
|
|
13
|
+
// IWYU pragma: private
|
|
14
|
+
#include "../InternalHeaderCheck.h"
|
|
13
15
|
|
|
14
16
|
namespace Eigen {
|
|
15
17
|
|
|
16
18
|
namespace internal {
|
|
17
19
|
|
|
18
|
-
enum GEBPPacketSizeType {
|
|
19
|
-
GEBPPacketFull = 0,
|
|
20
|
-
GEBPPacketHalf,
|
|
21
|
-
GEBPPacketQuarter
|
|
22
|
-
};
|
|
20
|
+
enum GEBPPacketSizeType { GEBPPacketFull = 0, GEBPPacketHalf, GEBPPacketQuarter };
|
|
23
21
|
|
|
24
|
-
template<typename
|
|
22
|
+
template <typename LhsScalar_, typename RhsScalar_, bool ConjLhs_ = false, bool ConjRhs_ = false,
|
|
23
|
+
int Arch = Architecture::Target, int PacketSize_ = GEBPPacketFull>
|
|
25
24
|
class gebp_traits;
|
|
26
25
|
|
|
27
|
-
|
|
28
26
|
/** \internal \returns b if a<=0, and returns a otherwise. */
|
|
29
|
-
inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
|
|
30
|
-
{
|
|
31
|
-
return a<=0 ? b : a;
|
|
32
|
-
}
|
|
27
|
+
inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b) { return a <= 0 ? b : a; }
|
|
33
28
|
|
|
34
29
|
#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
|
|
35
30
|
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
|
|
36
31
|
#else
|
|
37
32
|
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
|
|
38
|
-
#endif
|
|
33
|
+
#endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
|
|
39
34
|
|
|
40
35
|
#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
|
|
41
36
|
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
|
|
42
37
|
#else
|
|
43
38
|
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
|
|
44
|
-
#endif
|
|
39
|
+
#endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
|
|
45
40
|
|
|
46
41
|
#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
|
|
47
42
|
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
|
|
48
43
|
#else
|
|
49
44
|
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
|
|
50
|
-
#endif
|
|
51
|
-
|
|
45
|
+
#endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
|
|
46
|
+
|
|
52
47
|
#if EIGEN_ARCH_i386_OR_x86_64
|
|
53
|
-
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024);
|
|
54
|
-
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024);
|
|
55
|
-
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024);
|
|
48
|
+
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32 * 1024);
|
|
49
|
+
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256 * 1024);
|
|
50
|
+
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2 * 1024 * 1024);
|
|
56
51
|
#elif EIGEN_ARCH_PPC
|
|
57
|
-
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024);
|
|
58
|
-
|
|
59
|
-
const std::ptrdiff_t
|
|
52
|
+
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64 * 1024);
|
|
53
|
+
#ifdef _ARCH_PWR10
|
|
54
|
+
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(2 * 1024 * 1024);
|
|
55
|
+
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(8 * 1024 * 1024);
|
|
56
|
+
#else
|
|
57
|
+
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512 * 1024);
|
|
58
|
+
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4 * 1024 * 1024);
|
|
59
|
+
#endif
|
|
60
60
|
#else
|
|
61
|
-
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024);
|
|
62
|
-
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
|
|
63
|
-
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024);
|
|
61
|
+
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16 * 1024);
|
|
62
|
+
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512 * 1024);
|
|
63
|
+
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512 * 1024);
|
|
64
64
|
#endif
|
|
65
65
|
|
|
66
66
|
#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
|
|
@@ -69,7 +69,7 @@ const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*10
|
|
|
69
69
|
|
|
70
70
|
/** \internal */
|
|
71
71
|
struct CacheSizes {
|
|
72
|
-
CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
|
|
72
|
+
CacheSizes() : m_l1(-1), m_l2(-1), m_l3(-1) {
|
|
73
73
|
int l1CacheSize, l2CacheSize, l3CacheSize;
|
|
74
74
|
queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
|
|
75
75
|
m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
|
|
@@ -83,27 +83,21 @@ struct CacheSizes {
|
|
|
83
83
|
};
|
|
84
84
|
|
|
85
85
|
/** \internal */
|
|
86
|
-
inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
|
|
87
|
-
{
|
|
86
|
+
inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) {
|
|
88
87
|
static CacheSizes m_cacheSizes;
|
|
89
88
|
|
|
90
|
-
if(action==SetAction)
|
|
91
|
-
{
|
|
89
|
+
if (action == SetAction) {
|
|
92
90
|
// set the cpu cache size and cache all block sizes from a global cache size in byte
|
|
93
|
-
eigen_internal_assert(l1!=0 && l2!=0);
|
|
91
|
+
eigen_internal_assert(l1 != 0 && l2 != 0);
|
|
94
92
|
m_cacheSizes.m_l1 = *l1;
|
|
95
93
|
m_cacheSizes.m_l2 = *l2;
|
|
96
94
|
m_cacheSizes.m_l3 = *l3;
|
|
97
|
-
}
|
|
98
|
-
|
|
99
|
-
{
|
|
100
|
-
eigen_internal_assert(l1!=0 && l2!=0);
|
|
95
|
+
} else if (action == GetAction) {
|
|
96
|
+
eigen_internal_assert(l1 != 0 && l2 != 0);
|
|
101
97
|
*l1 = m_cacheSizes.m_l1;
|
|
102
98
|
*l2 = m_cacheSizes.m_l2;
|
|
103
99
|
*l3 = m_cacheSizes.m_l3;
|
|
104
|
-
}
|
|
105
|
-
else
|
|
106
|
-
{
|
|
100
|
+
} else {
|
|
107
101
|
eigen_internal_assert(false);
|
|
108
102
|
}
|
|
109
103
|
}
|
|
@@ -120,10 +114,9 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff
|
|
|
120
114
|
*
|
|
121
115
|
* \sa setCpuCacheSizes */
|
|
122
116
|
|
|
123
|
-
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
|
|
124
|
-
void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
|
|
125
|
-
|
|
126
|
-
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
|
|
117
|
+
template <typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
|
|
118
|
+
void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1) {
|
|
119
|
+
typedef gebp_traits<LhsScalar, RhsScalar> Traits;
|
|
127
120
|
|
|
128
121
|
// Explanations:
|
|
129
122
|
// Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
|
|
@@ -132,7 +125,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
|
|
132
125
|
// at the register level. This small horizontal panel has to stay within L1 cache.
|
|
133
126
|
std::ptrdiff_t l1, l2, l3;
|
|
134
127
|
manage_caching_sizes(GetAction, &l1, &l2, &l3);
|
|
135
|
-
|
|
128
|
+
#ifdef EIGEN_VECTORIZE_AVX512
|
|
136
129
|
// We need to find a rationale for that, but without this adjustment,
|
|
137
130
|
// performance with AVX512 is pretty bad, like -20% slower.
|
|
138
131
|
// One reason is that with increasing packet-size, the blocking size k
|
|
@@ -141,13 +134,13 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
|
|
141
134
|
// k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144.
|
|
142
135
|
// This is quite small for a good reuse of the accumulation registers.
|
|
143
136
|
l1 *= 4;
|
|
144
|
-
|
|
137
|
+
#endif
|
|
145
138
|
|
|
146
139
|
if (num_threads > 1) {
|
|
147
140
|
typedef typename Traits::ResScalar ResScalar;
|
|
148
141
|
enum {
|
|
149
142
|
kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
|
|
150
|
-
ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
|
|
143
|
+
ksub = Traits::mr * (Traits::nr * sizeof(ResScalar)),
|
|
151
144
|
kr = 8,
|
|
152
145
|
mr = Traits::mr,
|
|
153
146
|
nr = Traits::nr
|
|
@@ -157,13 +150,13 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
|
|
157
150
|
// increasing the value of k, so we'll cap it at 320 (value determined
|
|
158
151
|
// experimentally).
|
|
159
152
|
// To avoid that k vanishes, we make k_cache at least as big as kr
|
|
160
|
-
const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
|
|
153
|
+
const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1 - ksub) / kdiv, 320));
|
|
161
154
|
if (k_cache < k) {
|
|
162
155
|
k = k_cache - (k_cache % kr);
|
|
163
156
|
eigen_internal_assert(k > 0);
|
|
164
157
|
}
|
|
165
158
|
|
|
166
|
-
const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
|
|
159
|
+
const Index n_cache = (l2 - l1) / (nr * sizeof(RhsScalar) * k);
|
|
167
160
|
const Index n_per_thread = numext::div_ceil(n, num_threads);
|
|
168
161
|
if (n_cache <= n_per_thread) {
|
|
169
162
|
// Don't exceed the capacity of the l2 cache.
|
|
@@ -176,37 +169,35 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
|
|
176
169
|
|
|
177
170
|
if (l3 > l2) {
|
|
178
171
|
// l3 is shared between all cores, so we'll give each thread its own chunk of l3.
|
|
179
|
-
const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
|
|
172
|
+
const Index m_cache = (l3 - l2) / (sizeof(LhsScalar) * k * num_threads);
|
|
180
173
|
const Index m_per_thread = numext::div_ceil(m, num_threads);
|
|
181
|
-
if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
|
|
174
|
+
if (m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
|
|
182
175
|
m = m_cache - (m_cache % mr);
|
|
183
176
|
eigen_internal_assert(m > 0);
|
|
184
177
|
} else {
|
|
185
178
|
m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
|
|
186
179
|
}
|
|
187
180
|
}
|
|
188
|
-
}
|
|
189
|
-
else {
|
|
181
|
+
} else {
|
|
190
182
|
// In unit tests we do not want to use extra large matrices,
|
|
191
183
|
// so we reduce the cache size to check the blocking strategy is not flawed
|
|
192
184
|
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
|
|
193
|
-
l1 = 9*1024;
|
|
194
|
-
l2 = 32*1024;
|
|
195
|
-
l3 = 512*1024;
|
|
185
|
+
l1 = 9 * 1024;
|
|
186
|
+
l2 = 32 * 1024;
|
|
187
|
+
l3 = 512 * 1024;
|
|
196
188
|
#endif
|
|
197
189
|
|
|
198
190
|
// Early return for small problems because the computation below are time consuming for small problems.
|
|
199
191
|
// Perhaps it would make more sense to consider k*n*m??
|
|
200
192
|
// Note that for very tiny problem, this function should be bypassed anyway
|
|
201
193
|
// because we use the coefficient-based implementation for them.
|
|
202
|
-
if((numext::maxi)(k,(numext::maxi)(m,n))<48)
|
|
203
|
-
return;
|
|
194
|
+
if ((numext::maxi)(k, (numext::maxi)(m, n)) < 48) return;
|
|
204
195
|
|
|
205
196
|
typedef typename Traits::ResScalar ResScalar;
|
|
206
197
|
enum {
|
|
207
198
|
k_peeling = 8,
|
|
208
199
|
k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
|
|
209
|
-
k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
|
|
200
|
+
k_sub = Traits::mr * (Traits::nr * sizeof(ResScalar))
|
|
210
201
|
};
|
|
211
202
|
|
|
212
203
|
// ---- 1st level of blocking on L1, yields kc ----
|
|
@@ -216,30 +207,29 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
|
|
216
207
|
// We also include a register-level block of the result (mx x nr).
|
|
217
208
|
// (In an ideal world only the lhs panel would stay in L1)
|
|
218
209
|
// Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
|
|
219
|
-
const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
|
|
210
|
+
const Index max_kc = numext::maxi<Index>(((l1 - k_sub) / k_div) & (~(k_peeling - 1)), 1);
|
|
220
211
|
const Index old_k = k;
|
|
221
|
-
if(k>max_kc)
|
|
222
|
-
{
|
|
212
|
+
if (k > max_kc) {
|
|
223
213
|
// We are really blocking on the third dimension:
|
|
224
214
|
// -> reduce blocking size to make sure the last block is as large as possible
|
|
225
215
|
// while keeping the same number of sweeps over the result.
|
|
226
|
-
k = (k%max_kc)==0 ? max_kc
|
|
227
|
-
|
|
216
|
+
k = (k % max_kc) == 0 ? max_kc
|
|
217
|
+
: max_kc - k_peeling * ((max_kc - 1 - (k % max_kc)) / (k_peeling * (k / max_kc + 1)));
|
|
228
218
|
|
|
229
|
-
eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
|
|
219
|
+
eigen_internal_assert(((old_k / k) == (old_k / max_kc)) && "the number of sweeps has to remain the same");
|
|
230
220
|
}
|
|
231
221
|
|
|
232
|
-
|
|
222
|
+
// ---- 2nd level of blocking on max(L2,L3), yields nc ----
|
|
233
223
|
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
224
|
+
// TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
|
|
225
|
+
// actual_l2 = max(l2, l3/nb_core_sharing_l3)
|
|
226
|
+
// The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it)
|
|
227
|
+
// For instance, it corresponds to 6MB of L3 shared among 4 cores.
|
|
228
|
+
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
|
|
239
229
|
const Index actual_l2 = l3;
|
|
240
|
-
|
|
241
|
-
const Index actual_l2 = 1572864;
|
|
242
|
-
|
|
230
|
+
#else
|
|
231
|
+
const Index actual_l2 = 1572864; // == 1.5 MB
|
|
232
|
+
#endif
|
|
243
233
|
|
|
244
234
|
// Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2.
|
|
245
235
|
// The second half is implicitly reserved to access the result and lhs coefficients.
|
|
@@ -249,61 +239,52 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
|
|
249
239
|
// and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
|
|
250
240
|
Index max_nc;
|
|
251
241
|
const Index lhs_bytes = m * k * sizeof(LhsScalar);
|
|
252
|
-
const Index remaining_l1 = l1- k_sub - lhs_bytes;
|
|
253
|
-
if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
|
|
254
|
-
{
|
|
242
|
+
const Index remaining_l1 = l1 - k_sub - lhs_bytes;
|
|
243
|
+
if (remaining_l1 >= Index(Traits::nr * sizeof(RhsScalar)) * k) {
|
|
255
244
|
// L1 blocking
|
|
256
|
-
max_nc = remaining_l1 / (k*sizeof(RhsScalar));
|
|
257
|
-
}
|
|
258
|
-
else
|
|
259
|
-
{
|
|
245
|
+
max_nc = remaining_l1 / (k * sizeof(RhsScalar));
|
|
246
|
+
} else {
|
|
260
247
|
// L2 blocking
|
|
261
|
-
max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
|
|
248
|
+
max_nc = (3 * actual_l2) / (2 * 2 * max_kc * sizeof(RhsScalar));
|
|
262
249
|
}
|
|
263
250
|
// WARNING Below, we assume that Traits::nr is a power of two.
|
|
264
|
-
Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
|
|
265
|
-
if(n>nc)
|
|
266
|
-
{
|
|
251
|
+
Index nc = numext::mini<Index>(actual_l2 / (2 * k * sizeof(RhsScalar)), max_nc) & (~(Traits::nr - 1));
|
|
252
|
+
if (n > nc) {
|
|
267
253
|
// We are really blocking over the columns:
|
|
268
254
|
// -> reduce blocking size to make sure the last block is as large as possible
|
|
269
255
|
// while keeping the same number of sweeps over the packed lhs.
|
|
270
256
|
// Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
|
|
271
|
-
n = (n%nc)==0 ? nc
|
|
272
|
-
|
|
273
|
-
}
|
|
274
|
-
else if(old_k==k)
|
|
275
|
-
{
|
|
257
|
+
n = (n % nc) == 0 ? nc : (nc - Traits::nr * ((nc /*-1*/ - (n % nc)) / (Traits::nr * (n / nc + 1))));
|
|
258
|
+
} else if (old_k == k) {
|
|
276
259
|
// So far, no blocking at all, i.e., kc==k, and nc==n.
|
|
277
260
|
// In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
|
|
278
|
-
// TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic
|
|
279
|
-
|
|
261
|
+
// TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic
|
|
262
|
+
// here should be obsolete.
|
|
263
|
+
Index problem_size = k * n * sizeof(LhsScalar);
|
|
280
264
|
Index actual_lm = actual_l2;
|
|
281
265
|
Index max_mc = m;
|
|
282
|
-
if(problem_size<=1024)
|
|
283
|
-
{
|
|
266
|
+
if (problem_size <= 1024) {
|
|
284
267
|
// problem is small enough to keep in L1
|
|
285
268
|
// Let's choose m such that lhs's block fit in 1/3 of L1
|
|
286
269
|
actual_lm = l1;
|
|
287
|
-
}
|
|
288
|
-
else if(l3!=0 && problem_size<=32768)
|
|
289
|
-
{
|
|
270
|
+
} else if (l3 != 0 && problem_size <= 32768) {
|
|
290
271
|
// we have both L2 and L3, and problem is small enough to be kept in L2
|
|
291
272
|
// Let's choose m such that lhs's block fit in 1/3 of L2
|
|
292
273
|
actual_lm = l2;
|
|
293
|
-
max_mc = (numext::mini<Index>)(576,max_mc);
|
|
274
|
+
max_mc = (numext::mini<Index>)(576, max_mc);
|
|
294
275
|
}
|
|
295
|
-
Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
|
|
296
|
-
if (mc > Traits::mr)
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
276
|
+
Index mc = (numext::mini<Index>)(actual_lm / (3 * k * sizeof(LhsScalar)), max_mc);
|
|
277
|
+
if (mc > Traits::mr)
|
|
278
|
+
mc -= mc % Traits::mr;
|
|
279
|
+
else if (mc == 0)
|
|
280
|
+
return;
|
|
281
|
+
m = (m % mc) == 0 ? mc : (mc - Traits::mr * ((mc /*-1*/ - (m % mc)) / (Traits::mr * (m / mc + 1))));
|
|
300
282
|
}
|
|
301
283
|
}
|
|
302
284
|
}
|
|
303
285
|
|
|
304
286
|
template <typename Index>
|
|
305
|
-
inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
|
|
306
|
-
{
|
|
287
|
+
inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) {
|
|
307
288
|
#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
|
|
308
289
|
if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
|
|
309
290
|
k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
|
|
@@ -320,46 +301,47 @@ inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
|
|
|
320
301
|
}
|
|
321
302
|
|
|
322
303
|
/** \brief Computes the blocking parameters for a m x k times k x n matrix product
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
304
|
+
*
|
|
305
|
+
* \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
|
|
306
|
+
* \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
|
|
307
|
+
* \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same
|
|
308
|
+
* dimension.
|
|
309
|
+
* \param[in] num_threads Input: the number of threads used for the computation.
|
|
310
|
+
*
|
|
311
|
+
* Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
|
|
312
|
+
* this function computes the blocking size parameters along the respective dimensions
|
|
313
|
+
* for matrix products and related algorithms.
|
|
314
|
+
*
|
|
315
|
+
* The blocking size parameters may be evaluated:
|
|
316
|
+
* - either by a heuristic based on cache sizes;
|
|
317
|
+
* - or using fixed prescribed values (for testing purposes).
|
|
318
|
+
*
|
|
319
|
+
* \sa setCpuCacheSizes */
|
|
320
|
+
|
|
321
|
+
template <typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
|
|
322
|
+
void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) {
|
|
341
323
|
if (!useSpecificBlockingSizes(k, m, n)) {
|
|
342
324
|
evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
|
|
343
325
|
}
|
|
344
326
|
}
|
|
345
327
|
|
|
346
|
-
template<typename LhsScalar, typename RhsScalar, typename Index>
|
|
347
|
-
inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
|
|
348
|
-
|
|
349
|
-
computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
|
|
328
|
+
template <typename LhsScalar, typename RhsScalar, typename Index>
|
|
329
|
+
inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) {
|
|
330
|
+
computeProductBlockingSizes<LhsScalar, RhsScalar, 1, Index>(k, m, n, num_threads);
|
|
350
331
|
}
|
|
351
332
|
|
|
352
333
|
template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
|
|
353
334
|
struct RhsPanelHelper {
|
|
354
335
|
private:
|
|
355
|
-
static
|
|
336
|
+
static constexpr int remaining_registers =
|
|
337
|
+
(std::max)(int(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS) - registers_taken, 0);
|
|
338
|
+
|
|
356
339
|
public:
|
|
357
|
-
typedef
|
|
340
|
+
typedef std::conditional_t<remaining_registers >= 4, RhsPacketx4, RhsPacket> type;
|
|
358
341
|
};
|
|
359
342
|
|
|
360
343
|
template <typename Packet>
|
|
361
|
-
struct QuadPacket
|
|
362
|
-
{
|
|
344
|
+
struct QuadPacket {
|
|
363
345
|
Packet B_0, B1, B2, B3;
|
|
364
346
|
const Packet& get(const FixedInt<0>&) const { return B_0; }
|
|
365
347
|
const Packet& get(const FixedInt<1>&) const { return B1; }
|
|
@@ -368,329 +350,295 @@ struct QuadPacket
|
|
|
368
350
|
};
|
|
369
351
|
|
|
370
352
|
template <int N, typename T1, typename T2, typename T3>
|
|
371
|
-
struct packet_conditional {
|
|
353
|
+
struct packet_conditional {
|
|
354
|
+
typedef T3 type;
|
|
355
|
+
};
|
|
372
356
|
|
|
373
357
|
template <typename T1, typename T2, typename T3>
|
|
374
|
-
struct packet_conditional<GEBPPacketFull, T1, T2, T3> {
|
|
358
|
+
struct packet_conditional<GEBPPacketFull, T1, T2, T3> {
|
|
359
|
+
typedef T1 type;
|
|
360
|
+
};
|
|
375
361
|
|
|
376
362
|
template <typename T1, typename T2, typename T3>
|
|
377
|
-
struct packet_conditional<GEBPPacketHalf, T1, T2, T3> {
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
#define PACKET_DECL_COND(name, packet_size)
|
|
387
|
-
typedef typename packet_conditional<
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
#define PACKET_DECL_COND_SCALAR(packet_size) \
|
|
401
|
-
typedef typename packet_conditional<packet_size, \
|
|
402
|
-
typename packet_traits<Scalar>::type, \
|
|
403
|
-
typename packet_traits<Scalar>::half, \
|
|
404
|
-
typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
|
|
405
|
-
ScalarPacket
|
|
363
|
+
struct packet_conditional<GEBPPacketHalf, T1, T2, T3> {
|
|
364
|
+
typedef T2 type;
|
|
365
|
+
};
|
|
366
|
+
|
|
367
|
+
#define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size) \
|
|
368
|
+
typedef typename packet_conditional< \
|
|
369
|
+
packet_size, typename packet_traits<name##Scalar>::type, typename packet_traits<name##Scalar>::half, \
|
|
370
|
+
typename unpacket_traits<typename packet_traits<name##Scalar>::half>::half>::type name##Packet##postfix
|
|
371
|
+
|
|
372
|
+
#define PACKET_DECL_COND(name, packet_size) \
|
|
373
|
+
typedef typename packet_conditional< \
|
|
374
|
+
packet_size, typename packet_traits<name##Scalar>::type, typename packet_traits<name##Scalar>::half, \
|
|
375
|
+
typename unpacket_traits<typename packet_traits<name##Scalar>::half>::half>::type name##Packet
|
|
376
|
+
|
|
377
|
+
#define PACKET_DECL_COND_SCALAR_POSTFIX(postfix, packet_size) \
|
|
378
|
+
typedef typename packet_conditional< \
|
|
379
|
+
packet_size, typename packet_traits<Scalar>::type, typename packet_traits<Scalar>::half, \
|
|
380
|
+
typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type ScalarPacket##postfix
|
|
381
|
+
|
|
382
|
+
#define PACKET_DECL_COND_SCALAR(packet_size) \
|
|
383
|
+
typedef typename packet_conditional< \
|
|
384
|
+
packet_size, typename packet_traits<Scalar>::type, typename packet_traits<Scalar>::half, \
|
|
385
|
+
typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type ScalarPacket
|
|
406
386
|
|
|
407
387
|
/* Vectorization logic
|
|
408
388
|
* real*real: unpack rhs to constant packets, ...
|
|
409
|
-
*
|
|
389
|
+
*
|
|
410
390
|
* cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
|
|
411
391
|
* storing each res packet into two packets (2x2),
|
|
412
|
-
* at the end combine them: swap the second and addsub them
|
|
392
|
+
* at the end combine them: swap the second and addsub them
|
|
413
393
|
* cf*cf : same but with 2x4 blocks
|
|
414
394
|
* cplx*real : unpack rhs to constant packets, ...
|
|
415
395
|
* real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
|
|
416
396
|
*/
|
|
417
|
-
template<typename
|
|
418
|
-
class gebp_traits
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
typedef
|
|
422
|
-
typedef _RhsScalar RhsScalar;
|
|
397
|
+
template <typename LhsScalar_, typename RhsScalar_, bool ConjLhs_, bool ConjRhs_, int Arch, int PacketSize_>
|
|
398
|
+
class gebp_traits {
|
|
399
|
+
public:
|
|
400
|
+
typedef LhsScalar_ LhsScalar;
|
|
401
|
+
typedef RhsScalar_ RhsScalar;
|
|
423
402
|
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
|
424
403
|
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
404
|
+
PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
|
|
405
|
+
PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
|
|
406
|
+
PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
|
|
428
407
|
|
|
429
408
|
enum {
|
|
430
|
-
ConjLhs =
|
|
431
|
-
ConjRhs =
|
|
432
|
-
Vectorizable = unpacket_traits<
|
|
433
|
-
LhsPacketSize = Vectorizable ? unpacket_traits<
|
|
434
|
-
RhsPacketSize = Vectorizable ? unpacket_traits<
|
|
435
|
-
ResPacketSize = Vectorizable ? unpacket_traits<
|
|
436
|
-
|
|
409
|
+
ConjLhs = ConjLhs_,
|
|
410
|
+
ConjRhs = ConjRhs_,
|
|
411
|
+
Vectorizable = unpacket_traits<LhsPacket_>::vectorizable && unpacket_traits<RhsPacket_>::vectorizable,
|
|
412
|
+
LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
|
|
413
|
+
RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
|
|
414
|
+
ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,
|
|
415
|
+
|
|
437
416
|
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
|
|
438
417
|
|
|
439
418
|
// register block size along the N direction must be 1 or 4
|
|
440
419
|
nr = 4,
|
|
441
420
|
|
|
442
421
|
// register block size along the M direction (currently, this one cannot be modified)
|
|
443
|
-
default_mr = (
|
|
444
|
-
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) &&
|
|
445
|
-
&& ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914))
|
|
422
|
+
default_mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * LhsPacketSize,
|
|
423
|
+
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && \
|
|
424
|
+
!defined(EIGEN_VECTORIZE_VSX) && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC >= 1914))
|
|
446
425
|
// we assume 16 registers or more
|
|
447
426
|
// See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
|
|
448
427
|
// then using 3*LhsPacketSize triggers non-implemented paths in syrk.
|
|
449
428
|
// Bug 1515: MSVC prior to v19.14 yields to register spilling.
|
|
450
|
-
mr = Vectorizable ? 3*LhsPacketSize : default_mr,
|
|
429
|
+
mr = Vectorizable ? 3 * LhsPacketSize : default_mr,
|
|
451
430
|
#else
|
|
452
431
|
mr = default_mr,
|
|
453
432
|
#endif
|
|
454
|
-
|
|
433
|
+
|
|
455
434
|
LhsProgress = LhsPacketSize,
|
|
456
435
|
RhsProgress = 1
|
|
457
436
|
};
|
|
458
437
|
|
|
459
|
-
|
|
460
|
-
typedef
|
|
461
|
-
typedef
|
|
462
|
-
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
|
|
438
|
+
typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
|
|
439
|
+
typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
|
|
440
|
+
typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
|
|
463
441
|
typedef LhsPacket LhsPacket4Packing;
|
|
464
442
|
|
|
465
443
|
typedef QuadPacket<RhsPacket> RhsPacketx4;
|
|
466
444
|
typedef ResPacket AccPacket;
|
|
467
|
-
|
|
468
|
-
EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
|
|
469
|
-
{
|
|
470
|
-
p = pset1<ResPacket>(ResScalar(0));
|
|
471
|
-
}
|
|
472
445
|
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
446
|
+
EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }
|
|
447
|
+
|
|
448
|
+
template <typename RhsPacketType>
|
|
449
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const {
|
|
476
450
|
dest = pset1<RhsPacketType>(*b);
|
|
477
451
|
}
|
|
478
452
|
|
|
479
|
-
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
|
|
480
|
-
{
|
|
453
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
|
|
481
454
|
pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
|
|
482
455
|
}
|
|
483
456
|
|
|
484
|
-
template<typename RhsPacketType>
|
|
485
|
-
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
|
|
486
|
-
{
|
|
457
|
+
template <typename RhsPacketType>
|
|
458
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const {
|
|
487
459
|
loadRhs(b, dest);
|
|
488
460
|
}
|
|
489
461
|
|
|
490
|
-
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
|
|
491
|
-
{
|
|
492
|
-
}
|
|
462
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
|
|
493
463
|
|
|
494
|
-
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
|
|
495
|
-
{
|
|
496
|
-
dest = ploadquad<RhsPacket>(b);
|
|
497
|
-
}
|
|
464
|
+
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad<RhsPacket>(b); }
|
|
498
465
|
|
|
499
|
-
template<typename LhsPacketType>
|
|
500
|
-
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
|
|
501
|
-
{
|
|
466
|
+
template <typename LhsPacketType>
|
|
467
|
+
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const {
|
|
502
468
|
dest = pload<LhsPacketType>(a);
|
|
503
469
|
}
|
|
504
470
|
|
|
505
|
-
template<typename LhsPacketType>
|
|
506
|
-
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
|
|
507
|
-
{
|
|
471
|
+
template <typename LhsPacketType>
|
|
472
|
+
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const {
|
|
508
473
|
dest = ploadu<LhsPacketType>(a);
|
|
509
474
|
}
|
|
510
475
|
|
|
511
|
-
template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
512
|
-
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
|
|
513
|
-
|
|
514
|
-
conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
|
|
476
|
+
template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
477
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
|
|
478
|
+
const LaneIdType&) const {
|
|
479
|
+
conj_helper<LhsPacketType, RhsPacketType, ConjLhs, ConjRhs> cj;
|
|
515
480
|
// It would be a lot cleaner to call pmadd all the time. Unfortunately if we
|
|
516
481
|
// let gcc allocate the register in which to store the result of the pmul
|
|
517
482
|
// (in the case where there is no FMA) gcc fails to figure out how to avoid
|
|
518
483
|
// spilling register.
|
|
519
484
|
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
|
520
485
|
EIGEN_UNUSED_VARIABLE(tmp);
|
|
521
|
-
c = cj.pmadd(a,b,c);
|
|
486
|
+
c = cj.pmadd(a, b, c);
|
|
522
487
|
#else
|
|
523
|
-
tmp = b;
|
|
488
|
+
tmp = b;
|
|
489
|
+
tmp = cj.pmul(a, tmp);
|
|
490
|
+
c = padd(c, tmp);
|
|
524
491
|
#endif
|
|
525
492
|
}
|
|
526
493
|
|
|
527
|
-
template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
528
|
-
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
|
|
529
|
-
|
|
494
|
+
template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
495
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
|
|
496
|
+
const LaneIdType& lane) const {
|
|
530
497
|
madd(a, b.get(lane), c, tmp, lane);
|
|
531
498
|
}
|
|
532
499
|
|
|
533
|
-
EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
|
|
534
|
-
|
|
535
|
-
r = pmadd(c,alpha,r);
|
|
536
|
-
}
|
|
537
|
-
|
|
538
|
-
template<typename ResPacketHalf>
|
|
539
|
-
EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
|
|
540
|
-
{
|
|
541
|
-
r = pmadd(c,alpha,r);
|
|
500
|
+
EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const {
|
|
501
|
+
r = pmadd(c, alpha, r);
|
|
542
502
|
}
|
|
543
503
|
|
|
504
|
+
template <typename ResPacketHalf>
|
|
505
|
+
EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const {
|
|
506
|
+
r = pmadd(c, alpha, r);
|
|
507
|
+
}
|
|
544
508
|
};
|
|
545
509
|
|
|
546
|
-
template<typename RealScalar, bool
|
|
547
|
-
class gebp_traits<std::complex<RealScalar>, RealScalar,
|
|
548
|
-
|
|
549
|
-
public:
|
|
510
|
+
template <typename RealScalar, bool ConjLhs_, int Arch, int PacketSize_>
|
|
511
|
+
class gebp_traits<std::complex<RealScalar>, RealScalar, ConjLhs_, false, Arch, PacketSize_> {
|
|
512
|
+
public:
|
|
550
513
|
typedef std::complex<RealScalar> LhsScalar;
|
|
551
514
|
typedef RealScalar RhsScalar;
|
|
552
515
|
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
|
553
516
|
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
517
|
+
PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
|
|
518
|
+
PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
|
|
519
|
+
PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
|
|
557
520
|
|
|
558
521
|
enum {
|
|
559
|
-
ConjLhs =
|
|
522
|
+
ConjLhs = ConjLhs_,
|
|
560
523
|
ConjRhs = false,
|
|
561
|
-
Vectorizable = unpacket_traits<
|
|
562
|
-
LhsPacketSize = Vectorizable ? unpacket_traits<
|
|
563
|
-
RhsPacketSize = Vectorizable ? unpacket_traits<
|
|
564
|
-
ResPacketSize = Vectorizable ? unpacket_traits<
|
|
565
|
-
|
|
524
|
+
Vectorizable = unpacket_traits<LhsPacket_>::vectorizable && unpacket_traits<RhsPacket_>::vectorizable,
|
|
525
|
+
LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
|
|
526
|
+
RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
|
|
527
|
+
ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,
|
|
528
|
+
|
|
566
529
|
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
|
|
567
530
|
nr = 4,
|
|
568
531
|
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
|
|
569
532
|
// we assume 16 registers
|
|
570
|
-
mr = 3*LhsPacketSize,
|
|
533
|
+
mr = 3 * LhsPacketSize,
|
|
571
534
|
#else
|
|
572
|
-
mr = (
|
|
535
|
+
mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * LhsPacketSize,
|
|
573
536
|
#endif
|
|
574
537
|
|
|
575
538
|
LhsProgress = LhsPacketSize,
|
|
576
539
|
RhsProgress = 1
|
|
577
540
|
};
|
|
578
541
|
|
|
579
|
-
typedef
|
|
580
|
-
typedef
|
|
581
|
-
typedef
|
|
542
|
+
typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
|
|
543
|
+
typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
|
|
544
|
+
typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
|
|
582
545
|
typedef LhsPacket LhsPacket4Packing;
|
|
583
546
|
|
|
584
547
|
typedef QuadPacket<RhsPacket> RhsPacketx4;
|
|
585
548
|
|
|
586
549
|
typedef ResPacket AccPacket;
|
|
587
550
|
|
|
588
|
-
EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
|
|
589
|
-
{
|
|
590
|
-
p = pset1<ResPacket>(ResScalar(0));
|
|
591
|
-
}
|
|
551
|
+
EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }
|
|
592
552
|
|
|
593
|
-
template<typename RhsPacketType>
|
|
594
|
-
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
|
|
595
|
-
{
|
|
553
|
+
template <typename RhsPacketType>
|
|
554
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const {
|
|
596
555
|
dest = pset1<RhsPacketType>(*b);
|
|
597
556
|
}
|
|
598
557
|
|
|
599
|
-
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
|
|
600
|
-
{
|
|
558
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
|
|
601
559
|
pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
|
|
602
560
|
}
|
|
603
561
|
|
|
604
|
-
template<typename RhsPacketType>
|
|
605
|
-
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
|
|
606
|
-
{
|
|
562
|
+
template <typename RhsPacketType>
|
|
563
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const {
|
|
607
564
|
loadRhs(b, dest);
|
|
608
565
|
}
|
|
609
566
|
|
|
610
|
-
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
{
|
|
615
|
-
loadRhsQuad_impl(b,dest, typename conditional<RhsPacketSize==16,true_type,false_type>::type());
|
|
567
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
|
|
568
|
+
|
|
569
|
+
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const {
|
|
570
|
+
loadRhsQuad_impl(b, dest, std::conditional_t<RhsPacketSize == 16, true_type, false_type>());
|
|
616
571
|
}
|
|
617
572
|
|
|
618
|
-
EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const
|
|
619
|
-
{
|
|
573
|
+
EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const {
|
|
620
574
|
// FIXME we can do better!
|
|
621
575
|
// what we want here is a ploadheight
|
|
622
|
-
RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
|
|
576
|
+
RhsScalar tmp[4] = {b[0], b[0], b[1], b[1]};
|
|
623
577
|
dest = ploadquad<RhsPacket>(tmp);
|
|
624
578
|
}
|
|
625
579
|
|
|
626
|
-
EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const
|
|
627
|
-
|
|
628
|
-
eigen_internal_assert(RhsPacketSize<=8);
|
|
580
|
+
EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const {
|
|
581
|
+
eigen_internal_assert(RhsPacketSize <= 8);
|
|
629
582
|
dest = pset1<RhsPacket>(*b);
|
|
630
583
|
}
|
|
631
584
|
|
|
632
|
-
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
|
|
633
|
-
{
|
|
634
|
-
dest = pload<LhsPacket>(a);
|
|
635
|
-
}
|
|
585
|
+
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = pload<LhsPacket>(a); }
|
|
636
586
|
|
|
637
|
-
template<typename LhsPacketType>
|
|
638
|
-
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
|
|
639
|
-
{
|
|
587
|
+
template <typename LhsPacketType>
|
|
588
|
+
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const {
|
|
640
589
|
dest = ploadu<LhsPacketType>(a);
|
|
641
590
|
}
|
|
642
591
|
|
|
643
592
|
template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
644
|
-
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
|
|
645
|
-
|
|
646
|
-
madd_impl(a, b, c, tmp,
|
|
593
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
|
|
594
|
+
const LaneIdType&) const {
|
|
595
|
+
madd_impl(a, b, c, tmp, std::conditional_t<Vectorizable, true_type, false_type>());
|
|
647
596
|
}
|
|
648
597
|
|
|
649
598
|
template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
|
|
650
|
-
EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c,
|
|
651
|
-
|
|
599
|
+
EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c,
|
|
600
|
+
RhsPacketType& tmp, const true_type&) const {
|
|
652
601
|
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
|
653
602
|
EIGEN_UNUSED_VARIABLE(tmp);
|
|
654
|
-
c.v = pmadd(a.v,b,c.v);
|
|
603
|
+
c.v = pmadd(a.v, b, c.v);
|
|
655
604
|
#else
|
|
656
|
-
tmp = b;
|
|
605
|
+
tmp = b;
|
|
606
|
+
tmp = pmul(a.v, tmp);
|
|
607
|
+
c.v = padd(c.v, tmp);
|
|
657
608
|
#endif
|
|
658
609
|
}
|
|
659
610
|
|
|
660
|
-
EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/,
|
|
661
|
-
|
|
611
|
+
EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/,
|
|
612
|
+
const false_type&) const {
|
|
662
613
|
c += a * b;
|
|
663
614
|
}
|
|
664
615
|
|
|
665
|
-
template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
666
|
-
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
|
|
667
|
-
|
|
616
|
+
template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
617
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
|
|
618
|
+
const LaneIdType& lane) const {
|
|
668
619
|
madd(a, b.get(lane), c, tmp, lane);
|
|
669
620
|
}
|
|
670
621
|
|
|
671
622
|
template <typename ResPacketType, typename AccPacketType>
|
|
672
|
-
EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
r = cj.pmadd(c,alpha,r);
|
|
623
|
+
EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const {
|
|
624
|
+
conj_helper<ResPacketType, ResPacketType, ConjLhs, false> cj;
|
|
625
|
+
r = cj.pmadd(c, alpha, r);
|
|
676
626
|
}
|
|
677
627
|
|
|
678
|
-
protected:
|
|
628
|
+
protected:
|
|
679
629
|
};
|
|
680
630
|
|
|
681
|
-
template<typename Packet>
|
|
682
|
-
struct DoublePacket
|
|
683
|
-
{
|
|
631
|
+
template <typename Packet>
|
|
632
|
+
struct DoublePacket {
|
|
684
633
|
Packet first;
|
|
685
634
|
Packet second;
|
|
686
635
|
};
|
|
687
636
|
|
|
688
|
-
template<typename Packet>
|
|
689
|
-
DoublePacket<Packet> padd(const DoublePacket<Packet
|
|
690
|
-
{
|
|
637
|
+
template <typename Packet>
|
|
638
|
+
DoublePacket<Packet> padd(const DoublePacket<Packet>& a, const DoublePacket<Packet>& b) {
|
|
691
639
|
DoublePacket<Packet> res;
|
|
692
|
-
res.first
|
|
693
|
-
res.second = padd(a.second,b.second);
|
|
640
|
+
res.first = padd(a.first, b.first);
|
|
641
|
+
res.second = padd(a.second, b.second);
|
|
694
642
|
return res;
|
|
695
643
|
}
|
|
696
644
|
|
|
@@ -698,52 +646,47 @@ DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Pack
|
|
|
698
646
|
// corresponds to the number of complexes, so it means "8"
|
|
699
647
|
// it terms of real coefficients.
|
|
700
648
|
|
|
701
|
-
template<typename Packet>
|
|
702
|
-
const DoublePacket<Packet>&
|
|
703
|
-
|
|
704
|
-
typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0)
|
|
705
|
-
{
|
|
649
|
+
template <typename Packet>
|
|
650
|
+
const DoublePacket<Packet>& predux_half_dowto4(const DoublePacket<Packet>& a,
|
|
651
|
+
std::enable_if_t<unpacket_traits<Packet>::size <= 8>* = 0) {
|
|
706
652
|
return a;
|
|
707
653
|
}
|
|
708
654
|
|
|
709
|
-
template<typename Packet>
|
|
710
|
-
DoublePacket<typename unpacket_traits<Packet>::half>
|
|
711
|
-
|
|
712
|
-
typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0)
|
|
713
|
-
{
|
|
655
|
+
template <typename Packet>
|
|
656
|
+
DoublePacket<typename unpacket_traits<Packet>::half> predux_half_dowto4(
|
|
657
|
+
const DoublePacket<Packet>& a, std::enable_if_t<unpacket_traits<Packet>::size == 16>* = 0) {
|
|
714
658
|
// yes, that's pretty hackish :(
|
|
715
659
|
DoublePacket<typename unpacket_traits<Packet>::half> res;
|
|
716
660
|
typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
|
|
717
661
|
typedef typename packet_traits<Cplx>::type CplxPacket;
|
|
718
|
-
res.first
|
|
662
|
+
res.first = predux_half_dowto4(CplxPacket(a.first)).v;
|
|
719
663
|
res.second = predux_half_dowto4(CplxPacket(a.second)).v;
|
|
720
664
|
return res;
|
|
721
665
|
}
|
|
722
666
|
|
|
723
667
|
// same here, "quad" actually means "8" in terms of real coefficients
|
|
724
|
-
template<typename Scalar, typename RealPacket>
|
|
668
|
+
template <typename Scalar, typename RealPacket>
|
|
725
669
|
void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
dest.first = pset1<RealPacket>(numext::real(*b));
|
|
670
|
+
std::enable_if_t<unpacket_traits<RealPacket>::size <= 8>* = 0) {
|
|
671
|
+
dest.first = pset1<RealPacket>(numext::real(*b));
|
|
729
672
|
dest.second = pset1<RealPacket>(numext::imag(*b));
|
|
730
673
|
}
|
|
731
674
|
|
|
732
|
-
template<typename Scalar, typename RealPacket>
|
|
675
|
+
template <typename Scalar, typename RealPacket>
|
|
733
676
|
void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
|
|
734
|
-
|
|
735
|
-
{
|
|
677
|
+
std::enable_if_t<unpacket_traits<RealPacket>::size == 16>* = 0) {
|
|
736
678
|
// yes, that's pretty hackish too :(
|
|
737
679
|
typedef typename NumTraits<Scalar>::Real RealScalar;
|
|
738
680
|
RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
|
|
739
681
|
RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
|
|
740
|
-
dest.first
|
|
682
|
+
dest.first = ploadquad<RealPacket>(r);
|
|
741
683
|
dest.second = ploadquad<RealPacket>(i);
|
|
742
684
|
}
|
|
743
685
|
|
|
744
|
-
|
|
745
|
-
|
|
686
|
+
template <typename Packet>
|
|
687
|
+
struct unpacket_traits<DoublePacket<Packet> > {
|
|
746
688
|
typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
|
|
689
|
+
enum { size = 2 * unpacket_traits<Packet>::size };
|
|
747
690
|
};
|
|
748
691
|
// template<typename Packet>
|
|
749
692
|
// DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
|
|
@@ -754,74 +697,66 @@ template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > {
|
|
|
754
697
|
// return res;
|
|
755
698
|
// }
|
|
756
699
|
|
|
757
|
-
template<typename RealScalar, bool
|
|
758
|
-
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>,
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
typedef std::complex<RealScalar>
|
|
762
|
-
typedef std::complex<RealScalar>
|
|
763
|
-
typedef std::complex<RealScalar>
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
PACKET_DECL_COND_SCALAR(_PacketSize);
|
|
700
|
+
template <typename RealScalar, bool ConjLhs_, bool ConjRhs_, int Arch, int PacketSize_>
|
|
701
|
+
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, ConjLhs_, ConjRhs_, Arch, PacketSize_> {
|
|
702
|
+
public:
|
|
703
|
+
typedef std::complex<RealScalar> Scalar;
|
|
704
|
+
typedef std::complex<RealScalar> LhsScalar;
|
|
705
|
+
typedef std::complex<RealScalar> RhsScalar;
|
|
706
|
+
typedef std::complex<RealScalar> ResScalar;
|
|
707
|
+
|
|
708
|
+
PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
|
|
709
|
+
PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
|
|
710
|
+
PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
|
|
711
|
+
PACKET_DECL_COND(Real, PacketSize_);
|
|
712
|
+
PACKET_DECL_COND_SCALAR(PacketSize_);
|
|
771
713
|
|
|
772
714
|
enum {
|
|
773
|
-
ConjLhs =
|
|
774
|
-
ConjRhs =
|
|
775
|
-
Vectorizable = unpacket_traits<RealPacket>::vectorizable
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
|
|
715
|
+
ConjLhs = ConjLhs_,
|
|
716
|
+
ConjRhs = ConjRhs_,
|
|
717
|
+
Vectorizable = unpacket_traits<RealPacket>::vectorizable && unpacket_traits<ScalarPacket>::vectorizable,
|
|
718
|
+
ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,
|
|
719
|
+
LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
|
|
779
720
|
RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
|
|
780
|
-
RealPacketSize
|
|
721
|
+
RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
|
|
722
|
+
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
|
|
781
723
|
|
|
782
|
-
// FIXME: should depend on NumberOfRegisters
|
|
783
724
|
nr = 4,
|
|
784
|
-
mr = ResPacketSize,
|
|
725
|
+
mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * ResPacketSize,
|
|
785
726
|
|
|
786
727
|
LhsProgress = ResPacketSize,
|
|
787
728
|
RhsProgress = 1
|
|
788
729
|
};
|
|
789
|
-
|
|
790
|
-
typedef DoublePacket<RealPacket> DoublePacketType;
|
|
791
730
|
|
|
792
|
-
typedef
|
|
793
|
-
typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket;
|
|
794
|
-
typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
|
|
795
|
-
typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
|
|
796
|
-
typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
|
|
731
|
+
typedef DoublePacket<RealPacket> DoublePacketType;
|
|
797
732
|
|
|
798
|
-
|
|
733
|
+
typedef std::conditional_t<Vectorizable, ScalarPacket, Scalar> LhsPacket4Packing;
|
|
734
|
+
typedef std::conditional_t<Vectorizable, RealPacket, Scalar> LhsPacket;
|
|
735
|
+
typedef std::conditional_t<Vectorizable, DoublePacketType, Scalar> RhsPacket;
|
|
736
|
+
typedef std::conditional_t<Vectorizable, ScalarPacket, Scalar> ResPacket;
|
|
737
|
+
typedef std::conditional_t<Vectorizable, DoublePacketType, Scalar> AccPacket;
|
|
738
|
+
|
|
739
|
+
// this actually holds 8 packets!
|
|
799
740
|
typedef QuadPacket<RhsPacket> RhsPacketx4;
|
|
800
|
-
|
|
741
|
+
|
|
801
742
|
EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
|
|
802
743
|
|
|
803
|
-
EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
|
|
804
|
-
|
|
805
|
-
p.
|
|
806
|
-
p.second = pset1<RealPacket>(RealScalar(0));
|
|
744
|
+
EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p) {
|
|
745
|
+
p.first = pset1<RealPacket>(RealScalar(0));
|
|
746
|
+
p.second = pset1<RealPacket>(RealScalar(0));
|
|
807
747
|
}
|
|
808
748
|
|
|
809
749
|
// Scalar path
|
|
810
|
-
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const
|
|
811
|
-
{
|
|
812
|
-
dest = pset1<ScalarPacket>(*b);
|
|
813
|
-
}
|
|
750
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const { dest = pset1<ScalarPacket>(*b); }
|
|
814
751
|
|
|
815
752
|
// Vectorized path
|
|
816
|
-
template<typename RealPacketType>
|
|
817
|
-
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
|
|
818
|
-
|
|
819
|
-
dest.first = pset1<RealPacketType>(numext::real(*b));
|
|
753
|
+
template <typename RealPacketType>
|
|
754
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const {
|
|
755
|
+
dest.first = pset1<RealPacketType>(numext::real(*b));
|
|
820
756
|
dest.second = pset1<RealPacketType>(numext::imag(*b));
|
|
821
757
|
}
|
|
822
758
|
|
|
823
|
-
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
|
|
824
|
-
{
|
|
759
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
|
|
825
760
|
loadRhs(b, dest.B_0);
|
|
826
761
|
loadRhs(b + 1, dest.B1);
|
|
827
762
|
loadRhs(b + 2, dest.B2);
|
|
@@ -829,221 +764,189 @@ public:
|
|
|
829
764
|
}
|
|
830
765
|
|
|
831
766
|
// Scalar path
|
|
832
|
-
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const
|
|
833
|
-
{
|
|
834
|
-
loadRhs(b, dest);
|
|
835
|
-
}
|
|
767
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const { loadRhs(b, dest); }
|
|
836
768
|
|
|
837
769
|
// Vectorized path
|
|
838
|
-
template<typename RealPacketType>
|
|
839
|
-
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
|
|
840
|
-
{
|
|
770
|
+
template <typename RealPacketType>
|
|
771
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const {
|
|
841
772
|
loadRhs(b, dest);
|
|
842
773
|
}
|
|
843
774
|
|
|
844
775
|
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
|
|
845
|
-
|
|
846
|
-
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
|
|
847
|
-
{
|
|
848
|
-
|
|
849
|
-
}
|
|
850
|
-
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
|
|
851
|
-
{
|
|
852
|
-
loadQuadToDoublePacket(b,dest);
|
|
776
|
+
|
|
777
|
+
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const { loadRhs(b, dest); }
|
|
778
|
+
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const {
|
|
779
|
+
loadQuadToDoublePacket(b, dest);
|
|
853
780
|
}
|
|
854
781
|
|
|
855
782
|
// nothing special here
|
|
856
|
-
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
|
|
857
|
-
{
|
|
783
|
+
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const {
|
|
858
784
|
dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
|
|
859
785
|
}
|
|
860
786
|
|
|
861
|
-
template<typename LhsPacketType>
|
|
862
|
-
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
|
|
863
|
-
{
|
|
787
|
+
template <typename LhsPacketType>
|
|
788
|
+
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const {
|
|
864
789
|
dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
|
|
865
790
|
}
|
|
866
791
|
|
|
867
|
-
template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType,
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
792
|
+
template <typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType,
|
|
793
|
+
typename LaneIdType>
|
|
794
|
+
EIGEN_STRONG_INLINE std::enable_if_t<!is_same<RhsPacketType, RhsPacketx4>::value> madd(const LhsPacketType& a,
|
|
795
|
+
const RhsPacketType& b,
|
|
796
|
+
DoublePacket<ResPacketType>& c,
|
|
797
|
+
TmpType& /*tmp*/,
|
|
798
|
+
const LaneIdType&) const {
|
|
799
|
+
c.first = pmadd(a, b.first, c.first);
|
|
800
|
+
c.second = pmadd(a, b.second, c.second);
|
|
874
801
|
}
|
|
875
802
|
|
|
876
|
-
template<typename LaneIdType>
|
|
877
|
-
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/,
|
|
878
|
-
|
|
879
|
-
c = cj.pmadd(a,b,c);
|
|
803
|
+
template <typename LaneIdType>
|
|
804
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/,
|
|
805
|
+
const LaneIdType&) const {
|
|
806
|
+
c = cj.pmadd(a, b, c);
|
|
880
807
|
}
|
|
881
808
|
|
|
882
|
-
template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
883
|
-
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
|
|
884
|
-
|
|
809
|
+
template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
810
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
|
|
811
|
+
const LaneIdType& lane) const {
|
|
885
812
|
madd(a, b.get(lane), c, tmp, lane);
|
|
886
813
|
}
|
|
887
|
-
|
|
814
|
+
|
|
888
815
|
EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
|
|
889
|
-
|
|
890
|
-
template<typename RealPacketType, typename ResPacketType>
|
|
891
|
-
EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha,
|
|
892
|
-
|
|
816
|
+
|
|
817
|
+
template <typename RealPacketType, typename ResPacketType>
|
|
818
|
+
EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha,
|
|
819
|
+
ResPacketType& r) const {
|
|
893
820
|
// assemble c
|
|
894
821
|
ResPacketType tmp;
|
|
895
|
-
if((!ConjLhs)&&(!ConjRhs))
|
|
896
|
-
{
|
|
822
|
+
if ((!ConjLhs) && (!ConjRhs)) {
|
|
897
823
|
tmp = pcplxflip(pconj(ResPacketType(c.second)));
|
|
898
|
-
tmp = padd(ResPacketType(c.first),tmp);
|
|
899
|
-
}
|
|
900
|
-
else if((!ConjLhs)&&(ConjRhs))
|
|
901
|
-
{
|
|
824
|
+
tmp = padd(ResPacketType(c.first), tmp);
|
|
825
|
+
} else if ((!ConjLhs) && (ConjRhs)) {
|
|
902
826
|
tmp = pconj(pcplxflip(ResPacketType(c.second)));
|
|
903
|
-
tmp = padd(ResPacketType(c.first),tmp);
|
|
904
|
-
}
|
|
905
|
-
else if((ConjLhs)&&(!ConjRhs))
|
|
906
|
-
{
|
|
827
|
+
tmp = padd(ResPacketType(c.first), tmp);
|
|
828
|
+
} else if ((ConjLhs) && (!ConjRhs)) {
|
|
907
829
|
tmp = pcplxflip(ResPacketType(c.second));
|
|
908
|
-
tmp = padd(pconj(ResPacketType(c.first)),tmp);
|
|
909
|
-
}
|
|
910
|
-
else if((ConjLhs)&&(ConjRhs))
|
|
911
|
-
{
|
|
830
|
+
tmp = padd(pconj(ResPacketType(c.first)), tmp);
|
|
831
|
+
} else if ((ConjLhs) && (ConjRhs)) {
|
|
912
832
|
tmp = pcplxflip(ResPacketType(c.second));
|
|
913
|
-
tmp = psub(pconj(ResPacketType(c.first)),tmp);
|
|
833
|
+
tmp = psub(pconj(ResPacketType(c.first)), tmp);
|
|
914
834
|
}
|
|
915
|
-
|
|
916
|
-
r = pmadd(tmp,alpha,r);
|
|
835
|
+
|
|
836
|
+
r = pmadd(tmp, alpha, r);
|
|
917
837
|
}
|
|
918
838
|
|
|
919
|
-
protected:
|
|
920
|
-
conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
|
|
839
|
+
protected:
|
|
840
|
+
conj_helper<LhsScalar, RhsScalar, ConjLhs, ConjRhs> cj;
|
|
921
841
|
};
|
|
922
842
|
|
|
923
|
-
template<typename RealScalar, bool
|
|
924
|
-
class gebp_traits<RealScalar, std::complex<RealScalar>, false,
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
typedef
|
|
928
|
-
typedef
|
|
929
|
-
typedef Scalar
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
#undef
|
|
939
|
-
#undef PACKET_DECL_COND_PREFIX
|
|
843
|
+
template <typename RealScalar, bool ConjRhs_, int Arch, int PacketSize_>
|
|
844
|
+
class gebp_traits<RealScalar, std::complex<RealScalar>, false, ConjRhs_, Arch, PacketSize_> {
|
|
845
|
+
public:
|
|
846
|
+
typedef std::complex<RealScalar> Scalar;
|
|
847
|
+
typedef RealScalar LhsScalar;
|
|
848
|
+
typedef Scalar RhsScalar;
|
|
849
|
+
typedef Scalar ResScalar;
|
|
850
|
+
|
|
851
|
+
PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
|
|
852
|
+
PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
|
|
853
|
+
PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
|
|
854
|
+
PACKET_DECL_COND_POSTFIX(_, Real, PacketSize_);
|
|
855
|
+
PACKET_DECL_COND_SCALAR_POSTFIX(_, PacketSize_);
|
|
856
|
+
|
|
857
|
+
#undef PACKET_DECL_COND_SCALAR_POSTFIX
|
|
858
|
+
#undef PACKET_DECL_COND_POSTFIX
|
|
940
859
|
#undef PACKET_DECL_COND_SCALAR
|
|
941
860
|
#undef PACKET_DECL_COND
|
|
942
861
|
|
|
943
862
|
enum {
|
|
944
863
|
ConjLhs = false,
|
|
945
|
-
ConjRhs =
|
|
946
|
-
Vectorizable = unpacket_traits<
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
864
|
+
ConjRhs = ConjRhs_,
|
|
865
|
+
Vectorizable = unpacket_traits<RealPacket_>::vectorizable && unpacket_traits<ScalarPacket_>::vectorizable,
|
|
866
|
+
LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
|
|
867
|
+
RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
|
|
868
|
+
ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,
|
|
869
|
+
|
|
952
870
|
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
|
|
953
871
|
// FIXME: should depend on NumberOfRegisters
|
|
954
872
|
nr = 4,
|
|
955
|
-
mr = (
|
|
873
|
+
mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * ResPacketSize,
|
|
956
874
|
|
|
957
875
|
LhsProgress = ResPacketSize,
|
|
958
876
|
RhsProgress = 1
|
|
959
877
|
};
|
|
960
878
|
|
|
961
|
-
typedef
|
|
962
|
-
typedef
|
|
963
|
-
typedef
|
|
879
|
+
typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
|
|
880
|
+
typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
|
|
881
|
+
typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
|
|
964
882
|
typedef LhsPacket LhsPacket4Packing;
|
|
965
883
|
typedef QuadPacket<RhsPacket> RhsPacketx4;
|
|
966
884
|
typedef ResPacket AccPacket;
|
|
967
885
|
|
|
968
|
-
EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
|
|
969
|
-
{
|
|
970
|
-
p = pset1<ResPacket>(ResScalar(0));
|
|
971
|
-
}
|
|
886
|
+
EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }
|
|
972
887
|
|
|
973
|
-
template<typename RhsPacketType>
|
|
974
|
-
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
|
|
975
|
-
{
|
|
888
|
+
template <typename RhsPacketType>
|
|
889
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const {
|
|
976
890
|
dest = pset1<RhsPacketType>(*b);
|
|
977
891
|
}
|
|
978
892
|
|
|
979
|
-
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
|
|
980
|
-
{
|
|
893
|
+
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
|
|
981
894
|
pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
|
|
982
895
|
}
|
|
983
896
|
|
|
984
|
-
template<typename RhsPacketType>
|
|
985
|
-
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
|
|
986
|
-
{
|
|
897
|
+
template <typename RhsPacketType>
|
|
898
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const {
|
|
987
899
|
loadRhs(b, dest);
|
|
988
900
|
}
|
|
989
901
|
|
|
990
|
-
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
|
|
991
|
-
{}
|
|
902
|
+
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
|
|
992
903
|
|
|
993
|
-
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
}
|
|
997
|
-
|
|
998
|
-
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
|
|
999
|
-
{
|
|
1000
|
-
dest = ploadquad<RhsPacket>(b);
|
|
1001
|
-
}
|
|
904
|
+
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = ploaddup<LhsPacket>(a); }
|
|
905
|
+
|
|
906
|
+
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad<RhsPacket>(b); }
|
|
1002
907
|
|
|
1003
|
-
template<typename LhsPacketType>
|
|
1004
|
-
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
|
|
1005
|
-
{
|
|
908
|
+
template <typename LhsPacketType>
|
|
909
|
+
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const {
|
|
1006
910
|
dest = ploaddup<LhsPacketType>(a);
|
|
1007
911
|
}
|
|
1008
912
|
|
|
1009
913
|
template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
1010
|
-
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
|
|
1011
|
-
|
|
1012
|
-
madd_impl(a, b, c, tmp,
|
|
914
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
|
|
915
|
+
const LaneIdType&) const {
|
|
916
|
+
madd_impl(a, b, c, tmp, std::conditional_t<Vectorizable, true_type, false_type>());
|
|
1013
917
|
}
|
|
1014
918
|
|
|
1015
919
|
template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
|
|
1016
|
-
EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c,
|
|
1017
|
-
|
|
920
|
+
EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c,
|
|
921
|
+
RhsPacketType& tmp, const true_type&) const {
|
|
1018
922
|
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
|
1019
923
|
EIGEN_UNUSED_VARIABLE(tmp);
|
|
1020
|
-
c.v = pmadd(a,b.v,c.v);
|
|
924
|
+
c.v = pmadd(a, b.v, c.v);
|
|
1021
925
|
#else
|
|
1022
|
-
tmp = b;
|
|
926
|
+
tmp = b;
|
|
927
|
+
tmp.v = pmul(a, tmp.v);
|
|
928
|
+
c = padd(c, tmp);
|
|
1023
929
|
#endif
|
|
1024
|
-
|
|
1025
930
|
}
|
|
1026
931
|
|
|
1027
|
-
EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/,
|
|
1028
|
-
|
|
932
|
+
EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/,
|
|
933
|
+
const false_type&) const {
|
|
1029
934
|
c += a * b;
|
|
1030
935
|
}
|
|
1031
936
|
|
|
1032
|
-
template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
1033
|
-
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
|
|
1034
|
-
|
|
937
|
+
template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
|
|
938
|
+
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
|
|
939
|
+
const LaneIdType& lane) const {
|
|
1035
940
|
madd(a, b.get(lane), c, tmp, lane);
|
|
1036
941
|
}
|
|
1037
942
|
|
|
1038
943
|
template <typename ResPacketType, typename AccPacketType>
|
|
1039
|
-
EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
r = cj.pmadd(alpha,c,r);
|
|
944
|
+
EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const {
|
|
945
|
+
conj_helper<ResPacketType, ResPacketType, false, ConjRhs> cj;
|
|
946
|
+
r = cj.pmadd(alpha, c, r);
|
|
1043
947
|
}
|
|
1044
948
|
|
|
1045
|
-
protected:
|
|
1046
|
-
|
|
949
|
+
protected:
|
|
1047
950
|
};
|
|
1048
951
|
|
|
1049
952
|
/* optimized General packed Block * packed Panel product kernel
|
|
@@ -1053,13 +956,15 @@ protected:
|
|
|
1053
956
|
* |real |cplx | no vectorization yet, would require to pack A with duplication
|
|
1054
957
|
* |cplx |real | easy vectorization
|
|
1055
958
|
*/
|
|
1056
|
-
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
|
|
1057
|
-
|
|
1058
|
-
{
|
|
1059
|
-
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
|
|
1060
|
-
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketHalf>
|
|
1061
|
-
|
|
1062
|
-
|
|
959
|
+
template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
|
|
960
|
+
bool ConjugateLhs, bool ConjugateRhs>
|
|
961
|
+
struct gebp_kernel {
|
|
962
|
+
typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target> Traits;
|
|
963
|
+
typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target, GEBPPacketHalf>
|
|
964
|
+
HalfTraits;
|
|
965
|
+
typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target, GEBPPacketQuarter>
|
|
966
|
+
QuarterTraits;
|
|
967
|
+
|
|
1063
968
|
typedef typename Traits::ResScalar ResScalar;
|
|
1064
969
|
typedef typename Traits::LhsPacket LhsPacket;
|
|
1065
970
|
typedef typename Traits::RhsPacket RhsPacket;
|
|
@@ -1068,8 +973,9 @@ struct gebp_kernel
|
|
|
1068
973
|
typedef typename Traits::RhsPacketx4 RhsPacketx4;
|
|
1069
974
|
|
|
1070
975
|
typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
|
|
976
|
+
typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 27>::type RhsPanel27;
|
|
1071
977
|
|
|
1072
|
-
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
|
|
978
|
+
typedef gebp_traits<RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target> SwappedTraits;
|
|
1073
979
|
|
|
1074
980
|
typedef typename SwappedTraits::ResScalar SResScalar;
|
|
1075
981
|
typedef typename SwappedTraits::LhsPacket SLhsPacket;
|
|
@@ -1090,28 +996,28 @@ struct gebp_kernel
|
|
|
1090
996
|
typedef typename DataMapper::LinearMapper LinearMapper;
|
|
1091
997
|
|
|
1092
998
|
enum {
|
|
1093
|
-
Vectorizable
|
|
1094
|
-
LhsProgress
|
|
1095
|
-
LhsProgressHalf
|
|
1096
|
-
LhsProgressQuarter
|
|
1097
|
-
RhsProgress
|
|
1098
|
-
RhsProgressHalf
|
|
1099
|
-
RhsProgressQuarter
|
|
999
|
+
Vectorizable = Traits::Vectorizable,
|
|
1000
|
+
LhsProgress = Traits::LhsProgress,
|
|
1001
|
+
LhsProgressHalf = HalfTraits::LhsProgress,
|
|
1002
|
+
LhsProgressQuarter = QuarterTraits::LhsProgress,
|
|
1003
|
+
RhsProgress = Traits::RhsProgress,
|
|
1004
|
+
RhsProgressHalf = HalfTraits::RhsProgress,
|
|
1005
|
+
RhsProgressQuarter = QuarterTraits::RhsProgress,
|
|
1100
1006
|
ResPacketSize = Traits::ResPacketSize
|
|
1101
1007
|
};
|
|
1102
1008
|
|
|
1103
|
-
EIGEN_DONT_INLINE
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
|
|
1009
|
+
EIGEN_DONT_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, Index rows,
|
|
1010
|
+
Index depth, Index cols, ResScalar alpha, Index strideA = -1, Index strideB = -1,
|
|
1011
|
+
Index offsetA = 0, Index offsetB = 0);
|
|
1107
1012
|
};
|
|
1108
1013
|
|
|
1109
|
-
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
typedef gebp_traits<RhsScalar,
|
|
1014
|
+
template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
|
|
1015
|
+
bool ConjugateLhs, bool ConjugateRhs,
|
|
1016
|
+
int SwappedLhsProgress =
|
|
1017
|
+
gebp_traits<RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target>::LhsProgress>
|
|
1018
|
+
struct last_row_process_16_packets {
|
|
1019
|
+
typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target> Traits;
|
|
1020
|
+
typedef gebp_traits<RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target> SwappedTraits;
|
|
1115
1021
|
|
|
1116
1022
|
typedef typename Traits::ResScalar ResScalar;
|
|
1117
1023
|
typedef typename SwappedTraits::LhsPacket SLhsPacket;
|
|
@@ -1119,28 +1025,27 @@ struct last_row_process_16_packets
|
|
|
1119
1025
|
typedef typename SwappedTraits::ResPacket SResPacket;
|
|
1120
1026
|
typedef typename SwappedTraits::AccPacket SAccPacket;
|
|
1121
1027
|
|
|
1122
|
-
EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
}
|
|
1028
|
+
EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits& straits, const LhsScalar* blA,
|
|
1029
|
+
const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
|
|
1030
|
+
ResScalar alpha, SAccPacket& C0) {
|
|
1031
|
+
EIGEN_UNUSED_VARIABLE(res);
|
|
1032
|
+
EIGEN_UNUSED_VARIABLE(straits);
|
|
1033
|
+
EIGEN_UNUSED_VARIABLE(blA);
|
|
1034
|
+
EIGEN_UNUSED_VARIABLE(blB);
|
|
1035
|
+
EIGEN_UNUSED_VARIABLE(depth);
|
|
1036
|
+
EIGEN_UNUSED_VARIABLE(endk);
|
|
1037
|
+
EIGEN_UNUSED_VARIABLE(i);
|
|
1038
|
+
EIGEN_UNUSED_VARIABLE(j2);
|
|
1039
|
+
EIGEN_UNUSED_VARIABLE(alpha);
|
|
1040
|
+
EIGEN_UNUSED_VARIABLE(C0);
|
|
1041
|
+
}
|
|
1137
1042
|
};
|
|
1138
1043
|
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper,
|
|
1142
|
-
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
|
|
1143
|
-
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
|
|
1044
|
+
template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
|
|
1045
|
+
bool ConjugateLhs, bool ConjugateRhs>
|
|
1046
|
+
struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> {
|
|
1047
|
+
typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target> Traits;
|
|
1048
|
+
typedef gebp_traits<RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target> SwappedTraits;
|
|
1144
1049
|
|
|
1145
1050
|
typedef typename Traits::ResScalar ResScalar;
|
|
1146
1051
|
typedef typename SwappedTraits::LhsPacket SLhsPacket;
|
|
@@ -1148,10 +1053,9 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr,
|
|
|
1148
1053
|
typedef typename SwappedTraits::ResPacket SResPacket;
|
|
1149
1054
|
typedef typename SwappedTraits::AccPacket SAccPacket;
|
|
1150
1055
|
|
|
1151
|
-
EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
{
|
|
1056
|
+
EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits& straits, const LhsScalar* blA,
|
|
1057
|
+
const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
|
|
1058
|
+
ResScalar alpha, SAccPacket& C0) {
|
|
1155
1059
|
typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
|
|
1156
1060
|
typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
|
|
1157
1061
|
typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
|
|
@@ -1160,71 +1064,190 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr,
|
|
|
1160
1064
|
SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
|
|
1161
1065
|
SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);
|
|
1162
1066
|
|
|
1163
|
-
if (depth - endk > 0)
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
blB += SwappedTraits::LhsProgress/4;
|
|
1177
|
-
blA += 1;
|
|
1178
|
-
}
|
|
1179
|
-
straits.acc(c0, alphav, R);
|
|
1180
|
-
}
|
|
1181
|
-
else
|
|
1182
|
-
{
|
|
1183
|
-
straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
|
|
1067
|
+
if (depth - endk > 0) {
|
|
1068
|
+
// We have to handle the last row(s) of the rhs, which
|
|
1069
|
+
// correspond to a half-packet
|
|
1070
|
+
SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));
|
|
1071
|
+
|
|
1072
|
+
for (Index kk = endk; kk < depth; kk++) {
|
|
1073
|
+
SLhsPacketQuarter a0;
|
|
1074
|
+
SRhsPacketQuarter b0;
|
|
1075
|
+
straits.loadLhsUnaligned(blB, a0);
|
|
1076
|
+
straits.loadRhs(blA, b0);
|
|
1077
|
+
straits.madd(a0, b0, c0, b0, fix<0>);
|
|
1078
|
+
blB += SwappedTraits::LhsProgress / 4;
|
|
1079
|
+
blA += 1;
|
|
1184
1080
|
}
|
|
1081
|
+
straits.acc(c0, alphav, R);
|
|
1082
|
+
} else {
|
|
1083
|
+
straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
|
|
1084
|
+
}
|
|
1185
1085
|
res.scatterPacket(i, j2, R);
|
|
1186
1086
|
}
|
|
1187
1087
|
};
|
|
1188
1088
|
|
|
1189
|
-
template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
|
|
1190
|
-
|
|
1191
|
-
|
|
1089
|
+
template <int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
|
|
1090
|
+
typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits,
|
|
1091
|
+
typename LinearMapper, typename DataMapper>
|
|
1092
|
+
struct lhs_process_one_packet {
|
|
1192
1093
|
typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
|
|
1193
1094
|
|
|
1194
|
-
EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits,
|
|
1195
|
-
|
|
1095
|
+
EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits,
|
|
1096
|
+
LhsPacket* A0, RhsPacketx4* rhs_panel, RhsPacket* T0, AccPacket* C0,
|
|
1097
|
+
AccPacket* C1, AccPacket* C2, AccPacket* C3) {
|
|
1196
1098
|
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
|
|
1197
1099
|
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
|
|
1198
|
-
traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
|
|
1199
|
-
traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
|
|
1100
|
+
traits.loadLhs(&blA[(0 + 1 * K) * LhsProgress], *A0);
|
|
1101
|
+
traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], *rhs_panel);
|
|
1200
1102
|
traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
|
|
1201
1103
|
traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
|
|
1202
1104
|
traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
|
|
1203
1105
|
traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
|
|
1204
|
-
|
|
1205
|
-
__asm__
|
|
1206
|
-
|
|
1106
|
+
#if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
|
|
1107
|
+
__asm__("" : "+x,m"(*A0));
|
|
1108
|
+
#endif
|
|
1207
1109
|
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
|
|
1208
1110
|
}
|
|
1209
1111
|
|
|
1210
|
-
EIGEN_STRONG_INLINE void operator()(
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
{
|
|
1112
|
+
EIGEN_STRONG_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
|
|
1113
|
+
ResScalar alpha, Index peelStart, Index peelEnd, Index strideA, Index strideB,
|
|
1114
|
+
Index offsetA, Index offsetB, int prefetch_res_offset, Index peeled_kc, Index pk,
|
|
1115
|
+
Index cols, Index depth, Index packet_cols4) {
|
|
1215
1116
|
GEBPTraits traits;
|
|
1216
|
-
|
|
1117
|
+
Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
|
|
1217
1118
|
// loops on each largest micro horizontal panel of lhs
|
|
1218
1119
|
// (LhsProgress x depth)
|
|
1219
|
-
for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
|
|
1220
|
-
|
|
1120
|
+
for (Index i = peelStart; i < peelEnd; i += LhsProgress) {
|
|
1121
|
+
#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
|
|
1122
|
+
EIGEN_IF_CONSTEXPR(nr >= 8) {
|
|
1123
|
+
for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
|
|
1124
|
+
const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
|
|
1125
|
+
prefetch(&blA[0]);
|
|
1126
|
+
|
|
1127
|
+
// gets res block as register
|
|
1128
|
+
AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
|
|
1129
|
+
traits.initAcc(C0);
|
|
1130
|
+
traits.initAcc(C1);
|
|
1131
|
+
traits.initAcc(C2);
|
|
1132
|
+
traits.initAcc(C3);
|
|
1133
|
+
traits.initAcc(C4);
|
|
1134
|
+
traits.initAcc(C5);
|
|
1135
|
+
traits.initAcc(C6);
|
|
1136
|
+
traits.initAcc(C7);
|
|
1137
|
+
|
|
1138
|
+
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
|
|
1139
|
+
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
|
|
1140
|
+
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
|
|
1141
|
+
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
|
|
1142
|
+
LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
|
|
1143
|
+
LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
|
|
1144
|
+
LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
|
|
1145
|
+
LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
|
|
1146
|
+
r0.prefetch(prefetch_res_offset);
|
|
1147
|
+
r1.prefetch(prefetch_res_offset);
|
|
1148
|
+
r2.prefetch(prefetch_res_offset);
|
|
1149
|
+
r3.prefetch(prefetch_res_offset);
|
|
1150
|
+
r4.prefetch(prefetch_res_offset);
|
|
1151
|
+
r5.prefetch(prefetch_res_offset);
|
|
1152
|
+
r6.prefetch(prefetch_res_offset);
|
|
1153
|
+
r7.prefetch(prefetch_res_offset);
|
|
1154
|
+
const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
|
|
1155
|
+
prefetch(&blB[0]);
|
|
1156
|
+
|
|
1157
|
+
LhsPacket A0;
|
|
1158
|
+
for (Index k = 0; k < peeled_kc; k += pk) {
|
|
1159
|
+
RhsPacketx4 rhs_panel;
|
|
1160
|
+
RhsPacket T0;
|
|
1161
|
+
#define EIGEN_GEBGP_ONESTEP(K) \
|
|
1162
|
+
do { \
|
|
1163
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX8"); \
|
|
1164
|
+
traits.loadLhs(&blA[(0 + 1 * K) * LhsProgress], A0); \
|
|
1165
|
+
traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel); \
|
|
1166
|
+
traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
|
|
1167
|
+
traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel); \
|
|
1168
|
+
traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
|
|
1169
|
+
traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel); \
|
|
1170
|
+
traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
|
|
1171
|
+
traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel); \
|
|
1172
|
+
traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
|
|
1173
|
+
traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel); \
|
|
1174
|
+
traits.madd(A0, rhs_panel, C4, T0, fix<0>); \
|
|
1175
|
+
traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel); \
|
|
1176
|
+
traits.madd(A0, rhs_panel, C5, T0, fix<1>); \
|
|
1177
|
+
traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel); \
|
|
1178
|
+
traits.madd(A0, rhs_panel, C6, T0, fix<2>); \
|
|
1179
|
+
traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel); \
|
|
1180
|
+
traits.madd(A0, rhs_panel, C7, T0, fix<3>); \
|
|
1181
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX8"); \
|
|
1182
|
+
} while (false)
|
|
1183
|
+
|
|
1184
|
+
EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX8");
|
|
1185
|
+
|
|
1186
|
+
EIGEN_GEBGP_ONESTEP(0);
|
|
1187
|
+
EIGEN_GEBGP_ONESTEP(1);
|
|
1188
|
+
EIGEN_GEBGP_ONESTEP(2);
|
|
1189
|
+
EIGEN_GEBGP_ONESTEP(3);
|
|
1190
|
+
EIGEN_GEBGP_ONESTEP(4);
|
|
1191
|
+
EIGEN_GEBGP_ONESTEP(5);
|
|
1192
|
+
EIGEN_GEBGP_ONESTEP(6);
|
|
1193
|
+
EIGEN_GEBGP_ONESTEP(7);
|
|
1194
|
+
|
|
1195
|
+
blB += pk * 8 * RhsProgress;
|
|
1196
|
+
blA += pk * (1 * LhsProgress);
|
|
1197
|
+
|
|
1198
|
+
EIGEN_ASM_COMMENT("end gebp micro kernel 1pX8");
|
|
1199
|
+
}
|
|
1200
|
+
// process remaining peeled loop
|
|
1201
|
+
for (Index k = peeled_kc; k < depth; k++) {
|
|
1202
|
+
RhsPacketx4 rhs_panel;
|
|
1203
|
+
RhsPacket T0;
|
|
1204
|
+
EIGEN_GEBGP_ONESTEP(0);
|
|
1205
|
+
blB += 8 * RhsProgress;
|
|
1206
|
+
blA += 1 * LhsProgress;
|
|
1207
|
+
}
|
|
1208
|
+
|
|
1209
|
+
#undef EIGEN_GEBGP_ONESTEP
|
|
1210
|
+
|
|
1211
|
+
ResPacket R0, R1;
|
|
1212
|
+
ResPacket alphav = pset1<ResPacket>(alpha);
|
|
1213
|
+
|
|
1214
|
+
R0 = r0.template loadPacket<ResPacket>(0);
|
|
1215
|
+
R1 = r1.template loadPacket<ResPacket>(0);
|
|
1216
|
+
traits.acc(C0, alphav, R0);
|
|
1217
|
+
traits.acc(C1, alphav, R1);
|
|
1218
|
+
r0.storePacket(0, R0);
|
|
1219
|
+
r1.storePacket(0, R1);
|
|
1220
|
+
|
|
1221
|
+
R0 = r2.template loadPacket<ResPacket>(0);
|
|
1222
|
+
R1 = r3.template loadPacket<ResPacket>(0);
|
|
1223
|
+
traits.acc(C2, alphav, R0);
|
|
1224
|
+
traits.acc(C3, alphav, R1);
|
|
1225
|
+
r2.storePacket(0, R0);
|
|
1226
|
+
r3.storePacket(0, R1);
|
|
1227
|
+
|
|
1228
|
+
R0 = r4.template loadPacket<ResPacket>(0);
|
|
1229
|
+
R1 = r5.template loadPacket<ResPacket>(0);
|
|
1230
|
+
traits.acc(C4, alphav, R0);
|
|
1231
|
+
traits.acc(C5, alphav, R1);
|
|
1232
|
+
r4.storePacket(0, R0);
|
|
1233
|
+
r5.storePacket(0, R1);
|
|
1234
|
+
|
|
1235
|
+
R0 = r6.template loadPacket<ResPacket>(0);
|
|
1236
|
+
R1 = r7.template loadPacket<ResPacket>(0);
|
|
1237
|
+
traits.acc(C6, alphav, R0);
|
|
1238
|
+
traits.acc(C7, alphav, R1);
|
|
1239
|
+
r6.storePacket(0, R0);
|
|
1240
|
+
r7.storePacket(0, R1);
|
|
1241
|
+
}
|
|
1242
|
+
}
|
|
1243
|
+
#endif
|
|
1244
|
+
|
|
1221
1245
|
// loops on each largest micro vertical panel of rhs (depth * nr)
|
|
1222
|
-
for(Index j2=
|
|
1223
|
-
{
|
|
1246
|
+
for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
|
|
1224
1247
|
// We select a LhsProgress x nr micro block of res
|
|
1225
1248
|
// which is entirely stored into 1 x nr registers.
|
|
1226
1249
|
|
|
1227
|
-
const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
|
|
1250
|
+
const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
|
|
1228
1251
|
prefetch(&blA[0]);
|
|
1229
1252
|
|
|
1230
1253
|
// gets res block as register
|
|
@@ -1235,7 +1258,7 @@ struct lhs_process_one_packet
|
|
|
1235
1258
|
traits.initAcc(C3);
|
|
1236
1259
|
// To improve instruction pipelining, let's double the accumulation registers:
|
|
1237
1260
|
// even k will accumulate in C*, while odd k will accumulate in D*.
|
|
1238
|
-
// This trick is
|
|
1261
|
+
// This trick is crucial to get good performance with FMA, otherwise it is
|
|
1239
1262
|
// actually faster to perform separated MUL+ADD because of a naturally
|
|
1240
1263
|
// better instruction-level parallelism.
|
|
1241
1264
|
AccPacket D0, D1, D2, D3;
|
|
@@ -1255,44 +1278,42 @@ struct lhs_process_one_packet
|
|
|
1255
1278
|
r3.prefetch(prefetch_res_offset);
|
|
1256
1279
|
|
|
1257
1280
|
// performs "inner" products
|
|
1258
|
-
const RhsScalar* blB = &blockB[j2*strideB+offsetB*
|
|
1281
|
+
const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
|
|
1259
1282
|
prefetch(&blB[0]);
|
|
1260
1283
|
LhsPacket A0, A1;
|
|
1261
1284
|
|
|
1262
|
-
for(Index k=0; k<peeled_kc; k+=pk)
|
|
1263
|
-
{
|
|
1285
|
+
for (Index k = 0; k < peeled_kc; k += pk) {
|
|
1264
1286
|
EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
|
|
1265
1287
|
RhsPacketx4 rhs_panel;
|
|
1266
1288
|
RhsPacket T0;
|
|
1267
1289
|
|
|
1268
|
-
internal::prefetch(blB+(48+0));
|
|
1290
|
+
internal::prefetch(blB + (48 + 0));
|
|
1269
1291
|
peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
|
|
1270
1292
|
peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
|
|
1271
1293
|
peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
|
|
1272
1294
|
peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
|
|
1273
|
-
internal::prefetch(blB+(48+16));
|
|
1295
|
+
internal::prefetch(blB + (48 + 16));
|
|
1274
1296
|
peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
|
|
1275
1297
|
peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
|
|
1276
1298
|
peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
|
|
1277
1299
|
peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
|
|
1278
1300
|
|
|
1279
|
-
blB += pk*4*RhsProgress;
|
|
1280
|
-
blA += pk*LhsProgress;
|
|
1301
|
+
blB += pk * 4 * RhsProgress;
|
|
1302
|
+
blA += pk * LhsProgress;
|
|
1281
1303
|
|
|
1282
1304
|
EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
|
|
1283
1305
|
}
|
|
1284
|
-
C0 = padd(C0,D0);
|
|
1285
|
-
C1 = padd(C1,D1);
|
|
1286
|
-
C2 = padd(C2,D2);
|
|
1287
|
-
C3 = padd(C3,D3);
|
|
1306
|
+
C0 = padd(C0, D0);
|
|
1307
|
+
C1 = padd(C1, D1);
|
|
1308
|
+
C2 = padd(C2, D2);
|
|
1309
|
+
C3 = padd(C3, D3);
|
|
1288
1310
|
|
|
1289
1311
|
// process remaining peeled loop
|
|
1290
|
-
for(Index k=peeled_kc; k<depth; k++)
|
|
1291
|
-
{
|
|
1312
|
+
for (Index k = peeled_kc; k < depth; k++) {
|
|
1292
1313
|
RhsPacketx4 rhs_panel;
|
|
1293
1314
|
RhsPacket T0;
|
|
1294
1315
|
peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
|
|
1295
|
-
blB += 4*RhsProgress;
|
|
1316
|
+
blB += 4 * RhsProgress;
|
|
1296
1317
|
blA += LhsProgress;
|
|
1297
1318
|
}
|
|
1298
1319
|
|
|
@@ -1302,23 +1323,22 @@ struct lhs_process_one_packet
|
|
|
1302
1323
|
R0 = r0.template loadPacket<ResPacket>(0);
|
|
1303
1324
|
R1 = r1.template loadPacket<ResPacket>(0);
|
|
1304
1325
|
traits.acc(C0, alphav, R0);
|
|
1305
|
-
traits.acc(C1,
|
|
1326
|
+
traits.acc(C1, alphav, R1);
|
|
1306
1327
|
r0.storePacket(0, R0);
|
|
1307
1328
|
r1.storePacket(0, R1);
|
|
1308
1329
|
|
|
1309
1330
|
R0 = r2.template loadPacket<ResPacket>(0);
|
|
1310
1331
|
R1 = r3.template loadPacket<ResPacket>(0);
|
|
1311
|
-
traits.acc(C2,
|
|
1312
|
-
traits.acc(C3,
|
|
1332
|
+
traits.acc(C2, alphav, R0);
|
|
1333
|
+
traits.acc(C3, alphav, R1);
|
|
1313
1334
|
r2.storePacket(0, R0);
|
|
1314
1335
|
r3.storePacket(0, R1);
|
|
1315
1336
|
}
|
|
1316
1337
|
|
|
1317
1338
|
// Deal with remaining columns of the rhs
|
|
1318
|
-
for(Index j2=packet_cols4; j2<cols; j2++)
|
|
1319
|
-
{
|
|
1339
|
+
for (Index j2 = packet_cols4; j2 < cols; j2++) {
|
|
1320
1340
|
// One column at a time
|
|
1321
|
-
const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
|
|
1341
|
+
const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
|
|
1322
1342
|
prefetch(&blA[0]);
|
|
1323
1343
|
|
|
1324
1344
|
// gets res block as register
|
|
@@ -1328,24 +1348,23 @@ struct lhs_process_one_packet
|
|
|
1328
1348
|
LinearMapper r0 = res.getLinearMapper(i, j2);
|
|
1329
1349
|
|
|
1330
1350
|
// performs "inner" products
|
|
1331
|
-
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
|
|
1351
|
+
const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
|
|
1332
1352
|
LhsPacket A0;
|
|
1333
1353
|
|
|
1334
|
-
for(Index k= 0; k<peeled_kc; k+=pk)
|
|
1335
|
-
{
|
|
1354
|
+
for (Index k = 0; k < peeled_kc; k += pk) {
|
|
1336
1355
|
EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
|
|
1337
1356
|
RhsPacket B_0;
|
|
1338
1357
|
|
|
1339
|
-
#define EIGEN_GEBGP_ONESTEP(K)
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
/* FIXME: why unaligned???? */
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1358
|
+
#define EIGEN_GEBGP_ONESTEP(K) \
|
|
1359
|
+
do { \
|
|
1360
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
|
|
1361
|
+
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
|
|
1362
|
+
/* FIXME: why unaligned???? */ \
|
|
1363
|
+
traits.loadLhsUnaligned(&blA[(0 + 1 * K) * LhsProgress], A0); \
|
|
1364
|
+
traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
|
|
1365
|
+
traits.madd(A0, B_0, C0, B_0, fix<0>); \
|
|
1366
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
|
|
1367
|
+
} while (false);
|
|
1349
1368
|
|
|
1350
1369
|
EIGEN_GEBGP_ONESTEP(0);
|
|
1351
1370
|
EIGEN_GEBGP_ONESTEP(1);
|
|
@@ -1356,15 +1375,14 @@ struct lhs_process_one_packet
|
|
|
1356
1375
|
EIGEN_GEBGP_ONESTEP(6);
|
|
1357
1376
|
EIGEN_GEBGP_ONESTEP(7);
|
|
1358
1377
|
|
|
1359
|
-
blB += pk*RhsProgress;
|
|
1360
|
-
blA += pk*LhsProgress;
|
|
1378
|
+
blB += pk * RhsProgress;
|
|
1379
|
+
blA += pk * LhsProgress;
|
|
1361
1380
|
|
|
1362
1381
|
EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
|
|
1363
1382
|
}
|
|
1364
1383
|
|
|
1365
1384
|
// process remaining peeled loop
|
|
1366
|
-
for(Index k=peeled_kc; k<depth; k++)
|
|
1367
|
-
{
|
|
1385
|
+
for (Index k = peeled_kc; k < depth; k++) {
|
|
1368
1386
|
RhsPacket B_0;
|
|
1369
1387
|
EIGEN_GEBGP_ONESTEP(0);
|
|
1370
1388
|
blB += RhsProgress;
|
|
@@ -1381,84 +1399,321 @@ struct lhs_process_one_packet
|
|
|
1381
1399
|
}
|
|
1382
1400
|
};
|
|
1383
1401
|
|
|
1384
|
-
template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
|
|
1385
|
-
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
|
|
1398
|
-
|
|
1402
|
+
template <int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
|
|
1403
|
+
typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits,
|
|
1404
|
+
typename LinearMapper, typename DataMapper>
|
|
1405
|
+
struct lhs_process_fraction_of_packet
|
|
1406
|
+
: lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket,
|
|
1407
|
+
RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper> {
|
|
1408
|
+
EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits,
|
|
1409
|
+
LhsPacket* A0, RhsPacket* B_0, RhsPacket* B1, RhsPacket* B2, RhsPacket* B3,
|
|
1410
|
+
AccPacket* C0, AccPacket* C1, AccPacket* C2, AccPacket* C3) {
|
|
1411
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
|
|
1412
|
+
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
|
|
1413
|
+
traits.loadLhsUnaligned(&blA[(0 + 1 * K) * (LhsProgress)], *A0);
|
|
1414
|
+
traits.broadcastRhs(&blB[(0 + 4 * K) * RhsProgress], *B_0, *B1, *B2, *B3);
|
|
1415
|
+
traits.madd(*A0, *B_0, *C0, *B_0);
|
|
1416
|
+
traits.madd(*A0, *B1, *C1, *B1);
|
|
1417
|
+
traits.madd(*A0, *B2, *C2, *B2);
|
|
1418
|
+
traits.madd(*A0, *B3, *C3, *B3);
|
|
1419
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
|
|
1399
1420
|
}
|
|
1400
1421
|
};
|
|
1401
1422
|
|
|
1402
|
-
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
|
|
1403
|
-
|
|
1404
|
-
void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
|
|
1418
|
-
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
|
|
1427
|
-
|
|
1428
|
-
|
|
1429
|
-
|
|
1430
|
-
|
|
1431
|
-
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
|
|
1448
|
-
|
|
1423
|
+
template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
|
|
1424
|
+
bool ConjugateLhs, bool ConjugateRhs>
|
|
1425
|
+
EIGEN_DONT_INLINE void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs,
|
|
1426
|
+
ConjugateRhs>::operator()(const DataMapper& res, const LhsScalar* blockA,
|
|
1427
|
+
const RhsScalar* blockB, Index rows, Index depth,
|
|
1428
|
+
Index cols, ResScalar alpha, Index strideA, Index strideB,
|
|
1429
|
+
Index offsetA, Index offsetB) {
|
|
1430
|
+
Traits traits;
|
|
1431
|
+
SwappedTraits straits;
|
|
1432
|
+
|
|
1433
|
+
if (strideA == -1) strideA = depth;
|
|
1434
|
+
if (strideB == -1) strideB = depth;
|
|
1435
|
+
conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
|
|
1436
|
+
Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
|
|
1437
|
+
Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
|
|
1438
|
+
const Index peeled_mc3 = mr >= 3 * Traits::LhsProgress ? (rows / (3 * LhsProgress)) * (3 * LhsProgress) : 0;
|
|
1439
|
+
const Index peeled_mc2 =
|
|
1440
|
+
mr >= 2 * Traits::LhsProgress ? peeled_mc3 + ((rows - peeled_mc3) / (2 * LhsProgress)) * (2 * LhsProgress) : 0;
|
|
1441
|
+
const Index peeled_mc1 =
|
|
1442
|
+
mr >= 1 * Traits::LhsProgress ? peeled_mc2 + ((rows - peeled_mc2) / (1 * LhsProgress)) * (1 * LhsProgress) : 0;
|
|
1443
|
+
const Index peeled_mc_half =
|
|
1444
|
+
mr >= LhsProgressHalf ? peeled_mc1 + ((rows - peeled_mc1) / (LhsProgressHalf)) * (LhsProgressHalf) : 0;
|
|
1445
|
+
const Index peeled_mc_quarter =
|
|
1446
|
+
mr >= LhsProgressQuarter
|
|
1447
|
+
? peeled_mc_half + ((rows - peeled_mc_half) / (LhsProgressQuarter)) * (LhsProgressQuarter)
|
|
1448
|
+
: 0;
|
|
1449
|
+
enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
|
|
1450
|
+
const Index peeled_kc = depth & ~(pk - 1);
|
|
1451
|
+
const int prefetch_res_offset = 32 / sizeof(ResScalar);
|
|
1452
|
+
// const Index depth2 = depth & ~1;
|
|
1453
|
+
|
|
1454
|
+
//---------- Process 3 * LhsProgress rows at once ----------
|
|
1455
|
+
// This corresponds to 3*LhsProgress x nr register blocks.
|
|
1456
|
+
// Usually, make sense only with FMA
|
|
1457
|
+
if (mr >= 3 * Traits::LhsProgress) {
|
|
1458
|
+
// Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x
|
|
1459
|
+
// depth) and on each largest micro vertical panel of the rhs (depth * nr). Blocking sizes, i.e., 'depth' has been
|
|
1460
|
+
// computed so that the micro horizontal panel of the lhs fit in L1. However, if depth is too small, we can extend
|
|
1461
|
+
// the number of rows of these horizontal panels. This actual number of rows is computed as follow:
|
|
1462
|
+
const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
|
|
1463
|
+
// The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
|
|
1464
|
+
// suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only
|
|
1465
|
+
// guess), or because we are testing specific blocking sizes.
|
|
1466
|
+
const Index actual_panel_rows =
|
|
1467
|
+
(3 * LhsProgress) * std::max<Index>(1, ((l1 - sizeof(ResScalar) * mr * nr - depth * nr * sizeof(RhsScalar)) /
|
|
1468
|
+
(depth * sizeof(LhsScalar) * 3 * LhsProgress)));
|
|
1469
|
+
for (Index i1 = 0; i1 < peeled_mc3; i1 += actual_panel_rows) {
|
|
1470
|
+
const Index actual_panel_end = (std::min)(i1 + actual_panel_rows, peeled_mc3);
|
|
1471
|
+
#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
|
|
1472
|
+
EIGEN_IF_CONSTEXPR(nr >= 8) {
|
|
1473
|
+
for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
|
|
1474
|
+
for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
|
|
1475
|
+
const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * LhsProgress)];
|
|
1476
|
+
prefetch(&blA[0]);
|
|
1477
|
+
// gets res block as register
|
|
1478
|
+
AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11, C12, C13, C14, C15, C16, C17, C18, C19, C20,
|
|
1479
|
+
C21, C22, C23;
|
|
1480
|
+
traits.initAcc(C0);
|
|
1481
|
+
traits.initAcc(C1);
|
|
1482
|
+
traits.initAcc(C2);
|
|
1483
|
+
traits.initAcc(C3);
|
|
1484
|
+
traits.initAcc(C4);
|
|
1485
|
+
traits.initAcc(C5);
|
|
1486
|
+
traits.initAcc(C6);
|
|
1487
|
+
traits.initAcc(C7);
|
|
1488
|
+
traits.initAcc(C8);
|
|
1489
|
+
traits.initAcc(C9);
|
|
1490
|
+
traits.initAcc(C10);
|
|
1491
|
+
traits.initAcc(C11);
|
|
1492
|
+
traits.initAcc(C12);
|
|
1493
|
+
traits.initAcc(C13);
|
|
1494
|
+
traits.initAcc(C14);
|
|
1495
|
+
traits.initAcc(C15);
|
|
1496
|
+
traits.initAcc(C16);
|
|
1497
|
+
traits.initAcc(C17);
|
|
1498
|
+
traits.initAcc(C18);
|
|
1499
|
+
traits.initAcc(C19);
|
|
1500
|
+
traits.initAcc(C20);
|
|
1501
|
+
traits.initAcc(C21);
|
|
1502
|
+
traits.initAcc(C22);
|
|
1503
|
+
traits.initAcc(C23);
|
|
1504
|
+
|
|
1505
|
+
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
|
|
1506
|
+
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
|
|
1507
|
+
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
|
|
1508
|
+
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
|
|
1509
|
+
LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
|
|
1510
|
+
LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
|
|
1511
|
+
LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
|
|
1512
|
+
LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
|
|
1513
|
+
|
|
1514
|
+
r0.prefetch(0);
|
|
1515
|
+
r1.prefetch(0);
|
|
1516
|
+
r2.prefetch(0);
|
|
1517
|
+
r3.prefetch(0);
|
|
1518
|
+
r4.prefetch(0);
|
|
1519
|
+
r5.prefetch(0);
|
|
1520
|
+
r6.prefetch(0);
|
|
1521
|
+
r7.prefetch(0);
|
|
1522
|
+
|
|
1523
|
+
// performs "inner" products
|
|
1524
|
+
const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
|
|
1525
|
+
prefetch(&blB[0]);
|
|
1526
|
+
LhsPacket A0, A1;
|
|
1527
|
+
for (Index k = 0; k < peeled_kc; k += pk) {
|
|
1528
|
+
EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX8");
|
|
1529
|
+
// 27 registers are taken (24 for acc, 3 for lhs).
|
|
1530
|
+
RhsPanel27 rhs_panel;
|
|
1531
|
+
RhsPacket T0;
|
|
1532
|
+
LhsPacket A2;
|
|
1533
|
+
#if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
|
|
1534
|
+
// see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
|
|
1535
|
+
// without this workaround A0, A1, and A2 are loaded in the same register,
|
|
1536
|
+
// which is not good for pipelining
|
|
1537
|
+
#define EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND __asm__("" : "+w,m"(A0), "+w,m"(A1), "+w,m"(A2));
|
|
1538
|
+
#else
|
|
1539
|
+
#define EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND
|
|
1540
|
+
#endif
|
|
1541
|
+
|
|
1542
|
+
#define EIGEN_GEBP_ONESTEP(K) \
|
|
1543
|
+
do { \
|
|
1544
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX8"); \
|
|
1545
|
+
traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
|
|
1546
|
+
traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
|
|
1547
|
+
traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
|
|
1548
|
+
EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND traits.loadRhs(blB + (0 + 8 * K) * Traits::RhsProgress, rhs_panel); \
|
|
1549
|
+
traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
|
|
1550
|
+
traits.madd(A1, rhs_panel, C8, T0, fix<0>); \
|
|
1551
|
+
traits.madd(A2, rhs_panel, C16, T0, fix<0>); \
|
|
1552
|
+
traits.updateRhs(blB + (1 + 8 * K) * Traits::RhsProgress, rhs_panel); \
|
|
1553
|
+
traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
|
|
1554
|
+
traits.madd(A1, rhs_panel, C9, T0, fix<1>); \
|
|
1555
|
+
traits.madd(A2, rhs_panel, C17, T0, fix<1>); \
|
|
1556
|
+
traits.updateRhs(blB + (2 + 8 * K) * Traits::RhsProgress, rhs_panel); \
|
|
1557
|
+
traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
|
|
1558
|
+
traits.madd(A1, rhs_panel, C10, T0, fix<2>); \
|
|
1559
|
+
traits.madd(A2, rhs_panel, C18, T0, fix<2>); \
|
|
1560
|
+
traits.updateRhs(blB + (3 + 8 * K) * Traits::RhsProgress, rhs_panel); \
|
|
1561
|
+
traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
|
|
1562
|
+
traits.madd(A1, rhs_panel, C11, T0, fix<3>); \
|
|
1563
|
+
traits.madd(A2, rhs_panel, C19, T0, fix<3>); \
|
|
1564
|
+
traits.loadRhs(blB + (4 + 8 * K) * Traits::RhsProgress, rhs_panel); \
|
|
1565
|
+
traits.madd(A0, rhs_panel, C4, T0, fix<0>); \
|
|
1566
|
+
traits.madd(A1, rhs_panel, C12, T0, fix<0>); \
|
|
1567
|
+
traits.madd(A2, rhs_panel, C20, T0, fix<0>); \
|
|
1568
|
+
traits.updateRhs(blB + (5 + 8 * K) * Traits::RhsProgress, rhs_panel); \
|
|
1569
|
+
traits.madd(A0, rhs_panel, C5, T0, fix<1>); \
|
|
1570
|
+
traits.madd(A1, rhs_panel, C13, T0, fix<1>); \
|
|
1571
|
+
traits.madd(A2, rhs_panel, C21, T0, fix<1>); \
|
|
1572
|
+
traits.updateRhs(blB + (6 + 8 * K) * Traits::RhsProgress, rhs_panel); \
|
|
1573
|
+
traits.madd(A0, rhs_panel, C6, T0, fix<2>); \
|
|
1574
|
+
traits.madd(A1, rhs_panel, C14, T0, fix<2>); \
|
|
1575
|
+
traits.madd(A2, rhs_panel, C22, T0, fix<2>); \
|
|
1576
|
+
traits.updateRhs(blB + (7 + 8 * K) * Traits::RhsProgress, rhs_panel); \
|
|
1577
|
+
traits.madd(A0, rhs_panel, C7, T0, fix<3>); \
|
|
1578
|
+
traits.madd(A1, rhs_panel, C15, T0, fix<3>); \
|
|
1579
|
+
traits.madd(A2, rhs_panel, C23, T0, fix<3>); \
|
|
1580
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX8"); \
|
|
1581
|
+
} while (false)
|
|
1582
|
+
|
|
1583
|
+
EIGEN_GEBP_ONESTEP(0);
|
|
1584
|
+
EIGEN_GEBP_ONESTEP(1);
|
|
1585
|
+
EIGEN_GEBP_ONESTEP(2);
|
|
1586
|
+
EIGEN_GEBP_ONESTEP(3);
|
|
1587
|
+
EIGEN_GEBP_ONESTEP(4);
|
|
1588
|
+
EIGEN_GEBP_ONESTEP(5);
|
|
1589
|
+
EIGEN_GEBP_ONESTEP(6);
|
|
1590
|
+
EIGEN_GEBP_ONESTEP(7);
|
|
1591
|
+
|
|
1592
|
+
blB += pk * 8 * RhsProgress;
|
|
1593
|
+
blA += pk * 3 * Traits::LhsProgress;
|
|
1594
|
+
EIGEN_ASM_COMMENT("end gebp micro kernel 3pX8");
|
|
1595
|
+
}
|
|
1596
|
+
|
|
1597
|
+
// process remaining peeled loop
|
|
1598
|
+
for (Index k = peeled_kc; k < depth; k++) {
|
|
1599
|
+
RhsPanel27 rhs_panel;
|
|
1600
|
+
RhsPacket T0;
|
|
1601
|
+
LhsPacket A2;
|
|
1602
|
+
EIGEN_GEBP_ONESTEP(0);
|
|
1603
|
+
blB += 8 * RhsProgress;
|
|
1604
|
+
blA += 3 * Traits::LhsProgress;
|
|
1605
|
+
}
|
|
1606
|
+
|
|
1607
|
+
#undef EIGEN_GEBP_ONESTEP
|
|
1608
|
+
|
|
1609
|
+
ResPacket R0, R1, R2;
|
|
1610
|
+
ResPacket alphav = pset1<ResPacket>(alpha);
|
|
1611
|
+
|
|
1612
|
+
R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1613
|
+
R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1614
|
+
R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
|
|
1615
|
+
traits.acc(C0, alphav, R0);
|
|
1616
|
+
traits.acc(C8, alphav, R1);
|
|
1617
|
+
traits.acc(C16, alphav, R2);
|
|
1618
|
+
r0.storePacket(0 * Traits::ResPacketSize, R0);
|
|
1619
|
+
r0.storePacket(1 * Traits::ResPacketSize, R1);
|
|
1620
|
+
r0.storePacket(2 * Traits::ResPacketSize, R2);
|
|
1621
|
+
|
|
1622
|
+
R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1623
|
+
R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1624
|
+
R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
|
|
1625
|
+
traits.acc(C1, alphav, R0);
|
|
1626
|
+
traits.acc(C9, alphav, R1);
|
|
1627
|
+
traits.acc(C17, alphav, R2);
|
|
1628
|
+
r1.storePacket(0 * Traits::ResPacketSize, R0);
|
|
1629
|
+
r1.storePacket(1 * Traits::ResPacketSize, R1);
|
|
1630
|
+
r1.storePacket(2 * Traits::ResPacketSize, R2);
|
|
1631
|
+
|
|
1632
|
+
R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1633
|
+
R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1634
|
+
R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
|
|
1635
|
+
traits.acc(C2, alphav, R0);
|
|
1636
|
+
traits.acc(C10, alphav, R1);
|
|
1637
|
+
traits.acc(C18, alphav, R2);
|
|
1638
|
+
r2.storePacket(0 * Traits::ResPacketSize, R0);
|
|
1639
|
+
r2.storePacket(1 * Traits::ResPacketSize, R1);
|
|
1640
|
+
r2.storePacket(2 * Traits::ResPacketSize, R2);
|
|
1641
|
+
|
|
1642
|
+
R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1643
|
+
R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1644
|
+
R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
|
|
1645
|
+
traits.acc(C3, alphav, R0);
|
|
1646
|
+
traits.acc(C11, alphav, R1);
|
|
1647
|
+
traits.acc(C19, alphav, R2);
|
|
1648
|
+
r3.storePacket(0 * Traits::ResPacketSize, R0);
|
|
1649
|
+
r3.storePacket(1 * Traits::ResPacketSize, R1);
|
|
1650
|
+
r3.storePacket(2 * Traits::ResPacketSize, R2);
|
|
1651
|
+
|
|
1652
|
+
R0 = r4.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1653
|
+
R1 = r4.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1654
|
+
R2 = r4.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
|
|
1655
|
+
traits.acc(C4, alphav, R0);
|
|
1656
|
+
traits.acc(C12, alphav, R1);
|
|
1657
|
+
traits.acc(C20, alphav, R2);
|
|
1658
|
+
r4.storePacket(0 * Traits::ResPacketSize, R0);
|
|
1659
|
+
r4.storePacket(1 * Traits::ResPacketSize, R1);
|
|
1660
|
+
r4.storePacket(2 * Traits::ResPacketSize, R2);
|
|
1661
|
+
|
|
1662
|
+
R0 = r5.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1663
|
+
R1 = r5.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1664
|
+
R2 = r5.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
|
|
1665
|
+
traits.acc(C5, alphav, R0);
|
|
1666
|
+
traits.acc(C13, alphav, R1);
|
|
1667
|
+
traits.acc(C21, alphav, R2);
|
|
1668
|
+
r5.storePacket(0 * Traits::ResPacketSize, R0);
|
|
1669
|
+
r5.storePacket(1 * Traits::ResPacketSize, R1);
|
|
1670
|
+
r5.storePacket(2 * Traits::ResPacketSize, R2);
|
|
1671
|
+
|
|
1672
|
+
R0 = r6.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1673
|
+
R1 = r6.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1674
|
+
R2 = r6.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
|
|
1675
|
+
traits.acc(C6, alphav, R0);
|
|
1676
|
+
traits.acc(C14, alphav, R1);
|
|
1677
|
+
traits.acc(C22, alphav, R2);
|
|
1678
|
+
r6.storePacket(0 * Traits::ResPacketSize, R0);
|
|
1679
|
+
r6.storePacket(1 * Traits::ResPacketSize, R1);
|
|
1680
|
+
r6.storePacket(2 * Traits::ResPacketSize, R2);
|
|
1681
|
+
|
|
1682
|
+
R0 = r7.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1683
|
+
R1 = r7.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1684
|
+
R2 = r7.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
|
|
1685
|
+
traits.acc(C7, alphav, R0);
|
|
1686
|
+
traits.acc(C15, alphav, R1);
|
|
1687
|
+
traits.acc(C23, alphav, R2);
|
|
1688
|
+
r7.storePacket(0 * Traits::ResPacketSize, R0);
|
|
1689
|
+
r7.storePacket(1 * Traits::ResPacketSize, R1);
|
|
1690
|
+
r7.storePacket(2 * Traits::ResPacketSize, R2);
|
|
1691
|
+
}
|
|
1692
|
+
}
|
|
1693
|
+
}
|
|
1694
|
+
#endif
|
|
1695
|
+
for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
|
|
1696
|
+
for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
|
|
1449
1697
|
// We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
|
|
1450
1698
|
// stored into 3 x nr registers.
|
|
1451
|
-
|
|
1452
|
-
const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
|
|
1699
|
+
|
|
1700
|
+
const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * LhsProgress)];
|
|
1453
1701
|
prefetch(&blA[0]);
|
|
1454
1702
|
|
|
1455
1703
|
// gets res block as register
|
|
1456
|
-
AccPacket C0, C1, C2,
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
traits.initAcc(
|
|
1460
|
-
traits.initAcc(
|
|
1461
|
-
traits.initAcc(
|
|
1704
|
+
AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11;
|
|
1705
|
+
traits.initAcc(C0);
|
|
1706
|
+
traits.initAcc(C1);
|
|
1707
|
+
traits.initAcc(C2);
|
|
1708
|
+
traits.initAcc(C3);
|
|
1709
|
+
traits.initAcc(C4);
|
|
1710
|
+
traits.initAcc(C5);
|
|
1711
|
+
traits.initAcc(C6);
|
|
1712
|
+
traits.initAcc(C7);
|
|
1713
|
+
traits.initAcc(C8);
|
|
1714
|
+
traits.initAcc(C9);
|
|
1715
|
+
traits.initAcc(C10);
|
|
1716
|
+
traits.initAcc(C11);
|
|
1462
1717
|
|
|
1463
1718
|
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
|
|
1464
1719
|
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
|
|
@@ -1471,55 +1726,54 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1471
1726
|
r3.prefetch(0);
|
|
1472
1727
|
|
|
1473
1728
|
// performs "inner" products
|
|
1474
|
-
const RhsScalar* blB = &blockB[j2*strideB+offsetB*
|
|
1729
|
+
const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
|
|
1475
1730
|
prefetch(&blB[0]);
|
|
1476
1731
|
LhsPacket A0, A1;
|
|
1477
1732
|
|
|
1478
|
-
for(Index k=0; k<peeled_kc; k+=pk)
|
|
1479
|
-
{
|
|
1733
|
+
for (Index k = 0; k < peeled_kc; k += pk) {
|
|
1480
1734
|
EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
|
|
1481
|
-
// 15 registers are taken (12 for acc,
|
|
1735
|
+
// 15 registers are taken (12 for acc, 3 for lhs).
|
|
1482
1736
|
RhsPanel15 rhs_panel;
|
|
1483
1737
|
RhsPacket T0;
|
|
1484
1738
|
LhsPacket A2;
|
|
1485
|
-
|
|
1486
|
-
|
|
1487
|
-
|
|
1488
|
-
|
|
1489
|
-
|
|
1490
|
-
|
|
1491
|
-
|
|
1492
|
-
|
|
1493
|
-
#define EIGEN_GEBP_ONESTEP(K)
|
|
1494
|
-
|
|
1495
|
-
|
|
1496
|
-
|
|
1497
|
-
|
|
1498
|
-
|
|
1499
|
-
|
|
1500
|
-
|
|
1501
|
-
|
|
1502
|
-
|
|
1503
|
-
|
|
1504
|
-
|
|
1505
|
-
|
|
1506
|
-
|
|
1507
|
-
|
|
1508
|
-
|
|
1509
|
-
|
|
1510
|
-
|
|
1511
|
-
|
|
1512
|
-
|
|
1513
|
-
|
|
1514
|
-
|
|
1515
|
-
|
|
1516
|
-
|
|
1517
|
-
|
|
1518
|
-
|
|
1519
|
-
|
|
1520
|
-
|
|
1521
|
-
|
|
1522
|
-
|
|
1739
|
+
#if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
|
|
1740
|
+
// see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
|
|
1741
|
+
// without this workaround A0, A1, and A2 are loaded in the same register,
|
|
1742
|
+
// which is not good for pipelining
|
|
1743
|
+
#define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__("" : "+w,m"(A0), "+w,m"(A1), "+w,m"(A2));
|
|
1744
|
+
#else
|
|
1745
|
+
#define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
|
|
1746
|
+
#endif
|
|
1747
|
+
#define EIGEN_GEBP_ONESTEP(K) \
|
|
1748
|
+
do { \
|
|
1749
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
|
|
1750
|
+
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
|
|
1751
|
+
internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
|
|
1752
|
+
if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
|
|
1753
|
+
internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
|
|
1754
|
+
} /* Bug 953 */ \
|
|
1755
|
+
traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
|
|
1756
|
+
traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
|
|
1757
|
+
traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
|
|
1758
|
+
EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
|
|
1759
|
+
traits.loadRhs(blB + (0 + 4 * K) * Traits::RhsProgress, rhs_panel); \
|
|
1760
|
+
traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
|
|
1761
|
+
traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
|
|
1762
|
+
traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
|
|
1763
|
+
traits.updateRhs(blB + (1 + 4 * K) * Traits::RhsProgress, rhs_panel); \
|
|
1764
|
+
traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
|
|
1765
|
+
traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
|
|
1766
|
+
traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
|
|
1767
|
+
traits.updateRhs(blB + (2 + 4 * K) * Traits::RhsProgress, rhs_panel); \
|
|
1768
|
+
traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
|
|
1769
|
+
traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
|
|
1770
|
+
traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
|
|
1771
|
+
traits.updateRhs(blB + (3 + 4 * K) * Traits::RhsProgress, rhs_panel); \
|
|
1772
|
+
traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
|
|
1773
|
+
traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
|
|
1774
|
+
traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
|
|
1775
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
|
|
1776
|
+
} while (false)
|
|
1523
1777
|
|
|
1524
1778
|
internal::prefetch(blB);
|
|
1525
1779
|
EIGEN_GEBP_ONESTEP(0);
|
|
@@ -1531,20 +1785,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1531
1785
|
EIGEN_GEBP_ONESTEP(6);
|
|
1532
1786
|
EIGEN_GEBP_ONESTEP(7);
|
|
1533
1787
|
|
|
1534
|
-
blB += pk*4*RhsProgress;
|
|
1535
|
-
blA += pk*3*Traits::LhsProgress;
|
|
1788
|
+
blB += pk * 4 * RhsProgress;
|
|
1789
|
+
blA += pk * 3 * Traits::LhsProgress;
|
|
1536
1790
|
|
|
1537
1791
|
EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
|
|
1538
1792
|
}
|
|
1539
1793
|
// process remaining peeled loop
|
|
1540
|
-
for(Index k=peeled_kc; k<depth; k++)
|
|
1541
|
-
{
|
|
1794
|
+
for (Index k = peeled_kc; k < depth; k++) {
|
|
1542
1795
|
RhsPanel15 rhs_panel;
|
|
1543
1796
|
RhsPacket T0;
|
|
1544
1797
|
LhsPacket A2;
|
|
1545
1798
|
EIGEN_GEBP_ONESTEP(0);
|
|
1546
|
-
blB += 4*RhsProgress;
|
|
1547
|
-
blA += 3*Traits::LhsProgress;
|
|
1799
|
+
blB += 4 * RhsProgress;
|
|
1800
|
+
blA += 3 * Traits::LhsProgress;
|
|
1548
1801
|
}
|
|
1549
1802
|
|
|
1550
1803
|
#undef EIGEN_GEBP_ONESTEP
|
|
@@ -1590,17 +1843,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1590
1843
|
traits.acc(C11, alphav, R2);
|
|
1591
1844
|
r3.storePacket(0 * Traits::ResPacketSize, R0);
|
|
1592
1845
|
r3.storePacket(1 * Traits::ResPacketSize, R1);
|
|
1593
|
-
r3.storePacket(2 * Traits::ResPacketSize, R2);
|
|
1594
|
-
}
|
|
1846
|
+
r3.storePacket(2 * Traits::ResPacketSize, R2);
|
|
1595
1847
|
}
|
|
1848
|
+
}
|
|
1596
1849
|
|
|
1597
|
-
|
|
1598
|
-
|
|
1599
|
-
{
|
|
1600
|
-
for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
|
|
1601
|
-
{
|
|
1850
|
+
// Deal with remaining columns of the rhs
|
|
1851
|
+
for (Index j2 = packet_cols4; j2 < cols; j2++) {
|
|
1852
|
+
for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
|
|
1602
1853
|
// One column at a time
|
|
1603
|
-
const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
|
|
1854
|
+
const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * Traits::LhsProgress)];
|
|
1604
1855
|
prefetch(&blA[0]);
|
|
1605
1856
|
|
|
1606
1857
|
// gets res block as register
|
|
@@ -1613,26 +1864,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1613
1864
|
r0.prefetch(0);
|
|
1614
1865
|
|
|
1615
1866
|
// performs "inner" products
|
|
1616
|
-
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
|
|
1867
|
+
const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
|
|
1617
1868
|
LhsPacket A0, A1, A2;
|
|
1618
|
-
|
|
1619
|
-
for(Index k=0; k<peeled_kc; k+=pk)
|
|
1620
|
-
{
|
|
1869
|
+
|
|
1870
|
+
for (Index k = 0; k < peeled_kc; k += pk) {
|
|
1621
1871
|
EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
|
|
1622
1872
|
RhsPacket B_0;
|
|
1623
|
-
#define EIGEN_GEBGP_ONESTEP(K)
|
|
1624
|
-
|
|
1625
|
-
|
|
1626
|
-
|
|
1627
|
-
|
|
1628
|
-
|
|
1629
|
-
|
|
1630
|
-
|
|
1631
|
-
|
|
1632
|
-
|
|
1633
|
-
|
|
1634
|
-
|
|
1635
|
-
|
|
1873
|
+
#define EIGEN_GEBGP_ONESTEP(K) \
|
|
1874
|
+
do { \
|
|
1875
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
|
|
1876
|
+
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
|
|
1877
|
+
traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
|
|
1878
|
+
traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
|
|
1879
|
+
traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
|
|
1880
|
+
traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
|
|
1881
|
+
traits.madd(A0, B_0, C0, B_0, fix<0>); \
|
|
1882
|
+
traits.madd(A1, B_0, C4, B_0, fix<0>); \
|
|
1883
|
+
traits.madd(A2, B_0, C8, B_0, fix<0>); \
|
|
1884
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
|
|
1885
|
+
} while (false)
|
|
1636
1886
|
|
|
1637
1887
|
EIGEN_GEBGP_ONESTEP(0);
|
|
1638
1888
|
EIGEN_GEBGP_ONESTEP(1);
|
|
@@ -1650,12 +1900,11 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1650
1900
|
}
|
|
1651
1901
|
|
|
1652
1902
|
// process remaining peeled loop
|
|
1653
|
-
for(Index k=peeled_kc; k<depth; k++)
|
|
1654
|
-
{
|
|
1903
|
+
for (Index k = peeled_kc; k < depth; k++) {
|
|
1655
1904
|
RhsPacket B_0;
|
|
1656
1905
|
EIGEN_GEBGP_ONESTEP(0);
|
|
1657
1906
|
blB += RhsProgress;
|
|
1658
|
-
blA += 3*Traits::LhsProgress;
|
|
1907
|
+
blA += 3 * Traits::LhsProgress;
|
|
1659
1908
|
}
|
|
1660
1909
|
#undef EIGEN_GEBGP_ONESTEP
|
|
1661
1910
|
ResPacket R0, R1, R2;
|
|
@@ -1669,40 +1918,214 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1669
1918
|
traits.acc(C8, alphav, R2);
|
|
1670
1919
|
r0.storePacket(0 * Traits::ResPacketSize, R0);
|
|
1671
1920
|
r0.storePacket(1 * Traits::ResPacketSize, R1);
|
|
1672
|
-
r0.storePacket(2 * Traits::ResPacketSize, R2);
|
|
1673
|
-
}
|
|
1921
|
+
r0.storePacket(2 * Traits::ResPacketSize, R2);
|
|
1674
1922
|
}
|
|
1675
1923
|
}
|
|
1676
1924
|
}
|
|
1925
|
+
}
|
|
1677
1926
|
|
|
1678
|
-
|
|
1679
|
-
|
|
1680
|
-
|
|
1681
|
-
|
|
1682
|
-
|
|
1683
|
-
|
|
1684
|
-
|
|
1685
|
-
|
|
1927
|
+
//---------- Process 2 * LhsProgress rows at once ----------
|
|
1928
|
+
if (mr >= 2 * Traits::LhsProgress) {
|
|
1929
|
+
const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
|
|
1930
|
+
// The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
|
|
1931
|
+
// suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only
|
|
1932
|
+
// guess), or because we are testing specific blocking sizes.
|
|
1933
|
+
Index actual_panel_rows =
|
|
1934
|
+
(2 * LhsProgress) * std::max<Index>(1, ((l1 - sizeof(ResScalar) * mr * nr - depth * nr * sizeof(RhsScalar)) /
|
|
1935
|
+
(depth * sizeof(LhsScalar) * 2 * LhsProgress)));
|
|
1936
|
+
|
|
1937
|
+
for (Index i1 = peeled_mc3; i1 < peeled_mc2; i1 += actual_panel_rows) {
|
|
1938
|
+
Index actual_panel_end = (std::min)(i1 + actual_panel_rows, peeled_mc2);
|
|
1939
|
+
#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
|
|
1940
|
+
EIGEN_IF_CONSTEXPR(nr >= 8) {
|
|
1941
|
+
for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
|
|
1942
|
+
for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
|
|
1943
|
+
const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];
|
|
1944
|
+
prefetch(&blA[0]);
|
|
1945
|
+
|
|
1946
|
+
AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11, C12, C13, C14, C15;
|
|
1947
|
+
traits.initAcc(C0);
|
|
1948
|
+
traits.initAcc(C1);
|
|
1949
|
+
traits.initAcc(C2);
|
|
1950
|
+
traits.initAcc(C3);
|
|
1951
|
+
traits.initAcc(C4);
|
|
1952
|
+
traits.initAcc(C5);
|
|
1953
|
+
traits.initAcc(C6);
|
|
1954
|
+
traits.initAcc(C7);
|
|
1955
|
+
traits.initAcc(C8);
|
|
1956
|
+
traits.initAcc(C9);
|
|
1957
|
+
traits.initAcc(C10);
|
|
1958
|
+
traits.initAcc(C11);
|
|
1959
|
+
traits.initAcc(C12);
|
|
1960
|
+
traits.initAcc(C13);
|
|
1961
|
+
traits.initAcc(C14);
|
|
1962
|
+
traits.initAcc(C15);
|
|
1963
|
+
|
|
1964
|
+
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
|
|
1965
|
+
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
|
|
1966
|
+
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
|
|
1967
|
+
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
|
|
1968
|
+
LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
|
|
1969
|
+
LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
|
|
1970
|
+
LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
|
|
1971
|
+
LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
|
|
1972
|
+
r0.prefetch(prefetch_res_offset);
|
|
1973
|
+
r1.prefetch(prefetch_res_offset);
|
|
1974
|
+
r2.prefetch(prefetch_res_offset);
|
|
1975
|
+
r3.prefetch(prefetch_res_offset);
|
|
1976
|
+
r4.prefetch(prefetch_res_offset);
|
|
1977
|
+
r5.prefetch(prefetch_res_offset);
|
|
1978
|
+
r6.prefetch(prefetch_res_offset);
|
|
1979
|
+
r7.prefetch(prefetch_res_offset);
|
|
1980
|
+
|
|
1981
|
+
const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
|
|
1982
|
+
prefetch(&blB[0]);
|
|
1983
|
+
LhsPacket A0, A1;
|
|
1984
|
+
for (Index k = 0; k < peeled_kc; k += pk) {
|
|
1985
|
+
RhsPacketx4 rhs_panel;
|
|
1986
|
+
RhsPacket T0;
|
|
1987
|
+
// NOTE: the begin/end asm comments below work around bug 935!
|
|
1988
|
+
// but they are not enough for gcc>=6 without FMA (bug 1637)
|
|
1989
|
+
#if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE)
|
|
1990
|
+
#define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND __asm__("" : [a0] "+x,m"(A0), [a1] "+x,m"(A1));
|
|
1991
|
+
#else
|
|
1992
|
+
#define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND
|
|
1993
|
+
#endif
|
|
1994
|
+
#define EIGEN_GEBGP_ONESTEP(K) \
|
|
1995
|
+
do { \
|
|
1996
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX8"); \
|
|
1997
|
+
traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
|
|
1998
|
+
traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
|
|
1999
|
+
traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel); \
|
|
2000
|
+
traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
|
|
2001
|
+
traits.madd(A1, rhs_panel, C8, T0, fix<0>); \
|
|
2002
|
+
traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel); \
|
|
2003
|
+
traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
|
|
2004
|
+
traits.madd(A1, rhs_panel, C9, T0, fix<1>); \
|
|
2005
|
+
traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel); \
|
|
2006
|
+
traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
|
|
2007
|
+
traits.madd(A1, rhs_panel, C10, T0, fix<2>); \
|
|
2008
|
+
traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel); \
|
|
2009
|
+
traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
|
|
2010
|
+
traits.madd(A1, rhs_panel, C11, T0, fix<3>); \
|
|
2011
|
+
traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel); \
|
|
2012
|
+
traits.madd(A0, rhs_panel, C4, T0, fix<0>); \
|
|
2013
|
+
traits.madd(A1, rhs_panel, C12, T0, fix<0>); \
|
|
2014
|
+
traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel); \
|
|
2015
|
+
traits.madd(A0, rhs_panel, C5, T0, fix<1>); \
|
|
2016
|
+
traits.madd(A1, rhs_panel, C13, T0, fix<1>); \
|
|
2017
|
+
traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel); \
|
|
2018
|
+
traits.madd(A0, rhs_panel, C6, T0, fix<2>); \
|
|
2019
|
+
traits.madd(A1, rhs_panel, C14, T0, fix<2>); \
|
|
2020
|
+
traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel); \
|
|
2021
|
+
traits.madd(A0, rhs_panel, C7, T0, fix<3>); \
|
|
2022
|
+
traits.madd(A1, rhs_panel, C15, T0, fix<3>); \
|
|
2023
|
+
EIGEN_GEBP_2Px8_SPILLING_WORKAROUND EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX8"); \
|
|
2024
|
+
} while (false)
|
|
2025
|
+
|
|
2026
|
+
EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX8");
|
|
2027
|
+
|
|
2028
|
+
EIGEN_GEBGP_ONESTEP(0);
|
|
2029
|
+
EIGEN_GEBGP_ONESTEP(1);
|
|
2030
|
+
EIGEN_GEBGP_ONESTEP(2);
|
|
2031
|
+
EIGEN_GEBGP_ONESTEP(3);
|
|
2032
|
+
EIGEN_GEBGP_ONESTEP(4);
|
|
2033
|
+
EIGEN_GEBGP_ONESTEP(5);
|
|
2034
|
+
EIGEN_GEBGP_ONESTEP(6);
|
|
2035
|
+
EIGEN_GEBGP_ONESTEP(7);
|
|
2036
|
+
|
|
2037
|
+
blB += pk * 8 * RhsProgress;
|
|
2038
|
+
blA += pk * (2 * Traits::LhsProgress);
|
|
2039
|
+
|
|
2040
|
+
EIGEN_ASM_COMMENT("end gebp micro kernel 2pX8");
|
|
2041
|
+
}
|
|
2042
|
+
// process remaining peeled loop
|
|
2043
|
+
for (Index k = peeled_kc; k < depth; k++) {
|
|
2044
|
+
RhsPacketx4 rhs_panel;
|
|
2045
|
+
RhsPacket T0;
|
|
2046
|
+
EIGEN_GEBGP_ONESTEP(0);
|
|
2047
|
+
blB += 8 * RhsProgress;
|
|
2048
|
+
blA += 2 * Traits::LhsProgress;
|
|
2049
|
+
}
|
|
1686
2050
|
|
|
1687
|
-
|
|
1688
|
-
|
|
1689
|
-
|
|
1690
|
-
|
|
1691
|
-
|
|
1692
|
-
|
|
1693
|
-
|
|
1694
|
-
|
|
2051
|
+
#undef EIGEN_GEBGP_ONESTEP
|
|
2052
|
+
|
|
2053
|
+
ResPacket R0, R1, R2, R3;
|
|
2054
|
+
ResPacket alphav = pset1<ResPacket>(alpha);
|
|
2055
|
+
|
|
2056
|
+
R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
2057
|
+
R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
2058
|
+
R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
2059
|
+
R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
2060
|
+
traits.acc(C0, alphav, R0);
|
|
2061
|
+
traits.acc(C8, alphav, R1);
|
|
2062
|
+
traits.acc(C1, alphav, R2);
|
|
2063
|
+
traits.acc(C9, alphav, R3);
|
|
2064
|
+
r0.storePacket(0 * Traits::ResPacketSize, R0);
|
|
2065
|
+
r0.storePacket(1 * Traits::ResPacketSize, R1);
|
|
2066
|
+
r1.storePacket(0 * Traits::ResPacketSize, R2);
|
|
2067
|
+
r1.storePacket(1 * Traits::ResPacketSize, R3);
|
|
2068
|
+
|
|
2069
|
+
R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
2070
|
+
R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
2071
|
+
R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
2072
|
+
R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
2073
|
+
traits.acc(C2, alphav, R0);
|
|
2074
|
+
traits.acc(C10, alphav, R1);
|
|
2075
|
+
traits.acc(C3, alphav, R2);
|
|
2076
|
+
traits.acc(C11, alphav, R3);
|
|
2077
|
+
r2.storePacket(0 * Traits::ResPacketSize, R0);
|
|
2078
|
+
r2.storePacket(1 * Traits::ResPacketSize, R1);
|
|
2079
|
+
r3.storePacket(0 * Traits::ResPacketSize, R2);
|
|
2080
|
+
r3.storePacket(1 * Traits::ResPacketSize, R3);
|
|
2081
|
+
|
|
2082
|
+
R0 = r4.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
2083
|
+
R1 = r4.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
2084
|
+
R2 = r5.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
2085
|
+
R3 = r5.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
2086
|
+
traits.acc(C4, alphav, R0);
|
|
2087
|
+
traits.acc(C12, alphav, R1);
|
|
2088
|
+
traits.acc(C5, alphav, R2);
|
|
2089
|
+
traits.acc(C13, alphav, R3);
|
|
2090
|
+
r4.storePacket(0 * Traits::ResPacketSize, R0);
|
|
2091
|
+
r4.storePacket(1 * Traits::ResPacketSize, R1);
|
|
2092
|
+
r5.storePacket(0 * Traits::ResPacketSize, R2);
|
|
2093
|
+
r5.storePacket(1 * Traits::ResPacketSize, R3);
|
|
2094
|
+
|
|
2095
|
+
R0 = r6.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
2096
|
+
R1 = r6.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
2097
|
+
R2 = r7.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
2098
|
+
R3 = r7.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
2099
|
+
traits.acc(C6, alphav, R0);
|
|
2100
|
+
traits.acc(C14, alphav, R1);
|
|
2101
|
+
traits.acc(C7, alphav, R2);
|
|
2102
|
+
traits.acc(C15, alphav, R3);
|
|
2103
|
+
r6.storePacket(0 * Traits::ResPacketSize, R0);
|
|
2104
|
+
r6.storePacket(1 * Traits::ResPacketSize, R1);
|
|
2105
|
+
r7.storePacket(0 * Traits::ResPacketSize, R2);
|
|
2106
|
+
r7.storePacket(1 * Traits::ResPacketSize, R3);
|
|
2107
|
+
}
|
|
2108
|
+
}
|
|
2109
|
+
}
|
|
2110
|
+
#endif
|
|
2111
|
+
for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
|
|
2112
|
+
for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
|
|
1695
2113
|
// We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
|
|
1696
2114
|
// stored into 2 x nr registers.
|
|
1697
|
-
|
|
1698
|
-
const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
|
|
2115
|
+
|
|
2116
|
+
const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];
|
|
1699
2117
|
prefetch(&blA[0]);
|
|
1700
2118
|
|
|
1701
2119
|
// gets res block as register
|
|
1702
|
-
AccPacket C0, C1, C2, C3,
|
|
1703
|
-
|
|
1704
|
-
traits.initAcc(
|
|
1705
|
-
traits.initAcc(
|
|
2120
|
+
AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
|
|
2121
|
+
traits.initAcc(C0);
|
|
2122
|
+
traits.initAcc(C1);
|
|
2123
|
+
traits.initAcc(C2);
|
|
2124
|
+
traits.initAcc(C3);
|
|
2125
|
+
traits.initAcc(C4);
|
|
2126
|
+
traits.initAcc(C5);
|
|
2127
|
+
traits.initAcc(C6);
|
|
2128
|
+
traits.initAcc(C7);
|
|
1706
2129
|
|
|
1707
2130
|
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
|
|
1708
2131
|
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
|
|
@@ -1715,65 +2138,63 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1715
2138
|
r3.prefetch(prefetch_res_offset);
|
|
1716
2139
|
|
|
1717
2140
|
// performs "inner" products
|
|
1718
|
-
const RhsScalar* blB = &blockB[j2*strideB+offsetB*
|
|
2141
|
+
const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
|
|
1719
2142
|
prefetch(&blB[0]);
|
|
1720
2143
|
LhsPacket A0, A1;
|
|
1721
2144
|
|
|
1722
|
-
for(Index k=0; k<peeled_kc; k+=pk)
|
|
1723
|
-
{
|
|
2145
|
+
for (Index k = 0; k < peeled_kc; k += pk) {
|
|
1724
2146
|
EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
|
|
1725
2147
|
RhsPacketx4 rhs_panel;
|
|
1726
2148
|
RhsPacket T0;
|
|
1727
2149
|
|
|
1728
|
-
|
|
1729
|
-
|
|
1730
|
-
|
|
1731
|
-
|
|
1732
|
-
|
|
1733
|
-
|
|
1734
|
-
|
|
1735
|
-
#define EIGEN_GEBGP_ONESTEP(K)
|
|
1736
|
-
|
|
1737
|
-
|
|
1738
|
-
|
|
1739
|
-
|
|
1740
|
-
|
|
1741
|
-
|
|
1742
|
-
|
|
1743
|
-
|
|
1744
|
-
|
|
1745
|
-
|
|
1746
|
-
|
|
1747
|
-
|
|
1748
|
-
|
|
1749
|
-
|
|
1750
|
-
|
|
1751
|
-
|
|
1752
|
-
|
|
1753
|
-
internal::prefetch(blB+(48+0));
|
|
2150
|
+
// NOTE: the begin/end asm comments below work around bug 935!
|
|
2151
|
+
// but they are not enough for gcc>=6 without FMA (bug 1637)
|
|
2152
|
+
#if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
|
|
2153
|
+
#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__("" : [a0] "+x,m"(A0), [a1] "+x,m"(A1));
|
|
2154
|
+
#else
|
|
2155
|
+
#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
|
|
2156
|
+
#endif
|
|
2157
|
+
#define EIGEN_GEBGP_ONESTEP(K) \
|
|
2158
|
+
do { \
|
|
2159
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
|
|
2160
|
+
traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
|
|
2161
|
+
traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
|
|
2162
|
+
traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
|
|
2163
|
+
traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
|
|
2164
|
+
traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
|
|
2165
|
+
traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
|
|
2166
|
+
traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
|
|
2167
|
+
traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
|
|
2168
|
+
traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
|
|
2169
|
+
traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
|
|
2170
|
+
traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
|
|
2171
|
+
EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
|
|
2172
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
|
|
2173
|
+
} while (false)
|
|
2174
|
+
|
|
2175
|
+
internal::prefetch(blB + (48 + 0));
|
|
1754
2176
|
EIGEN_GEBGP_ONESTEP(0);
|
|
1755
2177
|
EIGEN_GEBGP_ONESTEP(1);
|
|
1756
2178
|
EIGEN_GEBGP_ONESTEP(2);
|
|
1757
2179
|
EIGEN_GEBGP_ONESTEP(3);
|
|
1758
|
-
internal::prefetch(blB+(48+16));
|
|
2180
|
+
internal::prefetch(blB + (48 + 16));
|
|
1759
2181
|
EIGEN_GEBGP_ONESTEP(4);
|
|
1760
2182
|
EIGEN_GEBGP_ONESTEP(5);
|
|
1761
2183
|
EIGEN_GEBGP_ONESTEP(6);
|
|
1762
2184
|
EIGEN_GEBGP_ONESTEP(7);
|
|
1763
2185
|
|
|
1764
|
-
blB += pk*4*RhsProgress;
|
|
1765
|
-
blA += pk*(2*Traits::LhsProgress);
|
|
2186
|
+
blB += pk * 4 * RhsProgress;
|
|
2187
|
+
blA += pk * (2 * Traits::LhsProgress);
|
|
1766
2188
|
|
|
1767
2189
|
EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
|
|
1768
2190
|
}
|
|
1769
2191
|
// process the remaining iterations left over by the peeled loop
|
|
1770
|
-
for(Index k=peeled_kc; k<depth; k++)
|
|
1771
|
-
{
|
|
2192
|
+
for (Index k = peeled_kc; k < depth; k++) {
|
|
1772
2193
|
RhsPacketx4 rhs_panel;
|
|
1773
2194
|
RhsPacket T0;
|
|
1774
2195
|
EIGEN_GEBGP_ONESTEP(0);
|
|
1775
|
-
blB += 4*RhsProgress;
|
|
1776
|
-
blA += 2*Traits::LhsProgress;
|
|
2196
|
+
blB += 4 * RhsProgress;
|
|
2197
|
+
blA += 2 * Traits::LhsProgress;
|
|
1777
2198
|
}
|
|
1778
2199
|
#undef EIGEN_GEBGP_ONESTEP
|
|
1779
2200
|
|
|
@@ -1797,24 +2218,22 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1797
2218
|
R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1798
2219
|
R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
|
|
1799
2220
|
R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
|
|
1800
|
-
traits.acc(C2,
|
|
1801
|
-
traits.acc(C6,
|
|
1802
|
-
traits.acc(C3,
|
|
1803
|
-
traits.acc(C7,
|
|
2221
|
+
traits.acc(C2, alphav, R0);
|
|
2222
|
+
traits.acc(C6, alphav, R1);
|
|
2223
|
+
traits.acc(C3, alphav, R2);
|
|
2224
|
+
traits.acc(C7, alphav, R3);
|
|
1804
2225
|
r2.storePacket(0 * Traits::ResPacketSize, R0);
|
|
1805
2226
|
r2.storePacket(1 * Traits::ResPacketSize, R1);
|
|
1806
2227
|
r3.storePacket(0 * Traits::ResPacketSize, R2);
|
|
1807
2228
|
r3.storePacket(1 * Traits::ResPacketSize, R3);
|
|
1808
|
-
}
|
|
1809
2229
|
}
|
|
1810
|
-
|
|
1811
|
-
|
|
1812
|
-
|
|
1813
|
-
|
|
1814
|
-
|
|
1815
|
-
{
|
|
2230
|
+
}
|
|
2231
|
+
|
|
2232
|
+
// Deal with remaining columns of the rhs
|
|
2233
|
+
for (Index j2 = packet_cols4; j2 < cols; j2++) {
|
|
2234
|
+
for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
|
|
1816
2235
|
// One column at a time
|
|
1817
|
-
const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
|
|
2236
|
+
const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];
|
|
1818
2237
|
prefetch(&blA[0]);
|
|
1819
2238
|
|
|
1820
2239
|
// gets res block as register
|
|
@@ -1826,26 +2245,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1826
2245
|
r0.prefetch(prefetch_res_offset);
|
|
1827
2246
|
|
|
1828
2247
|
// performs "inner" products
|
|
1829
|
-
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
|
|
2248
|
+
const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
|
|
1830
2249
|
LhsPacket A0, A1;
|
|
1831
2250
|
|
|
1832
|
-
for(Index k=0; k<peeled_kc; k+=pk)
|
|
1833
|
-
{
|
|
2251
|
+
for (Index k = 0; k < peeled_kc; k += pk) {
|
|
1834
2252
|
EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
|
|
1835
2253
|
RhsPacket B_0, B1;
|
|
1836
|
-
|
|
1837
|
-
#define EIGEN_GEBGP_ONESTEP(K)
|
|
1838
|
-
|
|
1839
|
-
|
|
1840
|
-
|
|
1841
|
-
|
|
1842
|
-
|
|
1843
|
-
|
|
1844
|
-
|
|
1845
|
-
|
|
1846
|
-
|
|
1847
|
-
|
|
1848
|
-
|
|
2254
|
+
|
|
2255
|
+
#define EIGEN_GEBGP_ONESTEP(K) \
|
|
2256
|
+
do { \
|
|
2257
|
+
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
|
|
2258
|
+
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
|
|
2259
|
+
traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
|
|
2260
|
+
traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
|
|
2261
|
+
traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
|
|
2262
|
+
traits.madd(A0, B_0, C0, B1, fix<0>); \
|
|
2263
|
+
traits.madd(A1, B_0, C4, B_0, fix<0>); \
|
|
2264
|
+
EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
|
|
2265
|
+
} while (false)
|
|
2266
|
+
|
|
1849
2267
|
EIGEN_GEBGP_ONESTEP(0);
|
|
1850
2268
|
EIGEN_GEBGP_ONESTEP(1);
|
|
1851
2269
|
EIGEN_GEBGP_ONESTEP(2);
|
|
@@ -1862,12 +2280,11 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1862
2280
|
}
|
|
1863
2281
|
|
|
1864
2282
|
// process the remaining iterations left over by the peeled loop
|
|
1865
|
-
for(Index k=peeled_kc; k<depth; k++)
|
|
1866
|
-
{
|
|
2283
|
+
for (Index k = peeled_kc; k < depth; k++) {
|
|
1867
2284
|
RhsPacket B_0, B1;
|
|
1868
2285
|
EIGEN_GEBGP_ONESTEP(0);
|
|
1869
2286
|
blB += RhsProgress;
|
|
1870
|
-
blA += 2*Traits::LhsProgress;
|
|
2287
|
+
blA += 2 * Traits::LhsProgress;
|
|
1871
2288
|
}
|
|
1872
2289
|
#undef EIGEN_GEBGP_ONESTEP
|
|
1873
2290
|
ResPacket R0, R1;
|
|
@@ -1879,197 +2296,252 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
1879
2296
|
traits.acc(C4, alphav, R1);
|
|
1880
2297
|
r0.storePacket(0 * Traits::ResPacketSize, R0);
|
|
1881
2298
|
r0.storePacket(1 * Traits::ResPacketSize, R1);
|
|
1882
|
-
}
|
|
1883
2299
|
}
|
|
1884
2300
|
}
|
|
1885
2301
|
}
|
|
1886
|
-
|
|
1887
|
-
|
|
1888
|
-
|
|
1889
|
-
|
|
1890
|
-
|
|
1891
|
-
|
|
1892
|
-
|
|
1893
|
-
|
|
1894
|
-
|
|
1895
|
-
|
|
1896
|
-
|
|
1897
|
-
|
|
1898
|
-
|
|
1899
|
-
|
|
1900
|
-
|
|
1901
|
-
|
|
1902
|
-
|
|
1903
|
-
|
|
1904
|
-
|
|
1905
|
-
|
|
1906
|
-
|
|
2302
|
+
}
|
|
2303
|
+
//---------- Process 1 * LhsProgress rows at once ----------
|
|
2304
|
+
if (mr >= 1 * Traits::LhsProgress) {
|
|
2305
|
+
lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket,
|
|
2306
|
+
RhsPacket, ResPacket, Traits, LinearMapper, DataMapper>
|
|
2307
|
+
p;
|
|
2308
|
+
p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset,
|
|
2309
|
+
peeled_kc, pk, cols, depth, packet_cols4);
|
|
2310
|
+
}
|
|
2311
|
+
//---------- Process LhsProgressHalf rows at once ----------
|
|
2312
|
+
if ((LhsProgressHalf < LhsProgress) && mr >= LhsProgressHalf) {
|
|
2313
|
+
lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf,
|
|
2314
|
+
LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper>
|
|
2315
|
+
p;
|
|
2316
|
+
p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset,
|
|
2317
|
+
peeled_kc, pk, cols, depth, packet_cols4);
|
|
2318
|
+
}
|
|
2319
|
+
//---------- Process LhsProgressQuarter rows at once ----------
|
|
2320
|
+
if ((LhsProgressQuarter < LhsProgressHalf) && mr >= LhsProgressQuarter) {
|
|
2321
|
+
lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar,
|
|
2322
|
+
AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter,
|
|
2323
|
+
QuarterTraits, LinearMapper, DataMapper>
|
|
2324
|
+
p;
|
|
2325
|
+
p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB,
|
|
2326
|
+
prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
|
|
2327
|
+
}
|
|
2328
|
+
//---------- Process remaining rows, 1 at once ----------
|
|
2329
|
+
if (peeled_mc_quarter < rows) {
|
|
2330
|
+
#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
|
|
2331
|
+
EIGEN_IF_CONSTEXPR(nr >= 8) {
|
|
1907
2332
|
// loop on each panel of the rhs
|
|
1908
|
-
for(Index j2=0; j2<
|
|
1909
|
-
{
|
|
2333
|
+
for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
|
|
1910
2334
|
// loop on each row of the lhs (1*LhsProgress x depth)
|
|
1911
|
-
for(Index i=peeled_mc_quarter; i<rows; i+=1)
|
|
1912
|
-
|
|
1913
|
-
const LhsScalar* blA = &blockA[i*strideA+offsetA];
|
|
2335
|
+
for (Index i = peeled_mc_quarter; i < rows; i += 1) {
|
|
2336
|
+
const LhsScalar* blA = &blockA[i * strideA + offsetA];
|
|
1914
2337
|
prefetch(&blA[0]);
|
|
1915
|
-
|
|
1916
|
-
|
|
1917
|
-
|
|
1918
|
-
|
|
1919
|
-
|
|
1920
|
-
|
|
1921
|
-
const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
|
|
1922
|
-
if ((SwappedTraits::LhsProgress % 4) == 0 &&
|
|
1923
|
-
(SwappedTraits::LhsProgress<=16) &&
|
|
1924
|
-
(SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) &&
|
|
1925
|
-
(SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
|
|
1926
|
-
{
|
|
1927
|
-
SAccPacket C0, C1, C2, C3;
|
|
1928
|
-
straits.initAcc(C0);
|
|
1929
|
-
straits.initAcc(C1);
|
|
1930
|
-
straits.initAcc(C2);
|
|
1931
|
-
straits.initAcc(C3);
|
|
1932
|
-
|
|
1933
|
-
const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
|
|
1934
|
-
const Index endk = (depth/spk)*spk;
|
|
1935
|
-
const Index endk4 = (depth/(spk*4))*(spk*4);
|
|
1936
|
-
|
|
1937
|
-
Index k=0;
|
|
1938
|
-
for(; k<endk4; k+=4*spk)
|
|
1939
|
-
{
|
|
1940
|
-
SLhsPacket A0,A1;
|
|
1941
|
-
SRhsPacket B_0,B_1;
|
|
1942
|
-
|
|
1943
|
-
straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
|
|
1944
|
-
straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
|
|
1945
|
-
|
|
1946
|
-
straits.loadRhsQuad(blA+0*spk, B_0);
|
|
1947
|
-
straits.loadRhsQuad(blA+1*spk, B_1);
|
|
1948
|
-
straits.madd(A0,B_0,C0,B_0, fix<0>);
|
|
1949
|
-
straits.madd(A1,B_1,C1,B_1, fix<0>);
|
|
1950
|
-
|
|
1951
|
-
straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
|
|
1952
|
-
straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
|
|
1953
|
-
straits.loadRhsQuad(blA+2*spk, B_0);
|
|
1954
|
-
straits.loadRhsQuad(blA+3*spk, B_1);
|
|
1955
|
-
straits.madd(A0,B_0,C2,B_0, fix<0>);
|
|
1956
|
-
straits.madd(A1,B_1,C3,B_1, fix<0>);
|
|
1957
|
-
|
|
1958
|
-
blB += 4*SwappedTraits::LhsProgress;
|
|
1959
|
-
blA += 4*spk;
|
|
1960
|
-
}
|
|
1961
|
-
C0 = padd(padd(C0,C1),padd(C2,C3));
|
|
1962
|
-
for(; k<endk; k+=spk)
|
|
1963
|
-
{
|
|
1964
|
-
SLhsPacket A0;
|
|
1965
|
-
SRhsPacket B_0;
|
|
1966
|
-
|
|
1967
|
-
straits.loadLhsUnaligned(blB, A0);
|
|
1968
|
-
straits.loadRhsQuad(blA, B_0);
|
|
1969
|
-
straits.madd(A0,B_0,C0,B_0, fix<0>);
|
|
1970
|
-
|
|
1971
|
-
blB += SwappedTraits::LhsProgress;
|
|
1972
|
-
blA += spk;
|
|
1973
|
-
}
|
|
1974
|
-
if(SwappedTraits::LhsProgress==8)
|
|
1975
|
-
{
|
|
1976
|
-
// Special case where we have to first reduce the accumulation register C0
|
|
1977
|
-
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
|
|
1978
|
-
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
|
|
1979
|
-
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
|
|
1980
|
-
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
|
|
1981
|
-
|
|
1982
|
-
SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
|
|
1983
|
-
SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
|
|
1984
|
-
|
|
1985
|
-
if(depth-endk>0)
|
|
1986
|
-
{
|
|
1987
|
-
// We have to handle the last row of the rhs which corresponds to a half-packet
|
|
1988
|
-
SLhsPacketHalf a0;
|
|
1989
|
-
SRhsPacketHalf b0;
|
|
1990
|
-
straits.loadLhsUnaligned(blB, a0);
|
|
1991
|
-
straits.loadRhs(blA, b0);
|
|
1992
|
-
SAccPacketHalf c0 = predux_half_dowto4(C0);
|
|
1993
|
-
straits.madd(a0,b0,c0,b0, fix<0>);
|
|
1994
|
-
straits.acc(c0, alphav, R);
|
|
1995
|
-
}
|
|
1996
|
-
else
|
|
1997
|
-
{
|
|
1998
|
-
straits.acc(predux_half_dowto4(C0), alphav, R);
|
|
1999
|
-
}
|
|
2000
|
-
res.scatterPacket(i, j2, R);
|
|
2001
|
-
}
|
|
2002
|
-
else if (SwappedTraits::LhsProgress==16)
|
|
2003
|
-
{
|
|
2004
|
-
// Special case where we have to first reduce the
|
|
2005
|
-
// accumulation register C0. We specialize the block in
|
|
2006
|
-
// template form, so that LhsProgress < 16 paths don't
|
|
2007
|
-
// fail to compile
|
|
2008
|
-
last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
|
|
2009
|
-
p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0);
|
|
2010
|
-
}
|
|
2011
|
-
else
|
|
2012
|
-
{
|
|
2013
|
-
SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
|
|
2014
|
-
SResPacket alphav = pset1<SResPacket>(alpha);
|
|
2015
|
-
straits.acc(C0, alphav, R);
|
|
2016
|
-
res.scatterPacket(i, j2, R);
|
|
2017
|
-
}
|
|
2018
|
-
}
|
|
2019
|
-
else // scalar path
|
|
2020
|
-
{
|
|
2021
|
-
// get a 1 x 4 res block as registers
|
|
2022
|
-
ResScalar C0(0), C1(0), C2(0), C3(0);
|
|
2338
|
+
// gets a 1 x 8 res block as registers
|
|
2339
|
+
ResScalar C0(0), C1(0), C2(0), C3(0), C4(0), C5(0), C6(0), C7(0);
|
|
2340
|
+
const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
|
|
2341
|
+
for (Index k = 0; k < depth; k++) {
|
|
2342
|
+
LhsScalar A0 = blA[k];
|
|
2343
|
+
RhsScalar B_0;
|
|
2023
2344
|
|
|
2024
|
-
|
|
2025
|
-
|
|
2026
|
-
LhsScalar A0;
|
|
2027
|
-
RhsScalar B_0, B_1;
|
|
2345
|
+
B_0 = blB[0];
|
|
2346
|
+
C0 = cj.pmadd(A0, B_0, C0);
|
|
2028
2347
|
|
|
2029
|
-
|
|
2348
|
+
B_0 = blB[1];
|
|
2349
|
+
C1 = cj.pmadd(A0, B_0, C1);
|
|
2030
2350
|
|
|
2031
|
-
|
|
2032
|
-
|
|
2033
|
-
C0 = cj.pmadd(A0,B_0,C0);
|
|
2034
|
-
C1 = cj.pmadd(A0,B_1,C1);
|
|
2351
|
+
B_0 = blB[2];
|
|
2352
|
+
C2 = cj.pmadd(A0, B_0, C2);
|
|
2035
2353
|
|
|
2036
|
-
|
|
2037
|
-
|
|
2038
|
-
C2 = cj.pmadd(A0,B_0,C2);
|
|
2039
|
-
C3 = cj.pmadd(A0,B_1,C3);
|
|
2354
|
+
B_0 = blB[3];
|
|
2355
|
+
C3 = cj.pmadd(A0, B_0, C3);
|
|
2040
2356
|
|
|
2041
|
-
|
|
2042
|
-
|
|
2043
|
-
|
|
2044
|
-
|
|
2045
|
-
|
|
2046
|
-
|
|
2357
|
+
B_0 = blB[4];
|
|
2358
|
+
C4 = cj.pmadd(A0, B_0, C4);
|
|
2359
|
+
|
|
2360
|
+
B_0 = blB[5];
|
|
2361
|
+
C5 = cj.pmadd(A0, B_0, C5);
|
|
2362
|
+
|
|
2363
|
+
B_0 = blB[6];
|
|
2364
|
+
C6 = cj.pmadd(A0, B_0, C6);
|
|
2365
|
+
|
|
2366
|
+
B_0 = blB[7];
|
|
2367
|
+
C7 = cj.pmadd(A0, B_0, C7);
|
|
2368
|
+
|
|
2369
|
+
blB += 8;
|
|
2047
2370
|
}
|
|
2371
|
+
res(i, j2 + 0) += alpha * C0;
|
|
2372
|
+
res(i, j2 + 1) += alpha * C1;
|
|
2373
|
+
res(i, j2 + 2) += alpha * C2;
|
|
2374
|
+
res(i, j2 + 3) += alpha * C3;
|
|
2375
|
+
res(i, j2 + 4) += alpha * C4;
|
|
2376
|
+
res(i, j2 + 5) += alpha * C5;
|
|
2377
|
+
res(i, j2 + 6) += alpha * C6;
|
|
2378
|
+
res(i, j2 + 7) += alpha * C7;
|
|
2048
2379
|
}
|
|
2049
2380
|
}
|
|
2050
|
-
|
|
2051
|
-
|
|
2052
|
-
|
|
2053
|
-
|
|
2054
|
-
|
|
2381
|
+
}
|
|
2382
|
+
#endif
|
|
2383
|
+
|
|
2384
|
+
for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
|
|
2385
|
+
// loop on each row of the lhs (1*LhsProgress x depth)
|
|
2386
|
+
for (Index i = peeled_mc_quarter; i < rows; i += 1) {
|
|
2387
|
+
const LhsScalar* blA = &blockA[i * strideA + offsetA];
|
|
2388
|
+
prefetch(&blA[0]);
|
|
2389
|
+
const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
|
|
2390
|
+
|
|
2391
|
+
// If LhsProgress is 8 or 16, it assumes that there is a
|
|
2392
|
+
// half or quarter packet, respectively, of the same size as
|
|
2393
|
+
// nr (which is currently 4) for the return type.
|
|
2394
|
+
const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
|
|
2395
|
+
const int SResPacketQuarterSize =
|
|
2396
|
+
unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
|
|
2397
|
+
// The following code assumes we can load SRhsPacket in such a way that
|
|
2398
|
+
// it multiplies blocks of 4 elements in SLhsPacket. This is not the
|
|
2399
|
+
// case for some customized kernels (e.g. NEON fp16). If the assumption
|
|
2400
|
+
// fails, drop down to the scalar path.
|
|
2401
|
+
constexpr bool kCanLoadSRhsQuad =
|
|
2402
|
+
(unpacket_traits<SLhsPacket>::size < 4) ||
|
|
2403
|
+
(unpacket_traits<SRhsPacket>::size % ((std::max<int>)(unpacket_traits<SLhsPacket>::size, 4) / 4)) == 0;
|
|
2404
|
+
if (kCanLoadSRhsQuad && (SwappedTraits::LhsProgress % 4) == 0 && (SwappedTraits::LhsProgress <= 16) &&
|
|
2405
|
+
(SwappedTraits::LhsProgress != 8 || SResPacketHalfSize == nr) &&
|
|
2406
|
+
(SwappedTraits::LhsProgress != 16 || SResPacketQuarterSize == nr)) {
|
|
2407
|
+
SAccPacket C0, C1, C2, C3;
|
|
2408
|
+
straits.initAcc(C0);
|
|
2409
|
+
straits.initAcc(C1);
|
|
2410
|
+
straits.initAcc(C2);
|
|
2411
|
+
straits.initAcc(C3);
|
|
2412
|
+
|
|
2413
|
+
const Index spk = (std::max)(1, SwappedTraits::LhsProgress / 4);
|
|
2414
|
+
const Index endk = (depth / spk) * spk;
|
|
2415
|
+
const Index endk4 = (depth / (spk * 4)) * (spk * 4);
|
|
2416
|
+
|
|
2417
|
+
Index k = 0;
|
|
2418
|
+
for (; k < endk4; k += 4 * spk) {
|
|
2419
|
+
SLhsPacket A0, A1;
|
|
2420
|
+
SRhsPacket B_0, B_1;
|
|
2421
|
+
|
|
2422
|
+
straits.loadLhsUnaligned(blB + 0 * SwappedTraits::LhsProgress, A0);
|
|
2423
|
+
straits.loadLhsUnaligned(blB + 1 * SwappedTraits::LhsProgress, A1);
|
|
2424
|
+
|
|
2425
|
+
straits.loadRhsQuad(blA + 0 * spk, B_0);
|
|
2426
|
+
straits.loadRhsQuad(blA + 1 * spk, B_1);
|
|
2427
|
+
straits.madd(A0, B_0, C0, B_0, fix<0>);
|
|
2428
|
+
straits.madd(A1, B_1, C1, B_1, fix<0>);
|
|
2429
|
+
|
|
2430
|
+
straits.loadLhsUnaligned(blB + 2 * SwappedTraits::LhsProgress, A0);
|
|
2431
|
+
straits.loadLhsUnaligned(blB + 3 * SwappedTraits::LhsProgress, A1);
|
|
2432
|
+
straits.loadRhsQuad(blA + 2 * spk, B_0);
|
|
2433
|
+
straits.loadRhsQuad(blA + 3 * spk, B_1);
|
|
2434
|
+
straits.madd(A0, B_0, C2, B_0, fix<0>);
|
|
2435
|
+
straits.madd(A1, B_1, C3, B_1, fix<0>);
|
|
2436
|
+
|
|
2437
|
+
blB += 4 * SwappedTraits::LhsProgress;
|
|
2438
|
+
blA += 4 * spk;
|
|
2439
|
+
}
|
|
2440
|
+
C0 = padd(padd(C0, C1), padd(C2, C3));
|
|
2441
|
+
for (; k < endk; k += spk) {
|
|
2442
|
+
SLhsPacket A0;
|
|
2443
|
+
SRhsPacket B_0;
|
|
2444
|
+
|
|
2445
|
+
straits.loadLhsUnaligned(blB, A0);
|
|
2446
|
+
straits.loadRhsQuad(blA, B_0);
|
|
2447
|
+
straits.madd(A0, B_0, C0, B_0, fix<0>);
|
|
2448
|
+
|
|
2449
|
+
blB += SwappedTraits::LhsProgress;
|
|
2450
|
+
blA += spk;
|
|
2451
|
+
}
|
|
2452
|
+
if (SwappedTraits::LhsProgress == 8) {
|
|
2453
|
+
// Special case where we have to first reduce the accumulation register C0
|
|
2454
|
+
typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SResPacket>::half,
|
|
2455
|
+
SResPacket>
|
|
2456
|
+
SResPacketHalf;
|
|
2457
|
+
typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SLhsPacket>::half,
|
|
2458
|
+
SLhsPacket>
|
|
2459
|
+
SLhsPacketHalf;
|
|
2460
|
+
typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SRhsPacket>::half,
|
|
2461
|
+
SRhsPacket>
|
|
2462
|
+
SRhsPacketHalf;
|
|
2463
|
+
typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SAccPacket>::half,
|
|
2464
|
+
SAccPacket>
|
|
2465
|
+
SAccPacketHalf;
|
|
2466
|
+
|
|
2467
|
+
SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
|
|
2468
|
+
SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
|
|
2469
|
+
|
|
2470
|
+
if (depth - endk > 0) {
|
|
2471
|
+
// We have to handle the last row of the rhs which corresponds to a half-packet
|
|
2472
|
+
SLhsPacketHalf a0;
|
|
2473
|
+
SRhsPacketHalf b0;
|
|
2474
|
+
straits.loadLhsUnaligned(blB, a0);
|
|
2475
|
+
straits.loadRhs(blA, b0);
|
|
2476
|
+
SAccPacketHalf c0 = predux_half_dowto4(C0);
|
|
2477
|
+
straits.madd(a0, b0, c0, b0, fix<0>);
|
|
2478
|
+
straits.acc(c0, alphav, R);
|
|
2479
|
+
} else {
|
|
2480
|
+
straits.acc(predux_half_dowto4(C0), alphav, R);
|
|
2481
|
+
}
|
|
2482
|
+
res.scatterPacket(i, j2, R);
|
|
2483
|
+
} else if (SwappedTraits::LhsProgress == 16) {
|
|
2484
|
+
// Special case where we have to first reduce the
|
|
2485
|
+
// accumulation register C0. We specialize the block in
|
|
2486
|
+
// template form, so that LhsProgress < 16 paths don't
|
|
2487
|
+
// fail to compile
|
|
2488
|
+
last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
|
|
2489
|
+
p(res, straits, blA, blB, depth, endk, i, j2, alpha, C0);
|
|
2490
|
+
} else {
|
|
2491
|
+
SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
|
|
2492
|
+
SResPacket alphav = pset1<SResPacket>(alpha);
|
|
2493
|
+
straits.acc(C0, alphav, R);
|
|
2494
|
+
res.scatterPacket(i, j2, R);
|
|
2495
|
+
}
|
|
2496
|
+
} else // scalar path
|
|
2055
2497
|
{
|
|
2056
|
-
|
|
2057
|
-
|
|
2058
|
-
|
|
2059
|
-
|
|
2060
|
-
|
|
2061
|
-
|
|
2062
|
-
|
|
2063
|
-
|
|
2064
|
-
|
|
2498
|
+
// get a 1 x 4 res block as registers
|
|
2499
|
+
ResScalar C0(0), C1(0), C2(0), C3(0);
|
|
2500
|
+
|
|
2501
|
+
for (Index k = 0; k < depth; k++) {
|
|
2502
|
+
LhsScalar A0;
|
|
2503
|
+
RhsScalar B_0, B_1;
|
|
2504
|
+
|
|
2505
|
+
A0 = blA[k];
|
|
2506
|
+
|
|
2507
|
+
B_0 = blB[0];
|
|
2508
|
+
B_1 = blB[1];
|
|
2065
2509
|
C0 = cj.pmadd(A0, B_0, C0);
|
|
2510
|
+
C1 = cj.pmadd(A0, B_1, C1);
|
|
2511
|
+
|
|
2512
|
+
B_0 = blB[2];
|
|
2513
|
+
B_1 = blB[3];
|
|
2514
|
+
C2 = cj.pmadd(A0, B_0, C2);
|
|
2515
|
+
C3 = cj.pmadd(A0, B_1, C3);
|
|
2516
|
+
|
|
2517
|
+
blB += 4;
|
|
2066
2518
|
}
|
|
2067
|
-
res(i, j2) += alpha * C0;
|
|
2519
|
+
res(i, j2 + 0) += alpha * C0;
|
|
2520
|
+
res(i, j2 + 1) += alpha * C1;
|
|
2521
|
+
res(i, j2 + 2) += alpha * C2;
|
|
2522
|
+
res(i, j2 + 3) += alpha * C3;
|
|
2068
2523
|
}
|
|
2069
2524
|
}
|
|
2070
2525
|
}
|
|
2526
|
+
// remaining columns
|
|
2527
|
+
for (Index j2 = packet_cols4; j2 < cols; j2++) {
|
|
2528
|
+
// loop on each row of the lhs (1*LhsProgress x depth)
|
|
2529
|
+
for (Index i = peeled_mc_quarter; i < rows; i += 1) {
|
|
2530
|
+
const LhsScalar* blA = &blockA[i * strideA + offsetA];
|
|
2531
|
+
prefetch(&blA[0]);
|
|
2532
|
+
// gets a 1 x 1 res block as registers
|
|
2533
|
+
ResScalar C0(0);
|
|
2534
|
+
const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
|
|
2535
|
+
for (Index k = 0; k < depth; k++) {
|
|
2536
|
+
LhsScalar A0 = blA[k];
|
|
2537
|
+
RhsScalar B_0 = blB[k];
|
|
2538
|
+
C0 = cj.pmadd(A0, B_0, C0);
|
|
2539
|
+
}
|
|
2540
|
+
res(i, j2) += alpha * C0;
|
|
2541
|
+
}
|
|
2542
|
+
}
|
|
2071
2543
|
}
|
|
2072
|
-
|
|
2544
|
+
}
|
|
2073
2545
|
|
|
2074
2546
|
// pack a block of the lhs
|
|
2075
2547
|
// The traversal is as follows (mr==4):
|
|
@@ -2085,131 +2557,129 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
|
|
|
2085
2557
|
//
|
|
2086
2558
|
// 32 33 34 35 ...
|
|
2087
2559
|
// 36 37 38 39 ...
|
|
2088
|
-
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
|
|
2089
|
-
|
|
2090
|
-
{
|
|
2560
|
+
template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
|
|
2561
|
+
bool PanelMode>
|
|
2562
|
+
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> {
|
|
2091
2563
|
typedef typename DataMapper::LinearMapper LinearMapper;
|
|
2092
|
-
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0,
|
|
2564
|
+
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
|
|
2565
|
+
Index offset = 0);
|
|
2093
2566
|
};
|
|
2094
2567
|
|
|
2095
|
-
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
|
|
2096
|
-
|
|
2097
|
-
|
|
2098
|
-
|
|
2568
|
+
template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
|
|
2569
|
+
bool PanelMode>
|
|
2570
|
+
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate,
|
|
2571
|
+
PanelMode>::operator()(Scalar* blockA, const DataMapper& lhs, Index depth,
|
|
2572
|
+
Index rows, Index stride, Index offset) {
|
|
2099
2573
|
typedef typename unpacket_traits<Packet>::half HalfPacket;
|
|
2100
2574
|
typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
|
|
2101
|
-
enum {
|
|
2102
|
-
|
|
2103
|
-
|
|
2104
|
-
|
|
2105
|
-
|
|
2575
|
+
enum {
|
|
2576
|
+
PacketSize = unpacket_traits<Packet>::size,
|
|
2577
|
+
HalfPacketSize = unpacket_traits<HalfPacket>::size,
|
|
2578
|
+
QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
|
|
2579
|
+
HasHalf = (int)HalfPacketSize < (int)PacketSize,
|
|
2580
|
+
HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize
|
|
2581
|
+
};
|
|
2106
2582
|
|
|
2107
2583
|
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
|
|
2108
2584
|
EIGEN_UNUSED_VARIABLE(stride);
|
|
2109
2585
|
EIGEN_UNUSED_VARIABLE(offset);
|
|
2110
|
-
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
|
|
2111
|
-
eigen_assert(
|
|
2586
|
+
eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
|
|
2587
|
+
eigen_assert(((Pack1 % PacketSize) == 0 && Pack1 <= 4 * PacketSize) || (Pack1 <= 4));
|
|
2112
2588
|
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
|
|
2113
2589
|
Index count = 0;
|
|
2114
2590
|
|
|
2115
|
-
const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
|
|
2116
|
-
const Index peeled_mc2 =
|
|
2117
|
-
|
|
2118
|
-
const Index
|
|
2119
|
-
|
|
2591
|
+
const Index peeled_mc3 = Pack1 >= 3 * PacketSize ? (rows / (3 * PacketSize)) * (3 * PacketSize) : 0;
|
|
2592
|
+
const Index peeled_mc2 =
|
|
2593
|
+
Pack1 >= 2 * PacketSize ? peeled_mc3 + ((rows - peeled_mc3) / (2 * PacketSize)) * (2 * PacketSize) : 0;
|
|
2594
|
+
const Index peeled_mc1 =
|
|
2595
|
+
Pack1 >= 1 * PacketSize ? peeled_mc2 + ((rows - peeled_mc2) / (1 * PacketSize)) * (1 * PacketSize) : 0;
|
|
2596
|
+
const Index peeled_mc_half =
|
|
2597
|
+
Pack1 >= HalfPacketSize ? peeled_mc1 + ((rows - peeled_mc1) / (HalfPacketSize)) * (HalfPacketSize) : 0;
|
|
2598
|
+
const Index peeled_mc_quarter = Pack1 >= QuarterPacketSize ? (rows / (QuarterPacketSize)) * (QuarterPacketSize) : 0;
|
|
2120
2599
|
const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
|
|
2121
|
-
const Index peeled_mc0 = Pack2>=PacketSize
|
|
2122
|
-
|
|
2600
|
+
const Index peeled_mc0 = Pack2 >= PacketSize ? peeled_mc_quarter
|
|
2601
|
+
: Pack2 > 1 && last_lhs_progress ? (rows / last_lhs_progress) * last_lhs_progress
|
|
2602
|
+
: 0;
|
|
2123
2603
|
|
|
2124
|
-
Index i=0;
|
|
2604
|
+
Index i = 0;
|
|
2125
2605
|
|
|
2126
2606
|
// Pack 3 packets
|
|
2127
|
-
if(Pack1>=3*PacketSize)
|
|
2128
|
-
|
|
2129
|
-
|
|
2130
|
-
{
|
|
2131
|
-
if(PanelMode) count += (3*PacketSize) * offset;
|
|
2607
|
+
if (Pack1 >= 3 * PacketSize) {
|
|
2608
|
+
for (; i < peeled_mc3; i += 3 * PacketSize) {
|
|
2609
|
+
if (PanelMode) count += (3 * PacketSize) * offset;
|
|
2132
2610
|
|
|
2133
|
-
for(Index k=0; k<depth; k++)
|
|
2134
|
-
{
|
|
2611
|
+
for (Index k = 0; k < depth; k++) {
|
|
2135
2612
|
Packet A, B, C;
|
|
2136
|
-
A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
|
|
2137
|
-
B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
|
|
2138
|
-
C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
|
|
2139
|
-
pstore(blockA+count, cj.pconj(A));
|
|
2140
|
-
|
|
2141
|
-
pstore(blockA+count, cj.pconj(
|
|
2613
|
+
A = lhs.template loadPacket<Packet>(i + 0 * PacketSize, k);
|
|
2614
|
+
B = lhs.template loadPacket<Packet>(i + 1 * PacketSize, k);
|
|
2615
|
+
C = lhs.template loadPacket<Packet>(i + 2 * PacketSize, k);
|
|
2616
|
+
pstore(blockA + count, cj.pconj(A));
|
|
2617
|
+
count += PacketSize;
|
|
2618
|
+
pstore(blockA + count, cj.pconj(B));
|
|
2619
|
+
count += PacketSize;
|
|
2620
|
+
pstore(blockA + count, cj.pconj(C));
|
|
2621
|
+
count += PacketSize;
|
|
2142
2622
|
}
|
|
2143
|
-
if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
|
|
2623
|
+
if (PanelMode) count += (3 * PacketSize) * (stride - offset - depth);
|
|
2144
2624
|
}
|
|
2145
2625
|
}
|
|
2146
2626
|
// Pack 2 packets
|
|
2147
|
-
if(Pack1>=2*PacketSize)
|
|
2148
|
-
|
|
2149
|
-
|
|
2150
|
-
{
|
|
2151
|
-
if(PanelMode) count += (2*PacketSize) * offset;
|
|
2627
|
+
if (Pack1 >= 2 * PacketSize) {
|
|
2628
|
+
for (; i < peeled_mc2; i += 2 * PacketSize) {
|
|
2629
|
+
if (PanelMode) count += (2 * PacketSize) * offset;
|
|
2152
2630
|
|
|
2153
|
-
for(Index k=0; k<depth; k++)
|
|
2154
|
-
{
|
|
2631
|
+
for (Index k = 0; k < depth; k++) {
|
|
2155
2632
|
Packet A, B;
|
|
2156
|
-
A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
|
|
2157
|
-
B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
|
|
2158
|
-
pstore(blockA+count, cj.pconj(A));
|
|
2159
|
-
|
|
2633
|
+
A = lhs.template loadPacket<Packet>(i + 0 * PacketSize, k);
|
|
2634
|
+
B = lhs.template loadPacket<Packet>(i + 1 * PacketSize, k);
|
|
2635
|
+
pstore(blockA + count, cj.pconj(A));
|
|
2636
|
+
count += PacketSize;
|
|
2637
|
+
pstore(blockA + count, cj.pconj(B));
|
|
2638
|
+
count += PacketSize;
|
|
2160
2639
|
}
|
|
2161
|
-
if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
|
|
2640
|
+
if (PanelMode) count += (2 * PacketSize) * (stride - offset - depth);
|
|
2162
2641
|
}
|
|
2163
2642
|
}
|
|
2164
2643
|
// Pack 1 packets
|
|
2165
|
-
if(Pack1>=1*PacketSize)
|
|
2166
|
-
|
|
2167
|
-
|
|
2168
|
-
{
|
|
2169
|
-
if(PanelMode) count += (1*PacketSize) * offset;
|
|
2644
|
+
if (Pack1 >= 1 * PacketSize) {
|
|
2645
|
+
for (; i < peeled_mc1; i += 1 * PacketSize) {
|
|
2646
|
+
if (PanelMode) count += (1 * PacketSize) * offset;
|
|
2170
2647
|
|
|
2171
|
-
for(Index k=0; k<depth; k++)
|
|
2172
|
-
{
|
|
2648
|
+
for (Index k = 0; k < depth; k++) {
|
|
2173
2649
|
Packet A;
|
|
2174
|
-
A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
|
|
2175
|
-
pstore(blockA+count, cj.pconj(A));
|
|
2176
|
-
count+=PacketSize;
|
|
2650
|
+
A = lhs.template loadPacket<Packet>(i + 0 * PacketSize, k);
|
|
2651
|
+
pstore(blockA + count, cj.pconj(A));
|
|
2652
|
+
count += PacketSize;
|
|
2177
2653
|
}
|
|
2178
|
-
if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
|
|
2654
|
+
if (PanelMode) count += (1 * PacketSize) * (stride - offset - depth);
|
|
2179
2655
|
}
|
|
2180
2656
|
}
|
|
2181
2657
|
// Pack half packets
|
|
2182
|
-
if(HasHalf && Pack1>=HalfPacketSize)
|
|
2183
|
-
|
|
2184
|
-
|
|
2185
|
-
{
|
|
2186
|
-
if(PanelMode) count += (HalfPacketSize) * offset;
|
|
2658
|
+
if (HasHalf && Pack1 >= HalfPacketSize) {
|
|
2659
|
+
for (; i < peeled_mc_half; i += HalfPacketSize) {
|
|
2660
|
+
if (PanelMode) count += (HalfPacketSize)*offset;
|
|
2187
2661
|
|
|
2188
|
-
for(Index k=0; k<depth; k++)
|
|
2189
|
-
{
|
|
2662
|
+
for (Index k = 0; k < depth; k++) {
|
|
2190
2663
|
HalfPacket A;
|
|
2191
|
-
A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
|
|
2192
|
-
pstoreu(blockA+count, cj.pconj(A));
|
|
2193
|
-
count+=HalfPacketSize;
|
|
2664
|
+
A = lhs.template loadPacket<HalfPacket>(i + 0 * (HalfPacketSize), k);
|
|
2665
|
+
pstoreu(blockA + count, cj.pconj(A));
|
|
2666
|
+
count += HalfPacketSize;
|
|
2194
2667
|
}
|
|
2195
|
-
if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
|
|
2668
|
+
if (PanelMode) count += (HalfPacketSize) * (stride - offset - depth);
|
|
2196
2669
|
}
|
|
2197
2670
|
}
|
|
2198
2671
|
// Pack quarter packets
|
|
2199
|
-
if(HasQuarter && Pack1>=QuarterPacketSize)
|
|
2200
|
-
|
|
2201
|
-
|
|
2202
|
-
{
|
|
2203
|
-
if(PanelMode) count += (QuarterPacketSize) * offset;
|
|
2672
|
+
if (HasQuarter && Pack1 >= QuarterPacketSize) {
|
|
2673
|
+
for (; i < peeled_mc_quarter; i += QuarterPacketSize) {
|
|
2674
|
+
if (PanelMode) count += (QuarterPacketSize)*offset;
|
|
2204
2675
|
|
|
2205
|
-
for(Index k=0; k<depth; k++)
|
|
2206
|
-
{
|
|
2676
|
+
for (Index k = 0; k < depth; k++) {
|
|
2207
2677
|
QuarterPacket A;
|
|
2208
|
-
A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
|
|
2209
|
-
pstoreu(blockA+count, cj.pconj(A));
|
|
2210
|
-
count+=QuarterPacketSize;
|
|
2678
|
+
A = lhs.template loadPacket<QuarterPacket>(i + 0 * (QuarterPacketSize), k);
|
|
2679
|
+
pstoreu(blockA + count, cj.pconj(A));
|
|
2680
|
+
count += QuarterPacketSize;
|
|
2211
2681
|
}
|
|
2212
|
-
if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
|
|
2682
|
+
if (PanelMode) count += (QuarterPacketSize) * (stride - offset - depth);
|
|
2213
2683
|
}
|
|
2214
2684
|
}
|
|
2215
2685
|
// Pack2 may be *smaller* than PacketSize—that happens for
|
|
@@ -2218,128 +2688,118 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
|
|
|
2218
2688
|
// address both real & imaginary parts on the rhs. This portion will
|
|
2219
2689
|
// pack those half ones until they match the number expected on the
|
|
2220
2690
|
// last peeling loop at this point (for the rhs).
|
|
2221
|
-
if(Pack2<PacketSize && Pack2>1)
|
|
2222
|
-
|
|
2223
|
-
|
|
2224
|
-
{
|
|
2225
|
-
if(PanelMode) count += last_lhs_progress * offset;
|
|
2691
|
+
if (Pack2 < PacketSize && Pack2 > 1) {
|
|
2692
|
+
for (; i < peeled_mc0; i += last_lhs_progress) {
|
|
2693
|
+
if (PanelMode) count += last_lhs_progress * offset;
|
|
2226
2694
|
|
|
2227
|
-
for(Index k=0; k<depth; k++)
|
|
2228
|
-
for(Index w=0; w<last_lhs_progress; w++)
|
|
2229
|
-
blockA[count++] = cj(lhs(i+w, k));
|
|
2695
|
+
for (Index k = 0; k < depth; k++)
|
|
2696
|
+
for (Index w = 0; w < last_lhs_progress; w++) blockA[count++] = cj(lhs(i + w, k));
|
|
2230
2697
|
|
|
2231
|
-
if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
|
|
2698
|
+
if (PanelMode) count += last_lhs_progress * (stride - offset - depth);
|
|
2232
2699
|
}
|
|
2233
2700
|
}
|
|
2234
2701
|
// Pack scalars
|
|
2235
|
-
for(; i<rows; i++)
|
|
2236
|
-
|
|
2237
|
-
|
|
2238
|
-
|
|
2239
|
-
blockA[count++] = cj(lhs(i, k));
|
|
2240
|
-
if(PanelMode) count += (stride-offset-depth);
|
|
2702
|
+
for (; i < rows; i++) {
|
|
2703
|
+
if (PanelMode) count += offset;
|
|
2704
|
+
for (Index k = 0; k < depth; k++) blockA[count++] = cj(lhs(i, k));
|
|
2705
|
+
if (PanelMode) count += (stride - offset - depth);
|
|
2241
2706
|
}
|
|
2242
2707
|
}
|
|
2243
2708
|
|
|
2244
|
-
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
|
|
2245
|
-
|
|
2246
|
-
{
|
|
2709
|
+
template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
|
|
2710
|
+
bool PanelMode>
|
|
2711
|
+
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> {
|
|
2247
2712
|
typedef typename DataMapper::LinearMapper LinearMapper;
|
|
2248
|
-
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0,
|
|
2713
|
+
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
|
|
2714
|
+
Index offset = 0);
|
|
2249
2715
|
};
|
|
2250
2716
|
|
|
2251
|
-
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
|
|
2252
|
-
|
|
2253
|
-
|
|
2254
|
-
|
|
2717
|
+
template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
|
|
2718
|
+
bool PanelMode>
|
|
2719
|
+
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate,
|
|
2720
|
+
PanelMode>::operator()(Scalar* blockA, const DataMapper& lhs, Index depth,
|
|
2721
|
+
Index rows, Index stride, Index offset) {
|
|
2255
2722
|
typedef typename unpacket_traits<Packet>::half HalfPacket;
|
|
2256
2723
|
typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
|
|
2257
|
-
enum {
|
|
2258
|
-
|
|
2259
|
-
|
|
2260
|
-
|
|
2261
|
-
|
|
2724
|
+
enum {
|
|
2725
|
+
PacketSize = unpacket_traits<Packet>::size,
|
|
2726
|
+
HalfPacketSize = unpacket_traits<HalfPacket>::size,
|
|
2727
|
+
QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
|
|
2728
|
+
HasHalf = (int)HalfPacketSize < (int)PacketSize,
|
|
2729
|
+
HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize
|
|
2730
|
+
};
|
|
2262
2731
|
|
|
2263
2732
|
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
|
|
2264
2733
|
EIGEN_UNUSED_VARIABLE(stride);
|
|
2265
2734
|
EIGEN_UNUSED_VARIABLE(offset);
|
|
2266
|
-
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
|
|
2735
|
+
eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
|
|
2267
2736
|
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
|
|
2268
2737
|
Index count = 0;
|
|
2269
2738
|
bool gone_half = false, gone_quarter = false, gone_last = false;
|
|
2270
2739
|
|
|
2271
2740
|
Index i = 0;
|
|
2272
|
-
|
|
2273
|
-
|
|
2274
|
-
while(pack>0)
|
|
2275
|
-
|
|
2276
|
-
Index
|
|
2277
|
-
Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
|
|
2741
|
+
Index pack = Pack1;
|
|
2742
|
+
Index psize = PacketSize;
|
|
2743
|
+
while (pack > 0) {
|
|
2744
|
+
Index remaining_rows = rows - i;
|
|
2745
|
+
Index peeled_mc = gone_last ? Pack2 > 1 ? (rows / pack) * pack : 0 : i + (remaining_rows / pack) * pack;
|
|
2278
2746
|
Index starting_pos = i;
|
|
2279
|
-
for(; i<peeled_mc; i+=pack)
|
|
2280
|
-
|
|
2281
|
-
|
|
2282
|
-
|
|
2283
|
-
|
|
2284
|
-
|
|
2285
|
-
|
|
2286
|
-
|
|
2287
|
-
for(; k<peeled_k; k+=psize)
|
|
2288
|
-
{
|
|
2289
|
-
for (Index m = 0; m < pack; m += psize)
|
|
2290
|
-
{
|
|
2747
|
+
for (; i < peeled_mc; i += pack) {
|
|
2748
|
+
if (PanelMode) count += pack * offset;
|
|
2749
|
+
|
|
2750
|
+
Index k = 0;
|
|
2751
|
+
if (pack >= psize && psize >= QuarterPacketSize) {
|
|
2752
|
+
const Index peeled_k = (depth / psize) * psize;
|
|
2753
|
+
for (; k < peeled_k; k += psize) {
|
|
2754
|
+
for (Index m = 0; m < pack; m += psize) {
|
|
2291
2755
|
if (psize == PacketSize) {
|
|
2292
2756
|
PacketBlock<Packet> kernel;
|
|
2293
|
-
for (
|
|
2757
|
+
for (Index p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i + p + m, k);
|
|
2294
2758
|
ptranspose(kernel);
|
|
2295
|
-
for (
|
|
2759
|
+
for (Index p = 0; p < psize; ++p) pstore(blockA + count + m + (pack)*p, cj.pconj(kernel.packet[p]));
|
|
2296
2760
|
} else if (HasHalf && psize == HalfPacketSize) {
|
|
2297
2761
|
gone_half = true;
|
|
2298
2762
|
PacketBlock<HalfPacket> kernel_half;
|
|
2299
|
-
for (
|
|
2763
|
+
for (Index p = 0; p < psize; ++p)
|
|
2764
|
+
kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i + p + m, k);
|
|
2300
2765
|
ptranspose(kernel_half);
|
|
2301
|
-
for (
|
|
2766
|
+
for (Index p = 0; p < psize; ++p) pstore(blockA + count + m + (pack)*p, cj.pconj(kernel_half.packet[p]));
|
|
2302
2767
|
} else if (HasQuarter && psize == QuarterPacketSize) {
|
|
2303
2768
|
gone_quarter = true;
|
|
2304
2769
|
PacketBlock<QuarterPacket> kernel_quarter;
|
|
2305
|
-
for (
|
|
2770
|
+
for (Index p = 0; p < psize; ++p)
|
|
2771
|
+
kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i + p + m, k);
|
|
2306
2772
|
ptranspose(kernel_quarter);
|
|
2307
|
-
for (
|
|
2308
|
-
|
|
2773
|
+
for (Index p = 0; p < psize; ++p)
|
|
2774
|
+
pstore(blockA + count + m + (pack)*p, cj.pconj(kernel_quarter.packet[p]));
|
|
2775
|
+
}
|
|
2309
2776
|
}
|
|
2310
|
-
count += psize*pack;
|
|
2777
|
+
count += psize * pack;
|
|
2311
2778
|
}
|
|
2312
2779
|
}
|
|
2313
2780
|
|
|
2314
|
-
for(; k<depth; k++)
|
|
2315
|
-
|
|
2316
|
-
|
|
2317
|
-
|
|
2318
|
-
{
|
|
2319
|
-
Scalar a(cj(lhs(i+w+0, k))),
|
|
2320
|
-
b(cj(lhs(i+w+1, k))),
|
|
2321
|
-
c(cj(lhs(i+w+2, k))),
|
|
2322
|
-
d(cj(lhs(i+w+3, k)));
|
|
2781
|
+
for (; k < depth; k++) {
|
|
2782
|
+
Index w = 0;
|
|
2783
|
+
for (; w < pack - 3; w += 4) {
|
|
2784
|
+
Scalar a(cj(lhs(i + w + 0, k))), b(cj(lhs(i + w + 1, k))), c(cj(lhs(i + w + 2, k))), d(cj(lhs(i + w + 3, k)));
|
|
2323
2785
|
blockA[count++] = a;
|
|
2324
2786
|
blockA[count++] = b;
|
|
2325
2787
|
blockA[count++] = c;
|
|
2326
2788
|
blockA[count++] = d;
|
|
2327
2789
|
}
|
|
2328
|
-
if(pack%4)
|
|
2329
|
-
for(;w<pack
|
|
2330
|
-
blockA[count++] = cj(lhs(i+w, k));
|
|
2790
|
+
if (pack % 4)
|
|
2791
|
+
for (; w < pack; ++w) blockA[count++] = cj(lhs(i + w, k));
|
|
2331
2792
|
}
|
|
2332
2793
|
|
|
2333
|
-
if(PanelMode) count += pack * (stride-offset-depth);
|
|
2794
|
+
if (PanelMode) count += pack * (stride - offset - depth);
|
|
2334
2795
|
}
|
|
2335
2796
|
|
|
2336
2797
|
pack -= psize;
|
|
2337
2798
|
Index left = rows - i;
|
|
2338
2799
|
if (pack <= 0) {
|
|
2339
|
-
if (!gone_last &&
|
|
2340
|
-
(
|
|
2341
|
-
|
|
2342
|
-
(psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
|
|
2800
|
+
if (!gone_last && (starting_pos == i || left >= psize / 2 || left >= psize / 4) &&
|
|
2801
|
+
((psize / 2 == HalfPacketSize && HasHalf && !gone_half) ||
|
|
2802
|
+
(psize / 2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
|
|
2343
2803
|
psize /= 2;
|
|
2344
2804
|
pack = psize;
|
|
2345
2805
|
continue;
|
|
@@ -2357,12 +2817,10 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
|
|
|
2357
2817
|
}
|
|
2358
2818
|
}
|
|
2359
2819
|
|
|
2360
|
-
for(; i<rows; i++)
|
|
2361
|
-
|
|
2362
|
-
|
|
2363
|
-
|
|
2364
|
-
blockA[count++] = cj(lhs(i, k));
|
|
2365
|
-
if(PanelMode) count += (stride-offset-depth);
|
|
2820
|
+
for (; i < rows; i++) {
|
|
2821
|
+
if (PanelMode) count += offset;
|
|
2822
|
+
for (Index k = 0; k < depth; k++) blockA[count++] = cj(lhs(i, k));
|
|
2823
|
+
if (PanelMode) count += (stride - offset - depth);
|
|
2366
2824
|
}
|
|
2367
2825
|
}
|
|
2368
2826
|
|
|
@@ -2373,273 +2831,323 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
|
|
|
2373
2831
|
// 4 5 6 7 16 17 18 19 25 28
|
|
2374
2832
|
// 8 9 10 11 20 21 22 23 26 29
|
|
2375
2833
|
// . . . . . . . . . .
|
|
2376
|
-
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
|
|
2377
|
-
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
|
|
2378
|
-
{
|
|
2834
|
+
template <typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
|
|
2835
|
+
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
|
|
2379
2836
|
typedef typename packet_traits<Scalar>::type Packet;
|
|
2380
2837
|
typedef typename DataMapper::LinearMapper LinearMapper;
|
|
2381
2838
|
enum { PacketSize = packet_traits<Scalar>::size };
|
|
2382
|
-
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0,
|
|
2839
|
+
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
|
|
2840
|
+
Index offset = 0);
|
|
2383
2841
|
};
|
|
2384
2842
|
|
|
2385
|
-
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
|
|
2386
|
-
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode
|
|
2387
|
-
|
|
2388
|
-
{
|
|
2843
|
+
template <typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
|
|
2844
|
+
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::operator()(
|
|
2845
|
+
Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
|
|
2389
2846
|
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
|
|
2390
2847
|
EIGEN_UNUSED_VARIABLE(stride);
|
|
2391
2848
|
EIGEN_UNUSED_VARIABLE(offset);
|
|
2392
|
-
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
|
|
2849
|
+
eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
|
|
2393
2850
|
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
|
|
2394
|
-
Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
|
|
2395
|
-
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
|
|
2851
|
+
Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
|
|
2852
|
+
Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
|
|
2396
2853
|
Index count = 0;
|
|
2397
|
-
const Index peeled_k = (depth/PacketSize)*PacketSize;
|
|
2398
|
-
|
|
2399
|
-
|
|
2400
|
-
|
|
2401
|
-
|
|
2402
|
-
//
|
|
2403
|
-
|
|
2404
|
-
|
|
2405
|
-
|
|
2406
|
-
|
|
2407
|
-
|
|
2408
|
-
|
|
2409
|
-
|
|
2410
|
-
|
|
2411
|
-
|
|
2412
|
-
|
|
2413
|
-
|
|
2414
|
-
|
|
2415
|
-
|
|
2416
|
-
|
|
2417
|
-
|
|
2418
|
-
|
|
2419
|
-
|
|
2420
|
-
|
|
2421
|
-
|
|
2422
|
-
|
|
2423
|
-
|
|
2424
|
-
|
|
2425
|
-
|
|
2426
|
-
|
|
2427
|
-
|
|
2428
|
-
|
|
2429
|
-
|
|
2430
|
-
|
|
2431
|
-
|
|
2432
|
-
|
|
2433
|
-
|
|
2434
|
-
|
|
2435
|
-
|
|
2436
|
-
|
|
2437
|
-
|
|
2438
|
-
|
|
2439
|
-
|
|
2440
|
-
|
|
2441
|
-
|
|
2442
|
-
|
|
2443
|
-
|
|
2444
|
-
|
|
2445
|
-
|
|
2446
|
-
|
|
2447
|
-
|
|
2854
|
+
const Index peeled_k = (depth / PacketSize) * PacketSize;
|
|
2855
|
+
|
|
2856
|
+
#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
|
|
2857
|
+
EIGEN_IF_CONSTEXPR(nr >= 8) {
|
|
2858
|
+
for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
|
|
2859
|
+
// skip what we have before
|
|
2860
|
+
if (PanelMode) count += 8 * offset;
|
|
2861
|
+
const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
|
|
2862
|
+
const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
|
|
2863
|
+
const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
|
|
2864
|
+
const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
|
|
2865
|
+
const LinearMapper dm4 = rhs.getLinearMapper(0, j2 + 4);
|
|
2866
|
+
const LinearMapper dm5 = rhs.getLinearMapper(0, j2 + 5);
|
|
2867
|
+
const LinearMapper dm6 = rhs.getLinearMapper(0, j2 + 6);
|
|
2868
|
+
const LinearMapper dm7 = rhs.getLinearMapper(0, j2 + 7);
|
|
2869
|
+
Index k = 0;
|
|
2870
|
+
if (PacketSize % 2 == 0 && PacketSize <= 8) // 2 4 8
|
|
2871
|
+
{
|
|
2872
|
+
for (; k < peeled_k; k += PacketSize) {
|
|
2873
|
+
if (PacketSize == 2) {
|
|
2874
|
+
PacketBlock<Packet, PacketSize == 2 ? 2 : PacketSize> kernel0, kernel1, kernel2, kernel3;
|
|
2875
|
+
kernel0.packet[0 % PacketSize] = dm0.template loadPacket<Packet>(k);
|
|
2876
|
+
kernel0.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
|
|
2877
|
+
kernel1.packet[0 % PacketSize] = dm2.template loadPacket<Packet>(k);
|
|
2878
|
+
kernel1.packet[1 % PacketSize] = dm3.template loadPacket<Packet>(k);
|
|
2879
|
+
kernel2.packet[0 % PacketSize] = dm4.template loadPacket<Packet>(k);
|
|
2880
|
+
kernel2.packet[1 % PacketSize] = dm5.template loadPacket<Packet>(k);
|
|
2881
|
+
kernel3.packet[0 % PacketSize] = dm6.template loadPacket<Packet>(k);
|
|
2882
|
+
kernel3.packet[1 % PacketSize] = dm7.template loadPacket<Packet>(k);
|
|
2883
|
+
ptranspose(kernel0);
|
|
2884
|
+
ptranspose(kernel1);
|
|
2885
|
+
ptranspose(kernel2);
|
|
2886
|
+
ptranspose(kernel3);
|
|
2887
|
+
|
|
2888
|
+
pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel0.packet[0 % PacketSize]));
|
|
2889
|
+
pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel1.packet[0 % PacketSize]));
|
|
2890
|
+
pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel2.packet[0 % PacketSize]));
|
|
2891
|
+
pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel3.packet[0 % PacketSize]));
|
|
2892
|
+
|
|
2893
|
+
pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel0.packet[1 % PacketSize]));
|
|
2894
|
+
pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel1.packet[1 % PacketSize]));
|
|
2895
|
+
pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel2.packet[1 % PacketSize]));
|
|
2896
|
+
pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel3.packet[1 % PacketSize]));
|
|
2897
|
+
count += 8 * PacketSize;
|
|
2898
|
+
} else if (PacketSize == 4) {
|
|
2899
|
+
PacketBlock<Packet, PacketSize == 4 ? 4 : PacketSize> kernel0, kernel1;
|
|
2900
|
+
|
|
2901
|
+
kernel0.packet[0 % PacketSize] = dm0.template loadPacket<Packet>(k);
|
|
2902
|
+
kernel0.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
|
|
2903
|
+
kernel0.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
|
|
2904
|
+
kernel0.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
|
|
2905
|
+
kernel1.packet[0 % PacketSize] = dm4.template loadPacket<Packet>(k);
|
|
2906
|
+
kernel1.packet[1 % PacketSize] = dm5.template loadPacket<Packet>(k);
|
|
2907
|
+
kernel1.packet[2 % PacketSize] = dm6.template loadPacket<Packet>(k);
|
|
2908
|
+
kernel1.packet[3 % PacketSize] = dm7.template loadPacket<Packet>(k);
|
|
2909
|
+
ptranspose(kernel0);
|
|
2910
|
+
ptranspose(kernel1);
|
|
2911
|
+
|
|
2912
|
+
pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel0.packet[0 % PacketSize]));
|
|
2913
|
+
pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel1.packet[0 % PacketSize]));
|
|
2914
|
+
pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel0.packet[1 % PacketSize]));
|
|
2915
|
+
pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel1.packet[1 % PacketSize]));
|
|
2916
|
+
pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel0.packet[2 % PacketSize]));
|
|
2917
|
+
pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel1.packet[2 % PacketSize]));
|
|
2918
|
+
pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel0.packet[3 % PacketSize]));
|
|
2919
|
+
pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel1.packet[3 % PacketSize]));
|
|
2920
|
+
count += 8 * PacketSize;
|
|
2921
|
+
} else if (PacketSize == 8) {
|
|
2922
|
+
PacketBlock<Packet, PacketSize == 8 ? 8 : PacketSize> kernel0;
|
|
2923
|
+
|
|
2924
|
+
kernel0.packet[0 % PacketSize] = dm0.template loadPacket<Packet>(k);
|
|
2925
|
+
kernel0.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
|
|
2926
|
+
kernel0.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
|
|
2927
|
+
kernel0.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
|
|
2928
|
+
kernel0.packet[4 % PacketSize] = dm4.template loadPacket<Packet>(k);
|
|
2929
|
+
kernel0.packet[5 % PacketSize] = dm5.template loadPacket<Packet>(k);
|
|
2930
|
+
kernel0.packet[6 % PacketSize] = dm6.template loadPacket<Packet>(k);
|
|
2931
|
+
kernel0.packet[7 % PacketSize] = dm7.template loadPacket<Packet>(k);
|
|
2932
|
+
ptranspose(kernel0);
|
|
2933
|
+
|
|
2934
|
+
pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel0.packet[0 % PacketSize]));
|
|
2935
|
+
pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel0.packet[1 % PacketSize]));
|
|
2936
|
+
pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel0.packet[2 % PacketSize]));
|
|
2937
|
+
pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel0.packet[3 % PacketSize]));
|
|
2938
|
+
pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel0.packet[4 % PacketSize]));
|
|
2939
|
+
pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel0.packet[5 % PacketSize]));
|
|
2940
|
+
pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel0.packet[6 % PacketSize]));
|
|
2941
|
+
pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel0.packet[7 % PacketSize]));
|
|
2942
|
+
count += 8 * PacketSize;
|
|
2943
|
+
}
|
|
2944
|
+
}
|
|
2945
|
+
}
|
|
2946
|
+
|
|
2947
|
+
for (; k < depth; k++) {
|
|
2948
|
+
blockB[count + 0] = cj(dm0(k));
|
|
2949
|
+
blockB[count + 1] = cj(dm1(k));
|
|
2950
|
+
blockB[count + 2] = cj(dm2(k));
|
|
2951
|
+
blockB[count + 3] = cj(dm3(k));
|
|
2952
|
+
blockB[count + 4] = cj(dm4(k));
|
|
2953
|
+
blockB[count + 5] = cj(dm5(k));
|
|
2954
|
+
blockB[count + 6] = cj(dm6(k));
|
|
2955
|
+
blockB[count + 7] = cj(dm7(k));
|
|
2956
|
+
count += 8;
|
|
2957
|
+
}
|
|
2958
|
+
// skip what we have after
|
|
2959
|
+
if (PanelMode) count += 8 * (stride - offset - depth);
|
|
2960
|
+
}
|
|
2961
|
+
}
|
|
2962
|
+
#endif
|
|
2963
|
+
|
|
2964
|
+
EIGEN_IF_CONSTEXPR(nr >= 4) {
|
|
2965
|
+
for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
|
|
2448
2966
|
// skip what we have before
|
|
2449
|
-
if(PanelMode) count += 4 * offset;
|
|
2967
|
+
if (PanelMode) count += 4 * offset;
|
|
2450
2968
|
const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
|
|
2451
2969
|
const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
|
|
2452
2970
|
const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
|
|
2453
2971
|
const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
|
|
2454
2972
|
|
|
2455
|
-
Index k=0;
|
|
2456
|
-
if((PacketSize%4)==0)
|
|
2973
|
+
Index k = 0;
|
|
2974
|
+
if ((PacketSize % 4) == 0) // TODO enable vectorized transposition for PacketSize==2 ??
|
|
2457
2975
|
{
|
|
2458
|
-
for(; k<peeled_k; k+=PacketSize) {
|
|
2459
|
-
PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
|
|
2460
|
-
kernel.packet[0
|
|
2461
|
-
kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
|
|
2462
|
-
kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
|
|
2463
|
-
kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
|
|
2976
|
+
for (; k < peeled_k; k += PacketSize) {
|
|
2977
|
+
PacketBlock<Packet, (PacketSize % 4) == 0 ? 4 : PacketSize> kernel;
|
|
2978
|
+
kernel.packet[0] = dm0.template loadPacket<Packet>(k);
|
|
2979
|
+
kernel.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
|
|
2980
|
+
kernel.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
|
|
2981
|
+
kernel.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
|
|
2464
2982
|
ptranspose(kernel);
|
|
2465
|
-
pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
|
|
2466
|
-
pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
|
|
2467
|
-
pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
|
|
2468
|
-
pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
|
|
2469
|
-
count+=4*PacketSize;
|
|
2983
|
+
pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel.packet[0]));
|
|
2984
|
+
pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel.packet[1 % PacketSize]));
|
|
2985
|
+
pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel.packet[2 % PacketSize]));
|
|
2986
|
+
pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel.packet[3 % PacketSize]));
|
|
2987
|
+
count += 4 * PacketSize;
|
|
2470
2988
|
}
|
|
2471
2989
|
}
|
|
2472
|
-
for(; k<depth; k++)
|
|
2473
|
-
|
|
2474
|
-
blockB[count+
|
|
2475
|
-
blockB[count+
|
|
2476
|
-
blockB[count+
|
|
2477
|
-
blockB[count+3] = cj(dm3(k));
|
|
2990
|
+
for (; k < depth; k++) {
|
|
2991
|
+
blockB[count + 0] = cj(dm0(k));
|
|
2992
|
+
blockB[count + 1] = cj(dm1(k));
|
|
2993
|
+
blockB[count + 2] = cj(dm2(k));
|
|
2994
|
+
blockB[count + 3] = cj(dm3(k));
|
|
2478
2995
|
count += 4;
|
|
2479
2996
|
}
|
|
2480
2997
|
// skip what we have after
|
|
2481
|
-
if(PanelMode) count += 4 * (stride-offset-depth);
|
|
2998
|
+
if (PanelMode) count += 4 * (stride - offset - depth);
|
|
2482
2999
|
}
|
|
2483
3000
|
}
|
|
2484
3001
|
|
|
2485
3002
|
// copy the remaining columns one at a time (nr==1)
|
|
2486
|
-
for(Index j2=packet_cols4; j2<cols; ++j2)
|
|
2487
|
-
|
|
2488
|
-
if(PanelMode) count += offset;
|
|
3003
|
+
for (Index j2 = packet_cols4; j2 < cols; ++j2) {
|
|
3004
|
+
if (PanelMode) count += offset;
|
|
2489
3005
|
const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
|
|
2490
|
-
for(Index k=0; k<depth; k++)
|
|
2491
|
-
{
|
|
3006
|
+
for (Index k = 0; k < depth; k++) {
|
|
2492
3007
|
blockB[count] = cj(dm0(k));
|
|
2493
3008
|
count += 1;
|
|
2494
3009
|
}
|
|
2495
|
-
if(PanelMode) count += (stride-offset-depth);
|
|
3010
|
+
if (PanelMode) count += (stride - offset - depth);
|
|
2496
3011
|
}
|
|
2497
3012
|
}
|
|
2498
3013
|
|
|
2499
3014
|
// this version is optimized for row major matrices
|
|
2500
|
-
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
|
|
2501
|
-
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
|
|
2502
|
-
{
|
|
3015
|
+
template <typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
|
|
3016
|
+
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> {
|
|
2503
3017
|
typedef typename packet_traits<Scalar>::type Packet;
|
|
2504
3018
|
typedef typename unpacket_traits<Packet>::half HalfPacket;
|
|
2505
3019
|
typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
|
|
2506
3020
|
typedef typename DataMapper::LinearMapper LinearMapper;
|
|
2507
|
-
enum {
|
|
2508
|
-
|
|
2509
|
-
|
|
2510
|
-
|
|
2511
|
-
|
|
3021
|
+
enum {
|
|
3022
|
+
PacketSize = packet_traits<Scalar>::size,
|
|
3023
|
+
HalfPacketSize = unpacket_traits<HalfPacket>::size,
|
|
3024
|
+
QuarterPacketSize = unpacket_traits<QuarterPacket>::size
|
|
3025
|
+
};
|
|
3026
|
+
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
|
|
3027
|
+
Index offset = 0) {
|
|
2512
3028
|
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
|
|
2513
3029
|
EIGEN_UNUSED_VARIABLE(stride);
|
|
2514
3030
|
EIGEN_UNUSED_VARIABLE(offset);
|
|
2515
|
-
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
|
|
3031
|
+
eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
|
|
2516
3032
|
const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
|
|
2517
3033
|
const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
|
|
2518
3034
|
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
|
|
2519
|
-
Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
|
|
2520
|
-
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
|
|
3035
|
+
Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
|
|
3036
|
+
Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
|
|
2521
3037
|
Index count = 0;
|
|
2522
3038
|
|
|
2523
|
-
|
|
2524
|
-
|
|
2525
|
-
|
|
2526
|
-
// {
|
|
2527
|
-
// // skip what we have before
|
|
2528
|
-
// if(PanelMode) count += 8 * offset;
|
|
2529
|
-
// for(Index k=0; k<depth; k++)
|
|
2530
|
-
// {
|
|
2531
|
-
// if (PacketSize==8) {
|
|
2532
|
-
// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
|
|
2533
|
-
// pstoreu(blockB+count, cj.pconj(A));
|
|
2534
|
-
// } else if (PacketSize==4) {
|
|
2535
|
-
// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
|
|
2536
|
-
// Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
|
|
2537
|
-
// pstoreu(blockB+count, cj.pconj(A));
|
|
2538
|
-
// pstoreu(blockB+count+PacketSize, cj.pconj(B));
|
|
2539
|
-
// } else {
|
|
2540
|
-
// const Scalar* b0 = &rhs[k*rhsStride + j2];
|
|
2541
|
-
// blockB[count+0] = cj(b0[0]);
|
|
2542
|
-
// blockB[count+1] = cj(b0[1]);
|
|
2543
|
-
// blockB[count+2] = cj(b0[2]);
|
|
2544
|
-
// blockB[count+3] = cj(b0[3]);
|
|
2545
|
-
// blockB[count+4] = cj(b0[4]);
|
|
2546
|
-
// blockB[count+5] = cj(b0[5]);
|
|
2547
|
-
// blockB[count+6] = cj(b0[6]);
|
|
2548
|
-
// blockB[count+7] = cj(b0[7]);
|
|
2549
|
-
// }
|
|
2550
|
-
// count += 8;
|
|
2551
|
-
// }
|
|
2552
|
-
// // skip what we have after
|
|
2553
|
-
// if(PanelMode) count += 8 * (stride-offset-depth);
|
|
2554
|
-
// }
|
|
2555
|
-
// }
|
|
2556
|
-
if(nr>=4)
|
|
2557
|
-
{
|
|
2558
|
-
for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
|
|
2559
|
-
{
|
|
3039
|
+
#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
|
|
3040
|
+
EIGEN_IF_CONSTEXPR(nr >= 8) {
|
|
3041
|
+
for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
|
|
2560
3042
|
// skip what we have before
|
|
2561
|
-
if(PanelMode) count +=
|
|
2562
|
-
for(Index k=0; k<depth; k++)
|
|
2563
|
-
|
|
2564
|
-
if (PacketSize==4) {
|
|
3043
|
+
if (PanelMode) count += 8 * offset;
|
|
3044
|
+
for (Index k = 0; k < depth; k++) {
|
|
3045
|
+
if (PacketSize == 8) {
|
|
2565
3046
|
Packet A = rhs.template loadPacket<Packet>(k, j2);
|
|
2566
|
-
pstoreu(blockB+count, cj.pconj(A));
|
|
3047
|
+
pstoreu(blockB + count, cj.pconj(A));
|
|
2567
3048
|
count += PacketSize;
|
|
2568
|
-
} else if (
|
|
3049
|
+
} else if (PacketSize == 4) {
|
|
3050
|
+
Packet A = rhs.template loadPacket<Packet>(k, j2);
|
|
3051
|
+
Packet B = rhs.template loadPacket<Packet>(k, j2 + 4);
|
|
3052
|
+
pstoreu(blockB + count, cj.pconj(A));
|
|
3053
|
+
pstoreu(blockB + count + PacketSize, cj.pconj(B));
|
|
3054
|
+
count += 2 * PacketSize;
|
|
3055
|
+
} else {
|
|
3056
|
+
const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
|
|
3057
|
+
blockB[count + 0] = cj(dm0(0));
|
|
3058
|
+
blockB[count + 1] = cj(dm0(1));
|
|
3059
|
+
blockB[count + 2] = cj(dm0(2));
|
|
3060
|
+
blockB[count + 3] = cj(dm0(3));
|
|
3061
|
+
blockB[count + 4] = cj(dm0(4));
|
|
3062
|
+
blockB[count + 5] = cj(dm0(5));
|
|
3063
|
+
blockB[count + 6] = cj(dm0(6));
|
|
3064
|
+
blockB[count + 7] = cj(dm0(7));
|
|
3065
|
+
count += 8;
|
|
3066
|
+
}
|
|
3067
|
+
}
|
|
3068
|
+
// skip what we have after
|
|
3069
|
+
if (PanelMode) count += 8 * (stride - offset - depth);
|
|
3070
|
+
}
|
|
3071
|
+
}
|
|
3072
|
+
#endif
|
|
3073
|
+
|
|
3074
|
+
if (nr >= 4) {
|
|
3075
|
+
for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
|
|
3076
|
+
// skip what we have before
|
|
3077
|
+
if (PanelMode) count += 4 * offset;
|
|
3078
|
+
for (Index k = 0; k < depth; k++) {
|
|
3079
|
+
if (PacketSize == 4) {
|
|
3080
|
+
Packet A = rhs.template loadPacket<Packet>(k, j2);
|
|
3081
|
+
pstoreu(blockB + count, cj.pconj(A));
|
|
3082
|
+
count += PacketSize;
|
|
3083
|
+
} else if (HasHalf && HalfPacketSize == 4) {
|
|
2569
3084
|
HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
|
|
2570
|
-
pstoreu(blockB+count, cj.pconj(A));
|
|
3085
|
+
pstoreu(blockB + count, cj.pconj(A));
|
|
2571
3086
|
count += HalfPacketSize;
|
|
2572
|
-
} else if (HasQuarter && QuarterPacketSize==4) {
|
|
3087
|
+
} else if (HasQuarter && QuarterPacketSize == 4) {
|
|
2573
3088
|
QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
|
|
2574
|
-
pstoreu(blockB+count, cj.pconj(A));
|
|
3089
|
+
pstoreu(blockB + count, cj.pconj(A));
|
|
2575
3090
|
count += QuarterPacketSize;
|
|
2576
3091
|
} else {
|
|
2577
3092
|
const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
|
|
2578
|
-
blockB[count+0] = cj(dm0(0));
|
|
2579
|
-
blockB[count+1] = cj(dm0(1));
|
|
2580
|
-
blockB[count+2] = cj(dm0(2));
|
|
2581
|
-
blockB[count+3] = cj(dm0(3));
|
|
3093
|
+
blockB[count + 0] = cj(dm0(0));
|
|
3094
|
+
blockB[count + 1] = cj(dm0(1));
|
|
3095
|
+
blockB[count + 2] = cj(dm0(2));
|
|
3096
|
+
blockB[count + 3] = cj(dm0(3));
|
|
2582
3097
|
count += 4;
|
|
2583
3098
|
}
|
|
2584
3099
|
}
|
|
2585
3100
|
// skip what we have after
|
|
2586
|
-
if(PanelMode) count += 4 * (stride-offset-depth);
|
|
3101
|
+
if (PanelMode) count += 4 * (stride - offset - depth);
|
|
2587
3102
|
}
|
|
2588
3103
|
}
|
|
2589
3104
|
// copy the remaining columns one at a time (nr==1)
|
|
2590
|
-
for(Index j2=packet_cols4; j2<cols; ++j2)
|
|
2591
|
-
|
|
2592
|
-
|
|
2593
|
-
for(Index k=0; k<depth; k++)
|
|
2594
|
-
{
|
|
3105
|
+
for (Index j2 = packet_cols4; j2 < cols; ++j2) {
|
|
3106
|
+
if (PanelMode) count += offset;
|
|
3107
|
+
for (Index k = 0; k < depth; k++) {
|
|
2595
3108
|
blockB[count] = cj(rhs(k, j2));
|
|
2596
3109
|
count += 1;
|
|
2597
3110
|
}
|
|
2598
|
-
if(PanelMode) count += stride-offset-depth;
|
|
3111
|
+
if (PanelMode) count += stride - offset - depth;
|
|
2599
3112
|
}
|
|
2600
3113
|
}
|
|
2601
3114
|
};
|
|
2602
3115
|
|
|
2603
|
-
}
|
|
3116
|
+
} // end namespace internal
|
|
2604
3117
|
|
|
2605
3118
|
/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
|
|
2606
|
-
|
|
2607
|
-
inline std::ptrdiff_t l1CacheSize()
|
|
2608
|
-
{
|
|
3119
|
+
* \sa setCpuCacheSize */
|
|
3120
|
+
inline std::ptrdiff_t l1CacheSize() {
|
|
2609
3121
|
std::ptrdiff_t l1, l2, l3;
|
|
2610
3122
|
internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
|
|
2611
3123
|
return l1;
|
|
2612
3124
|
}
|
|
2613
3125
|
|
|
2614
3126
|
/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
|
|
2615
|
-
|
|
2616
|
-
inline std::ptrdiff_t l2CacheSize()
|
|
2617
|
-
{
|
|
3127
|
+
* \sa setCpuCacheSize */
|
|
3128
|
+
inline std::ptrdiff_t l2CacheSize() {
|
|
2618
3129
|
std::ptrdiff_t l1, l2, l3;
|
|
2619
3130
|
internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
|
|
2620
3131
|
return l2;
|
|
2621
3132
|
}
|
|
2622
3133
|
|
|
2623
|
-
/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size
|
|
2624
|
-
|
|
2625
|
-
|
|
2626
|
-
inline std::ptrdiff_t l3CacheSize()
|
|
2627
|
-
{
|
|
3134
|
+
/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
|
|
3135
|
+
* \sa setCpuCacheSize */
|
|
3136
|
+
inline std::ptrdiff_t l3CacheSize() {
|
|
2628
3137
|
std::ptrdiff_t l1, l2, l3;
|
|
2629
3138
|
internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
|
|
2630
3139
|
return l3;
|
|
2631
3140
|
}
|
|
2632
3141
|
|
|
2633
3142
|
/** Set the cpu L1 and L2 cache sizes (in bytes).
|
|
2634
|
-
|
|
2635
|
-
|
|
2636
|
-
|
|
2637
|
-
|
|
2638
|
-
inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
|
|
2639
|
-
{
|
|
3143
|
+
* These values are use to adjust the size of the blocks
|
|
3144
|
+
* for the algorithms working per blocks.
|
|
3145
|
+
*
|
|
3146
|
+
* \sa computeProductBlockingSizes */
|
|
3147
|
+
inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3) {
|
|
2640
3148
|
internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
|
|
2641
3149
|
}
|
|
2642
3150
|
|
|
2643
|
-
}
|
|
3151
|
+
} // end namespace Eigen
|
|
2644
3152
|
|
|
2645
|
-
#endif
|
|
3153
|
+
#endif // EIGEN_GENERAL_BLOCK_PANEL_H
|