@smake/eigen 1.0.2 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/eigen/Eigen/AccelerateSupport +52 -0
- package/eigen/Eigen/Cholesky +18 -21
- package/eigen/Eigen/CholmodSupport +28 -28
- package/eigen/Eigen/Core +235 -326
- package/eigen/Eigen/Eigenvalues +16 -14
- package/eigen/Eigen/Geometry +21 -24
- package/eigen/Eigen/Householder +9 -8
- package/eigen/Eigen/IterativeLinearSolvers +8 -4
- package/eigen/Eigen/Jacobi +14 -14
- package/eigen/Eigen/KLUSupport +43 -0
- package/eigen/Eigen/LU +16 -20
- package/eigen/Eigen/MetisSupport +12 -12
- package/eigen/Eigen/OrderingMethods +54 -54
- package/eigen/Eigen/PaStiXSupport +23 -20
- package/eigen/Eigen/PardisoSupport +17 -14
- package/eigen/Eigen/QR +18 -21
- package/eigen/Eigen/QtAlignedMalloc +5 -13
- package/eigen/Eigen/SPQRSupport +21 -14
- package/eigen/Eigen/SVD +23 -18
- package/eigen/Eigen/Sparse +1 -4
- package/eigen/Eigen/SparseCholesky +18 -23
- package/eigen/Eigen/SparseCore +18 -17
- package/eigen/Eigen/SparseLU +12 -8
- package/eigen/Eigen/SparseQR +16 -14
- package/eigen/Eigen/StdDeque +5 -2
- package/eigen/Eigen/StdList +5 -2
- package/eigen/Eigen/StdVector +5 -2
- package/eigen/Eigen/SuperLUSupport +30 -24
- package/eigen/Eigen/ThreadPool +80 -0
- package/eigen/Eigen/UmfPackSupport +19 -17
- package/eigen/Eigen/Version +14 -0
- package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
- package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/LDLT.h +377 -401
- package/eigen/Eigen/src/Cholesky/LLT.h +332 -360
- package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
- package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +620 -521
- package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/ArithmeticSequence.h +239 -0
- package/eigen/Eigen/src/Core/Array.h +341 -294
- package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
- package/eigen/Eigen/src/Core/ArrayWrapper.h +127 -171
- package/eigen/Eigen/src/Core/Assign.h +30 -40
- package/eigen/Eigen/src/Core/AssignEvaluator.h +711 -589
- package/eigen/Eigen/src/Core/Assign_MKL.h +130 -125
- package/eigen/Eigen/src/Core/BandMatrix.h +268 -283
- package/eigen/Eigen/src/Core/Block.h +375 -398
- package/eigen/Eigen/src/Core/CommaInitializer.h +86 -97
- package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
- package/eigen/Eigen/src/Core/CoreEvaluators.h +1356 -1026
- package/eigen/Eigen/src/Core/CoreIterators.h +73 -59
- package/eigen/Eigen/src/Core/CwiseBinaryOp.h +114 -132
- package/eigen/Eigen/src/Core/CwiseNullaryOp.h +726 -617
- package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
- package/eigen/Eigen/src/Core/CwiseUnaryOp.h +56 -68
- package/eigen/Eigen/src/Core/CwiseUnaryView.h +132 -95
- package/eigen/Eigen/src/Core/DenseBase.h +632 -571
- package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -624
- package/eigen/Eigen/src/Core/DenseStorage.h +512 -509
- package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
- package/eigen/Eigen/src/Core/Diagonal.h +169 -210
- package/eigen/Eigen/src/Core/DiagonalMatrix.h +351 -274
- package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
- package/eigen/Eigen/src/Core/Dot.h +172 -222
- package/eigen/Eigen/src/Core/EigenBase.h +75 -85
- package/eigen/Eigen/src/Core/Fill.h +138 -0
- package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
- package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -109
- package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
- package/eigen/Eigen/src/Core/GeneralProduct.h +327 -263
- package/eigen/Eigen/src/Core/GenericPacketMath.h +1472 -360
- package/eigen/Eigen/src/Core/GlobalFunctions.h +194 -151
- package/eigen/Eigen/src/Core/IO.h +147 -139
- package/eigen/Eigen/src/Core/IndexedView.h +321 -0
- package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
- package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/Inverse.h +56 -66
- package/eigen/Eigen/src/Core/Map.h +124 -142
- package/eigen/Eigen/src/Core/MapBase.h +256 -281
- package/eigen/Eigen/src/Core/MathFunctions.h +1620 -938
- package/eigen/Eigen/src/Core/MathFunctionsImpl.h +233 -71
- package/eigen/Eigen/src/Core/Matrix.h +491 -416
- package/eigen/Eigen/src/Core/MatrixBase.h +468 -453
- package/eigen/Eigen/src/Core/NestByValue.h +66 -85
- package/eigen/Eigen/src/Core/NoAlias.h +79 -85
- package/eigen/Eigen/src/Core/NumTraits.h +235 -148
- package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +253 -0
- package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
- package/eigen/Eigen/src/Core/PlainObjectBase.h +871 -894
- package/eigen/Eigen/src/Core/Product.h +260 -139
- package/eigen/Eigen/src/Core/ProductEvaluators.h +863 -714
- package/eigen/Eigen/src/Core/Random.h +161 -136
- package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
- package/eigen/Eigen/src/Core/RealView.h +250 -0
- package/eigen/Eigen/src/Core/Redux.h +366 -336
- package/eigen/Eigen/src/Core/Ref.h +308 -209
- package/eigen/Eigen/src/Core/Replicate.h +94 -106
- package/eigen/Eigen/src/Core/Reshaped.h +398 -0
- package/eigen/Eigen/src/Core/ReturnByValue.h +49 -55
- package/eigen/Eigen/src/Core/Reverse.h +136 -145
- package/eigen/Eigen/src/Core/Select.h +70 -140
- package/eigen/Eigen/src/Core/SelfAdjointView.h +262 -285
- package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
- package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
- package/eigen/Eigen/src/Core/Solve.h +97 -111
- package/eigen/Eigen/src/Core/SolveTriangular.h +131 -129
- package/eigen/Eigen/src/Core/SolverBase.h +138 -101
- package/eigen/Eigen/src/Core/StableNorm.h +156 -160
- package/eigen/Eigen/src/Core/StlIterators.h +619 -0
- package/eigen/Eigen/src/Core/Stride.h +91 -88
- package/eigen/Eigen/src/Core/Swap.h +70 -38
- package/eigen/Eigen/src/Core/Transpose.h +295 -273
- package/eigen/Eigen/src/Core/Transpositions.h +272 -317
- package/eigen/Eigen/src/Core/TriangularMatrix.h +670 -755
- package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
- package/eigen/Eigen/src/Core/VectorwiseOp.h +668 -630
- package/eigen/Eigen/src/Core/Visitor.h +480 -216
- package/eigen/Eigen/src/Core/arch/AVX/Complex.h +407 -293
- package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +79 -388
- package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2935 -491
- package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
- package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +279 -22
- package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +472 -0
- package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +85 -333
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +2490 -649
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
- package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +277 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +521 -298
- package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +39 -280
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +3686 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +205 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +901 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +3391 -723
- package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
- package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +866 -0
- package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +113 -14
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +2634 -0
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +227 -0
- package/eigen/Eigen/src/Core/arch/Default/Half.h +1091 -0
- package/eigen/Eigen/src/Core/arch/Default/Settings.h +11 -13
- package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
- package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +104 -0
- package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1712 -0
- package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
- package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +77 -0
- package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
- package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
- package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
- package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
- package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
- package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
- package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
- package/eigen/Eigen/src/Core/arch/MSA/Complex.h +620 -0
- package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +379 -0
- package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1237 -0
- package/eigen/Eigen/src/Core/arch/NEON/Complex.h +531 -289
- package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +243 -0
- package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +50 -73
- package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +5915 -579
- package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1642 -0
- package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
- package/eigen/Eigen/src/Core/arch/SSE/Complex.h +366 -334
- package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +40 -514
- package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +2164 -675
- package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
- package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +188 -35
- package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +48 -0
- package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +674 -0
- package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +52 -0
- package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +227 -0
- package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +303 -0
- package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +576 -0
- package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +83 -0
- package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +434 -261
- package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +160 -53
- package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +1073 -605
- package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +123 -117
- package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +594 -322
- package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +204 -118
- package/eigen/Eigen/src/Core/functors/StlFunctors.h +110 -97
- package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
- package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1158 -530
- package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2329 -1333
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +328 -364
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +191 -178
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +85 -82
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +396 -542
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
- package/eigen/Eigen/src/Core/products/Parallelizer.h +208 -92
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +331 -375
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +139 -146
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
- package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
- package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -46
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -275
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
- package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +70 -93
- package/eigen/Eigen/src/Core/util/Assert.h +158 -0
- package/eigen/Eigen/src/Core/util/BlasUtil.h +413 -290
- package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +543 -0
- package/eigen/Eigen/src/Core/util/Constants.h +314 -263
- package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -78
- package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
- package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +450 -224
- package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
- package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
- package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +487 -0
- package/eigen/Eigen/src/Core/util/IntegralConstant.h +279 -0
- package/eigen/Eigen/src/Core/util/MKL_support.h +39 -30
- package/eigen/Eigen/src/Core/util/Macros.h +939 -646
- package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
- package/eigen/Eigen/src/Core/util/Memory.h +1042 -650
- package/eigen/Eigen/src/Core/util/Meta.h +618 -426
- package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
- package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
- package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
- package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
- package/eigen/Eigen/src/Core/util/StaticAssert.h +51 -164
- package/eigen/Eigen/src/Core/util/SymbolicIndex.h +445 -0
- package/eigen/Eigen/src/Core/util/XprHelper.h +793 -538
- package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
- package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
- package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
- package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
- package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
- package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +91 -107
- package/eigen/Eigen/src/Eigenvalues/RealQZ.h +539 -606
- package/eigen/Eigen/src/Eigenvalues/RealSchur.h +348 -382
- package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +579 -600
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
- package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +434 -461
- package/eigen/Eigen/src/Geometry/AlignedBox.h +307 -214
- package/eigen/Eigen/src/Geometry/AngleAxis.h +135 -137
- package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
- package/eigen/Eigen/src/Geometry/Homogeneous.h +289 -333
- package/eigen/Eigen/src/Geometry/Hyperplane.h +152 -161
- package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -145
- package/eigen/Eigen/src/Geometry/ParametrizedLine.h +141 -104
- package/eigen/Eigen/src/Geometry/Quaternion.h +595 -497
- package/eigen/Eigen/src/Geometry/Rotation2D.h +110 -108
- package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
- package/eigen/Eigen/src/Geometry/Scaling.h +115 -90
- package/eigen/Eigen/src/Geometry/Transform.h +896 -953
- package/eigen/Eigen/src/Geometry/Translation.h +100 -98
- package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
- package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +154 -0
- package/eigen/Eigen/src/Householder/BlockHouseholder.h +54 -42
- package/eigen/Eigen/src/Householder/Householder.h +104 -122
- package/eigen/Eigen/src/Householder/HouseholderSequence.h +416 -382
- package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +153 -166
- package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +127 -138
- package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +95 -124
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +269 -267
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +246 -259
- package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +218 -217
- package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +80 -103
- package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +59 -63
- package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Jacobi/Jacobi.h +256 -291
- package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/KLUSupport/KLUSupport.h +339 -0
- package/eigen/Eigen/src/LU/Determinant.h +60 -63
- package/eigen/Eigen/src/LU/FullPivLU.h +561 -626
- package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/LU/InverseImpl.h +213 -275
- package/eigen/Eigen/src/LU/PartialPivLU.h +407 -435
- package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
- package/eigen/Eigen/src/LU/arch/InverseSize4.h +353 -0
- package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
- package/eigen/Eigen/src/OrderingMethods/Amd.h +250 -282
- package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +950 -1103
- package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/OrderingMethods/Ordering.h +111 -122
- package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
- package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -429
- package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +494 -473
- package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
- package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +223 -137
- package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +517 -460
- package/eigen/Eigen/src/QR/HouseholderQR.h +412 -278
- package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
- package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +263 -261
- package/eigen/Eigen/src/SVD/BDCSVD.h +872 -679
- package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
- package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SVD/JacobiSVD.h +585 -543
- package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
- package/eigen/Eigen/src/SVD/SVDBase.h +281 -160
- package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +202 -237
- package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +769 -590
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +318 -129
- package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
- package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -236
- package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +140 -184
- package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCore/SparseAssign.h +174 -111
- package/eigen/Eigen/src/SparseCore/SparseBlock.h +408 -477
- package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
- package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +531 -280
- package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +559 -347
- package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
- package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +185 -191
- package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
- package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
- package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
- package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
- package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1614 -1142
- package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -357
- package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
- package/eigen/Eigen/src/SparseCore/SparseProduct.h +100 -91
- package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
- package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
- package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +371 -414
- package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
- package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
- package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
- package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
- package/eigen/Eigen/src/SparseCore/SparseUtil.h +146 -115
- package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
- package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
- package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
- package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseLU/SparseLU.h +814 -618
- package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
- package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
- package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
- package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +273 -255
- package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
- package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
- package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +90 -101
- package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
- package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
- package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +125 -133
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
- package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
- package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
- package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
- package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseQR/SparseQR.h +451 -490
- package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -105
- package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
- package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
- package/eigen/Eigen/src/StlSupport/details.h +48 -50
- package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -732
- package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
- package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
- package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
- package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
- package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
- package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
- package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
- package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
- package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
- package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
- package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
- package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
- package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +480 -380
- package/eigen/Eigen/src/misc/Image.h +41 -43
- package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/misc/Kernel.h +39 -41
- package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
- package/eigen/Eigen/src/misc/blas.h +83 -426
- package/eigen/Eigen/src/misc/lapacke.h +9976 -16182
- package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
- package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
- package/eigen/Eigen/src/plugins/BlockMethods.inc +1370 -0
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
- package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.inc +167 -0
- package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
- package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
- package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
- package/lib/LibEigen.d.ts +4 -0
- package/lib/LibEigen.js +14 -0
- package/lib/index.d.ts +1 -1
- package/lib/index.js +7 -3
- package/package.json +2 -10
- package/eigen/Eigen/CMakeLists.txt +0 -19
- package/eigen/Eigen/src/Core/BooleanRedux.h +0 -164
- package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -103
- package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
- package/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +0 -91
- package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
- package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
- package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
- package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
- package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
- package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
- package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
- package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
- package/eigen/Eigen/src/misc/lapack.h +0 -152
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -332
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -552
- package/eigen/Eigen/src/plugins/BlockMethods.h +0 -1058
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
- package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -163
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -85
- package/lib/eigen.d.ts +0 -2
- package/lib/eigen.js +0 -15
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
// This file is part of Eigen, a lightweight C++ template library
|
|
2
2
|
// for linear algebra.
|
|
3
3
|
//
|
|
4
|
-
// Copyright (C) 2008-
|
|
4
|
+
// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
|
|
5
5
|
//
|
|
6
6
|
// This Source Code Form is subject to the terms of the Mozilla
|
|
7
7
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
|
@@ -10,15 +10,61 @@
|
|
|
10
10
|
#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
|
|
11
11
|
#define EIGEN_GENERAL_MATRIX_VECTOR_H
|
|
12
12
|
|
|
13
|
+
// IWYU pragma: private
|
|
14
|
+
#include "../InternalHeaderCheck.h"
|
|
15
|
+
|
|
13
16
|
namespace Eigen {
|
|
14
17
|
|
|
15
18
|
namespace internal {
|
|
16
19
|
|
|
20
|
+
enum GEMVPacketSizeType { GEMVPacketFull = 0, GEMVPacketHalf, GEMVPacketQuarter };
|
|
21
|
+
|
|
22
|
+
template <int N, typename T1, typename T2, typename T3>
|
|
23
|
+
struct gemv_packet_cond {
|
|
24
|
+
typedef T3 type;
|
|
25
|
+
};
|
|
26
|
+
|
|
27
|
+
template <typename T1, typename T2, typename T3>
|
|
28
|
+
struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> {
|
|
29
|
+
typedef T1 type;
|
|
30
|
+
};
|
|
31
|
+
|
|
32
|
+
template <typename T1, typename T2, typename T3>
|
|
33
|
+
struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> {
|
|
34
|
+
typedef T2 type;
|
|
35
|
+
};
|
|
36
|
+
|
|
37
|
+
template <typename LhsScalar, typename RhsScalar, int PacketSize_ = GEMVPacketFull>
|
|
38
|
+
class gemv_traits {
|
|
39
|
+
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
|
40
|
+
|
|
41
|
+
#define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size) \
|
|
42
|
+
typedef typename gemv_packet_cond< \
|
|
43
|
+
packet_size, typename packet_traits<name##Scalar>::type, typename packet_traits<name##Scalar>::half, \
|
|
44
|
+
typename unpacket_traits<typename packet_traits<name##Scalar>::half>::half>::type name##Packet##postfix
|
|
45
|
+
|
|
46
|
+
PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
|
|
47
|
+
PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
|
|
48
|
+
PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
|
|
49
|
+
#undef PACKET_DECL_COND_POSTFIX
|
|
50
|
+
|
|
51
|
+
public:
|
|
52
|
+
enum {
|
|
53
|
+
Vectorizable = unpacket_traits<LhsPacket_>::vectorizable && unpacket_traits<RhsPacket_>::vectorizable &&
|
|
54
|
+
int(unpacket_traits<LhsPacket_>::size) == int(unpacket_traits<RhsPacket_>::size),
|
|
55
|
+
LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
|
|
56
|
+
RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
|
|
57
|
+
ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1
|
|
58
|
+
};
|
|
59
|
+
|
|
60
|
+
typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
|
|
61
|
+
typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
|
|
62
|
+
typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
|
|
63
|
+
};
|
|
64
|
+
|
|
17
65
|
/* Optimized col-major matrix * vector product:
|
|
18
|
-
* This algorithm processes
|
|
19
|
-
*
|
|
20
|
-
* the instruction dependency. Moreover, we know that all bands have the
|
|
21
|
-
* same alignment pattern.
|
|
66
|
+
* This algorithm processes the matrix per vertical panels,
|
|
67
|
+
* which are then processed horizontally per chunk of 8*PacketSize x 1 vertical segments.
|
|
22
68
|
*
|
|
23
69
|
* Mixing type logic: C += alpha * A * B
|
|
24
70
|
* | A | B |alpha| comments
|
|
@@ -27,302 +73,193 @@ namespace internal {
|
|
|
27
73
|
* |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
|
|
28
74
|
* |cplx |real |real | optimal case, vectorization possible via real-cplx mul
|
|
29
75
|
*
|
|
30
|
-
* Accesses to the matrix coefficients follow the following logic:
|
|
31
|
-
*
|
|
32
|
-
* - if all columns have the same alignment then
|
|
33
|
-
* - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
|
|
34
|
-
* - otherwise perform unaligned loads only (-> NoneAligned case)
|
|
35
|
-
* - otherwise
|
|
36
|
-
* - if even columns have the same alignment then
|
|
37
|
-
* // odd columns are guaranteed to have the same alignment too
|
|
38
|
-
* - if even or odd columns have the same alignment as the result, then
|
|
39
|
-
* // for a register size of 2 scalars, this is guarantee to be the case (e.g., SSE with double)
|
|
40
|
-
* - perform half aligned and half unaligned loads (-> EvenAligned case)
|
|
41
|
-
* - otherwise perform unaligned loads only (-> NoneAligned case)
|
|
42
|
-
* - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
|
|
43
|
-
* - one over 4 consecutive columns is guaranteed to be aligned with the result vector,
|
|
44
|
-
* perform simple aligned loads for this column and aligned loads plus re-alignment for the other. (-> FirstAligned case)
|
|
45
|
-
* // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
|
|
46
|
-
* - otherwise,
|
|
47
|
-
* // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
|
|
48
|
-
* // we currently fall back to the NoneAligned case
|
|
49
|
-
*
|
|
50
76
|
* The same reasoning apply for the transposed case.
|
|
51
|
-
*
|
|
52
|
-
* The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
|
|
53
|
-
* One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
|
|
54
|
-
* strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow
|
|
55
|
-
* compared to unaligned loads on a 4 byte boundary.
|
|
56
|
-
*
|
|
57
77
|
*/
|
|
58
|
-
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
|
|
59
|
-
|
|
60
|
-
|
|
78
|
+
template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
|
|
79
|
+
typename RhsMapper, bool ConjugateRhs, int Version>
|
|
80
|
+
struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper,
|
|
81
|
+
ConjugateRhs, Version> {
|
|
82
|
+
typedef gemv_traits<LhsScalar, RhsScalar> Traits;
|
|
83
|
+
typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketHalf> HalfTraits;
|
|
84
|
+
typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketQuarter> QuarterTraits;
|
|
85
|
+
|
|
61
86
|
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
|
62
87
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
|
|
67
|
-
RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
|
|
68
|
-
ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
|
|
69
|
-
};
|
|
88
|
+
typedef typename Traits::LhsPacket LhsPacket;
|
|
89
|
+
typedef typename Traits::RhsPacket RhsPacket;
|
|
90
|
+
typedef typename Traits::ResPacket ResPacket;
|
|
70
91
|
|
|
71
|
-
typedef typename
|
|
72
|
-
typedef typename
|
|
73
|
-
typedef typename
|
|
92
|
+
typedef typename HalfTraits::LhsPacket LhsPacketHalf;
|
|
93
|
+
typedef typename HalfTraits::RhsPacket RhsPacketHalf;
|
|
94
|
+
typedef typename HalfTraits::ResPacket ResPacketHalf;
|
|
74
95
|
|
|
75
|
-
typedef typename
|
|
76
|
-
typedef typename
|
|
77
|
-
typedef typename
|
|
96
|
+
typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
|
|
97
|
+
typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
|
|
98
|
+
typedef typename QuarterTraits::ResPacket ResPacketQuarter;
|
|
78
99
|
|
|
79
|
-
EIGEN_DONT_INLINE static void run(
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
const RhsMapper& rhs,
|
|
83
|
-
ResScalar* res, Index resIncr,
|
|
84
|
-
RhsScalar alpha);
|
|
100
|
+
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,
|
|
101
|
+
const RhsMapper& rhs, ResScalar* res, Index resIncr,
|
|
102
|
+
RhsScalar alpha);
|
|
85
103
|
};
|
|
86
104
|
|
|
87
|
-
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
RhsScalar alpha)
|
|
94
|
-
{
|
|
105
|
+
template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
|
|
106
|
+
typename RhsMapper, bool ConjugateRhs, int Version>
|
|
107
|
+
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void
|
|
108
|
+
general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs,
|
|
109
|
+
Version>::run(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
|
|
110
|
+
ResScalar* res, Index resIncr, RhsScalar alpha) {
|
|
95
111
|
EIGEN_UNUSED_VARIABLE(resIncr);
|
|
96
|
-
eigen_internal_assert(resIncr==1);
|
|
97
|
-
#ifdef _EIGEN_ACCUMULATE_PACKETS
|
|
98
|
-
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
|
|
99
|
-
#endif
|
|
100
|
-
#define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
|
|
101
|
-
pstore(&res[j], \
|
|
102
|
-
padd(pload<ResPacket>(&res[j]), \
|
|
103
|
-
padd( \
|
|
104
|
-
padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j), ptmp0), \
|
|
105
|
-
pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j), ptmp1)), \
|
|
106
|
-
padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j), ptmp2), \
|
|
107
|
-
pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j), ptmp3)) )))
|
|
108
|
-
|
|
109
|
-
typedef typename LhsMapper::VectorMapper LhsScalars;
|
|
110
|
-
|
|
111
|
-
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
|
|
112
|
-
conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
|
|
113
|
-
if(ConjugateRhs)
|
|
114
|
-
alpha = numext::conj(alpha);
|
|
115
|
-
|
|
116
|
-
enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
|
|
117
|
-
const Index columnsAtOnce = 4;
|
|
118
|
-
const Index peels = 2;
|
|
119
|
-
const Index LhsPacketAlignedMask = LhsPacketSize-1;
|
|
120
|
-
const Index ResPacketAlignedMask = ResPacketSize-1;
|
|
121
|
-
// const Index PeelAlignedMask = ResPacketSize*peels-1;
|
|
122
|
-
const Index size = rows;
|
|
112
|
+
eigen_internal_assert(resIncr == 1);
|
|
123
113
|
|
|
124
|
-
|
|
114
|
+
// The following copy tells the compiler that lhs's attributes are not modified outside this function
|
|
115
|
+
// This helps GCC to generate proper code.
|
|
116
|
+
LhsMapper lhs(alhs);
|
|
125
117
|
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
118
|
+
conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
|
|
119
|
+
conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
|
|
120
|
+
conj_helper<LhsPacketHalf, RhsPacketHalf, ConjugateLhs, ConjugateRhs> pcj_half;
|
|
121
|
+
conj_helper<LhsPacketQuarter, RhsPacketQuarter, ConjugateLhs, ConjugateRhs> pcj_quarter;
|
|
122
|
+
|
|
123
|
+
const Index lhsStride = lhs.stride();
|
|
124
|
+
// TODO: for padded aligned inputs, we could enable aligned reads
|
|
125
|
+
enum {
|
|
126
|
+
LhsAlignment = Unaligned,
|
|
127
|
+
ResPacketSize = Traits::ResPacketSize,
|
|
128
|
+
ResPacketSizeHalf = HalfTraits::ResPacketSize,
|
|
129
|
+
ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
|
|
130
|
+
LhsPacketSize = Traits::LhsPacketSize,
|
|
131
|
+
HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
|
|
132
|
+
HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
|
|
133
|
+
};
|
|
134
|
+
|
|
135
|
+
const Index n8 = rows - 8 * ResPacketSize + 1;
|
|
136
|
+
const Index n4 = rows - 4 * ResPacketSize + 1;
|
|
137
|
+
const Index n3 = rows - 3 * ResPacketSize + 1;
|
|
138
|
+
const Index n2 = rows - 2 * ResPacketSize + 1;
|
|
139
|
+
const Index n1 = rows - 1 * ResPacketSize + 1;
|
|
140
|
+
const Index n_half = rows - 1 * ResPacketSizeHalf + 1;
|
|
141
|
+
const Index n_quarter = rows - 1 * ResPacketSizeQuarter + 1;
|
|
142
|
+
|
|
143
|
+
// TODO: improve the following heuristic:
|
|
144
|
+
const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 32000 ? 16 : 4);
|
|
145
|
+
ResPacket palpha = pset1<ResPacket>(alpha);
|
|
146
|
+
ResPacketHalf palpha_half = pset1<ResPacketHalf>(alpha);
|
|
147
|
+
ResPacketQuarter palpha_quarter = pset1<ResPacketQuarter>(alpha);
|
|
148
|
+
|
|
149
|
+
for (Index j2 = 0; j2 < cols; j2 += block_cols) {
|
|
150
|
+
Index jend = numext::mini(j2 + block_cols, cols);
|
|
151
|
+
Index i = 0;
|
|
152
|
+
for (; i < n8; i += ResPacketSize * 8) {
|
|
153
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
|
|
154
|
+
c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0)),
|
|
155
|
+
c4 = pset1<ResPacket>(ResScalar(0)), c5 = pset1<ResPacket>(ResScalar(0)),
|
|
156
|
+
c6 = pset1<ResPacket>(ResScalar(0)), c7 = pset1<ResPacket>(ResScalar(0));
|
|
157
|
+
|
|
158
|
+
for (Index j = j2; j < jend; j += 1) {
|
|
159
|
+
RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
|
|
160
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
|
|
161
|
+
c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
|
|
162
|
+
c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 2, j), b0, c2);
|
|
163
|
+
c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 3, j), b0, c3);
|
|
164
|
+
c4 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 4, j), b0, c4);
|
|
165
|
+
c5 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 5, j), b0, c5);
|
|
166
|
+
c6 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 6, j), b0, c6);
|
|
167
|
+
c7 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 7, j), b0, c7);
|
|
168
|
+
}
|
|
169
|
+
pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
|
|
170
|
+
pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
|
|
171
|
+
pstoreu(res + i + ResPacketSize * 2, pmadd(c2, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 2)));
|
|
172
|
+
pstoreu(res + i + ResPacketSize * 3, pmadd(c3, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 3)));
|
|
173
|
+
pstoreu(res + i + ResPacketSize * 4, pmadd(c4, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 4)));
|
|
174
|
+
pstoreu(res + i + ResPacketSize * 5, pmadd(c5, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 5)));
|
|
175
|
+
pstoreu(res + i + ResPacketSize * 6, pmadd(c6, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 6)));
|
|
176
|
+
pstoreu(res + i + ResPacketSize * 7, pmadd(c7, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 7)));
|
|
167
177
|
}
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
178
|
+
if (i < n4) {
|
|
179
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
|
|
180
|
+
c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0));
|
|
181
|
+
|
|
182
|
+
for (Index j = j2; j < jend; j += 1) {
|
|
183
|
+
RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
|
|
184
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
|
|
185
|
+
c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
|
|
186
|
+
c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 2, j), b0, c2);
|
|
187
|
+
c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 3, j), b0, c3);
|
|
188
|
+
}
|
|
189
|
+
pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
|
|
190
|
+
pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
|
|
191
|
+
pstoreu(res + i + ResPacketSize * 2, pmadd(c2, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 2)));
|
|
192
|
+
pstoreu(res + i + ResPacketSize * 3, pmadd(c3, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 3)));
|
|
193
|
+
|
|
194
|
+
i += ResPacketSize * 4;
|
|
172
195
|
}
|
|
196
|
+
if (i < n3) {
|
|
197
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
|
|
198
|
+
c2 = pset1<ResPacket>(ResScalar(0));
|
|
199
|
+
|
|
200
|
+
for (Index j = j2; j < jend; j += 1) {
|
|
201
|
+
RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
|
|
202
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
|
|
203
|
+
c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
|
|
204
|
+
c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 2, j), b0, c2);
|
|
205
|
+
}
|
|
206
|
+
pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
|
|
207
|
+
pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
|
|
208
|
+
pstoreu(res + i + ResPacketSize * 2, pmadd(c2, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 2)));
|
|
173
209
|
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
}
|
|
179
|
-
else if(Vectorizable)
|
|
180
|
-
{
|
|
181
|
-
alignedStart = 0;
|
|
182
|
-
alignedSize = size;
|
|
183
|
-
alignmentPattern = AllAligned;
|
|
184
|
-
}
|
|
210
|
+
i += ResPacketSize * 3;
|
|
211
|
+
}
|
|
212
|
+
if (i < n2) {
|
|
213
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0));
|
|
185
214
|
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
|
|
191
|
-
{
|
|
192
|
-
RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
|
|
193
|
-
ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
|
|
194
|
-
ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
|
|
195
|
-
ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));
|
|
196
|
-
|
|
197
|
-
// this helps a lot generating better binary code
|
|
198
|
-
const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1),
|
|
199
|
-
lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3);
|
|
200
|
-
|
|
201
|
-
if (Vectorizable)
|
|
202
|
-
{
|
|
203
|
-
/* explicit vectorization */
|
|
204
|
-
// process initial unaligned coeffs
|
|
205
|
-
for (Index j=0; j<alignedStart; ++j)
|
|
206
|
-
{
|
|
207
|
-
res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
|
|
208
|
-
res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
|
|
209
|
-
res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
|
|
210
|
-
res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
|
|
215
|
+
for (Index j = j2; j < jend; j += 1) {
|
|
216
|
+
RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
|
|
217
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
|
|
218
|
+
c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
|
|
211
219
|
}
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
case EvenAligned:
|
|
222
|
-
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
|
|
223
|
-
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
|
|
224
|
-
break;
|
|
225
|
-
case FirstAligned:
|
|
226
|
-
{
|
|
227
|
-
Index j = alignedStart;
|
|
228
|
-
if(peels>1)
|
|
229
|
-
{
|
|
230
|
-
LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
|
|
231
|
-
ResPacket T0, T1;
|
|
232
|
-
|
|
233
|
-
A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
|
|
234
|
-
A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
|
|
235
|
-
A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
|
|
236
|
-
|
|
237
|
-
for (; j<peeledSize; j+=peels*ResPacketSize)
|
|
238
|
-
{
|
|
239
|
-
A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
|
|
240
|
-
A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
|
|
241
|
-
A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
|
|
242
|
-
|
|
243
|
-
A00 = lhs0.template load<LhsPacket, Aligned>(j);
|
|
244
|
-
A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
|
|
245
|
-
T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
|
|
246
|
-
T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
|
|
247
|
-
|
|
248
|
-
T0 = pcj.pmadd(A01, ptmp1, T0);
|
|
249
|
-
A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
|
|
250
|
-
T0 = pcj.pmadd(A02, ptmp2, T0);
|
|
251
|
-
A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
|
|
252
|
-
T0 = pcj.pmadd(A03, ptmp3, T0);
|
|
253
|
-
pstore(&res[j],T0);
|
|
254
|
-
A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
|
|
255
|
-
T1 = pcj.pmadd(A11, ptmp1, T1);
|
|
256
|
-
T1 = pcj.pmadd(A12, ptmp2, T1);
|
|
257
|
-
T1 = pcj.pmadd(A13, ptmp3, T1);
|
|
258
|
-
pstore(&res[j+ResPacketSize],T1);
|
|
259
|
-
}
|
|
260
|
-
}
|
|
261
|
-
for (; j<alignedSize; j+=ResPacketSize)
|
|
262
|
-
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
|
|
263
|
-
break;
|
|
264
|
-
}
|
|
265
|
-
default:
|
|
266
|
-
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
|
|
267
|
-
_EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
|
|
268
|
-
break;
|
|
269
|
-
}
|
|
220
|
+
pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
|
|
221
|
+
pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
|
|
222
|
+
i += ResPacketSize * 2;
|
|
223
|
+
}
|
|
224
|
+
if (i < n1) {
|
|
225
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0));
|
|
226
|
+
for (Index j = j2; j < jend; j += 1) {
|
|
227
|
+
RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
|
|
228
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
|
|
270
229
|
}
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
/* process remaining coeffs (or all if there is no explicit vectorization) */
|
|
274
|
-
for (Index j=alignedSize; j<size; ++j)
|
|
275
|
-
{
|
|
276
|
-
res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
|
|
277
|
-
res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
|
|
278
|
-
res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
|
|
279
|
-
res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
|
|
230
|
+
pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
|
|
231
|
+
i += ResPacketSize;
|
|
280
232
|
}
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
do
|
|
287
|
-
{
|
|
288
|
-
for (Index k=start; k<end; ++k)
|
|
289
|
-
{
|
|
290
|
-
RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
|
|
291
|
-
const LhsScalars lhs0 = lhs.getVectorMapper(0, k);
|
|
292
|
-
|
|
293
|
-
if (Vectorizable)
|
|
294
|
-
{
|
|
295
|
-
/* explicit vectorization */
|
|
296
|
-
// process first unaligned result's coeffs
|
|
297
|
-
for (Index j=0; j<alignedStart; ++j)
|
|
298
|
-
res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
|
|
299
|
-
// process aligned result's coeffs
|
|
300
|
-
if (lhs0.template aligned<LhsPacket>(alignedStart))
|
|
301
|
-
for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
|
|
302
|
-
pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
|
|
303
|
-
else
|
|
304
|
-
for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
|
|
305
|
-
pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
|
|
233
|
+
if (HasHalf && i < n_half) {
|
|
234
|
+
ResPacketHalf c0 = pset1<ResPacketHalf>(ResScalar(0));
|
|
235
|
+
for (Index j = j2; j < jend; j += 1) {
|
|
236
|
+
RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j, 0));
|
|
237
|
+
c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf, LhsAlignment>(i + 0, j), b0, c0);
|
|
306
238
|
}
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
239
|
+
pstoreu(res + i + ResPacketSizeHalf * 0,
|
|
240
|
+
pmadd(c0, palpha_half, ploadu<ResPacketHalf>(res + i + ResPacketSizeHalf * 0)));
|
|
241
|
+
i += ResPacketSizeHalf;
|
|
242
|
+
}
|
|
243
|
+
if (HasQuarter && i < n_quarter) {
|
|
244
|
+
ResPacketQuarter c0 = pset1<ResPacketQuarter>(ResScalar(0));
|
|
245
|
+
for (Index j = j2; j < jend; j += 1) {
|
|
246
|
+
RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j, 0));
|
|
247
|
+
c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter, LhsAlignment>(i + 0, j), b0, c0);
|
|
248
|
+
}
|
|
249
|
+
pstoreu(res + i + ResPacketSizeQuarter * 0,
|
|
250
|
+
pmadd(c0, palpha_quarter, ploadu<ResPacketQuarter>(res + i + ResPacketSizeQuarter * 0)));
|
|
251
|
+
i += ResPacketSizeQuarter;
|
|
311
252
|
}
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
skipColumns = 0;
|
|
253
|
+
for (; i < rows; ++i) {
|
|
254
|
+
ResScalar c0(0);
|
|
255
|
+
for (Index j = j2; j < jend; j += 1) c0 += cj.pmul(lhs(i, j), rhs(j, 0));
|
|
256
|
+
res[i] += alpha * c0;
|
|
317
257
|
}
|
|
318
|
-
|
|
319
|
-
break;
|
|
320
|
-
} while(Vectorizable);
|
|
321
|
-
#undef _EIGEN_ACCUMULATE_PACKETS
|
|
258
|
+
}
|
|
322
259
|
}
|
|
323
260
|
|
|
324
261
|
/* Optimized row-major matrix * vector product:
|
|
325
|
-
* This algorithm processes 4 rows at
|
|
262
|
+
* This algorithm processes 4 rows at once, which allows us both to reduce
|
|
326
263
|
* the number of load/stores of the result by a factor 4 and to reduce
|
|
327
264
|
* the instruction dependency. Moreover, we know that all bands have the
|
|
328
265
|
* same alignment pattern.
|
|
@@ -331,289 +268,206 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C
|
|
|
331
268
|
* - alpha is always a complex (or converted to a complex)
|
|
332
269
|
* - no vectorization
|
|
333
270
|
*/
|
|
334
|
-
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
&& int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
|
|
342
|
-
LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
|
|
343
|
-
RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
|
|
344
|
-
ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
|
|
345
|
-
};
|
|
271
|
+
template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
|
|
272
|
+
typename RhsMapper, bool ConjugateRhs, int Version>
|
|
273
|
+
struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper,
|
|
274
|
+
ConjugateRhs, Version> {
|
|
275
|
+
typedef gemv_traits<LhsScalar, RhsScalar> Traits;
|
|
276
|
+
typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketHalf> HalfTraits;
|
|
277
|
+
typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketQuarter> QuarterTraits;
|
|
346
278
|
|
|
347
|
-
typedef typename
|
|
348
|
-
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
|
|
349
|
-
typedef typename packet_traits<ResScalar>::type _ResPacket;
|
|
279
|
+
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
|
350
280
|
|
|
351
|
-
typedef typename
|
|
352
|
-
typedef typename
|
|
353
|
-
typedef typename
|
|
281
|
+
typedef typename Traits::LhsPacket LhsPacket;
|
|
282
|
+
typedef typename Traits::RhsPacket RhsPacket;
|
|
283
|
+
typedef typename Traits::ResPacket ResPacket;
|
|
354
284
|
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
const RhsMapper& rhs,
|
|
359
|
-
ResScalar* res, Index resIncr,
|
|
360
|
-
ResScalar alpha);
|
|
361
|
-
};
|
|
285
|
+
typedef typename HalfTraits::LhsPacket LhsPacketHalf;
|
|
286
|
+
typedef typename HalfTraits::RhsPacket RhsPacketHalf;
|
|
287
|
+
typedef typename HalfTraits::ResPacket ResPacketHalf;
|
|
362
288
|
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
const LhsMapper& lhs,
|
|
367
|
-
const RhsMapper& rhs,
|
|
368
|
-
ResScalar* res, Index resIncr,
|
|
369
|
-
ResScalar alpha)
|
|
370
|
-
{
|
|
371
|
-
eigen_internal_assert(rhs.stride()==1);
|
|
372
|
-
|
|
373
|
-
#ifdef _EIGEN_ACCUMULATE_PACKETS
|
|
374
|
-
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
|
|
375
|
-
#endif
|
|
376
|
-
|
|
377
|
-
#define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
|
|
378
|
-
RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); \
|
|
379
|
-
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
|
|
380
|
-
ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
|
|
381
|
-
ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
|
|
382
|
-
ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }
|
|
383
|
-
|
|
384
|
-
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
|
|
385
|
-
conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
|
|
386
|
-
|
|
387
|
-
typedef typename LhsMapper::VectorMapper LhsScalars;
|
|
388
|
-
|
|
389
|
-
enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
|
|
390
|
-
const Index rowsAtOnce = 4;
|
|
391
|
-
const Index peels = 2;
|
|
392
|
-
const Index RhsPacketAlignedMask = RhsPacketSize-1;
|
|
393
|
-
const Index LhsPacketAlignedMask = LhsPacketSize-1;
|
|
394
|
-
const Index depth = cols;
|
|
395
|
-
const Index lhsStride = lhs.stride();
|
|
289
|
+
typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
|
|
290
|
+
typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
|
|
291
|
+
typedef typename QuarterTraits::ResPacket ResPacketQuarter;
|
|
396
292
|
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
//
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
{
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
293
|
+
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,
|
|
294
|
+
const RhsMapper& rhs, ResScalar* res, Index resIncr,
|
|
295
|
+
ResScalar alpha);
|
|
296
|
+
};
|
|
297
|
+
|
|
298
|
+
template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
|
|
299
|
+
typename RhsMapper, bool ConjugateRhs, int Version>
|
|
300
|
+
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void
|
|
301
|
+
general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs,
|
|
302
|
+
Version>::run(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
|
|
303
|
+
ResScalar* res, Index resIncr, ResScalar alpha) {
|
|
304
|
+
// The following copy tells the compiler that lhs's attributes are not modified outside this function
|
|
305
|
+
// This helps GCC to generate proper code.
|
|
306
|
+
LhsMapper lhs(alhs);
|
|
307
|
+
|
|
308
|
+
eigen_internal_assert(rhs.stride() == 1);
|
|
309
|
+
conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
|
|
310
|
+
conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
|
|
311
|
+
conj_helper<LhsPacketHalf, RhsPacketHalf, ConjugateLhs, ConjugateRhs> pcj_half;
|
|
312
|
+
conj_helper<LhsPacketQuarter, RhsPacketQuarter, ConjugateLhs, ConjugateRhs> pcj_quarter;
|
|
313
|
+
|
|
314
|
+
// TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
|
|
315
|
+
// processing 8 rows at once might be counter productive wrt cache.
|
|
316
|
+
const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? 0 : rows - 7;
|
|
317
|
+
const Index n4 = rows - 3;
|
|
318
|
+
const Index n2 = rows - 1;
|
|
319
|
+
|
|
320
|
+
// TODO: for padded aligned inputs, we could enable aligned reads
|
|
321
|
+
enum {
|
|
322
|
+
LhsAlignment = Unaligned,
|
|
323
|
+
ResPacketSize = Traits::ResPacketSize,
|
|
324
|
+
ResPacketSizeHalf = HalfTraits::ResPacketSize,
|
|
325
|
+
ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
|
|
326
|
+
LhsPacketSize = Traits::LhsPacketSize,
|
|
327
|
+
LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
|
|
328
|
+
LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
|
|
329
|
+
HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
|
|
330
|
+
HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
|
|
331
|
+
};
|
|
332
|
+
|
|
333
|
+
using UnsignedIndex = typename make_unsigned<Index>::type;
|
|
334
|
+
const Index fullColBlockEnd = LhsPacketSize * (UnsignedIndex(cols) / LhsPacketSize);
|
|
335
|
+
const Index halfColBlockEnd = LhsPacketSizeHalf * (UnsignedIndex(cols) / LhsPacketSizeHalf);
|
|
336
|
+
const Index quarterColBlockEnd = LhsPacketSizeQuarter * (UnsignedIndex(cols) / LhsPacketSizeQuarter);
|
|
337
|
+
|
|
338
|
+
Index i = 0;
|
|
339
|
+
for (; i < n8; i += 8) {
|
|
340
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
|
|
341
|
+
c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0)),
|
|
342
|
+
c4 = pset1<ResPacket>(ResScalar(0)), c5 = pset1<ResPacket>(ResScalar(0)),
|
|
343
|
+
c6 = pset1<ResPacket>(ResScalar(0)), c7 = pset1<ResPacket>(ResScalar(0));
|
|
344
|
+
|
|
345
|
+
for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
|
|
346
|
+
RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
|
|
347
|
+
|
|
348
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
|
|
349
|
+
c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 1, j), b0, c1);
|
|
350
|
+
c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 2, j), b0, c2);
|
|
351
|
+
c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 3, j), b0, c3);
|
|
352
|
+
c4 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 4, j), b0, c4);
|
|
353
|
+
c5 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 5, j), b0, c5);
|
|
354
|
+
c6 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 6, j), b0, c6);
|
|
355
|
+
c7 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 7, j), b0, c7);
|
|
441
356
|
}
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
357
|
+
ResScalar cc0 = predux(c0);
|
|
358
|
+
ResScalar cc1 = predux(c1);
|
|
359
|
+
ResScalar cc2 = predux(c2);
|
|
360
|
+
ResScalar cc3 = predux(c3);
|
|
361
|
+
ResScalar cc4 = predux(c4);
|
|
362
|
+
ResScalar cc5 = predux(c5);
|
|
363
|
+
ResScalar cc6 = predux(c6);
|
|
364
|
+
ResScalar cc7 = predux(c7);
|
|
365
|
+
|
|
366
|
+
for (Index j = fullColBlockEnd; j < cols; ++j) {
|
|
367
|
+
RhsScalar b0 = rhs(j, 0);
|
|
368
|
+
|
|
369
|
+
cc0 += cj.pmul(lhs(i + 0, j), b0);
|
|
370
|
+
cc1 += cj.pmul(lhs(i + 1, j), b0);
|
|
371
|
+
cc2 += cj.pmul(lhs(i + 2, j), b0);
|
|
372
|
+
cc3 += cj.pmul(lhs(i + 3, j), b0);
|
|
373
|
+
cc4 += cj.pmul(lhs(i + 4, j), b0);
|
|
374
|
+
cc5 += cj.pmul(lhs(i + 5, j), b0);
|
|
375
|
+
cc6 += cj.pmul(lhs(i + 6, j), b0);
|
|
376
|
+
cc7 += cj.pmul(lhs(i + 7, j), b0);
|
|
446
377
|
}
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
alignedStart = 0;
|
|
456
|
-
alignedSize = depth;
|
|
457
|
-
alignmentPattern = AllAligned;
|
|
378
|
+
res[(i + 0) * resIncr] += alpha * cc0;
|
|
379
|
+
res[(i + 1) * resIncr] += alpha * cc1;
|
|
380
|
+
res[(i + 2) * resIncr] += alpha * cc2;
|
|
381
|
+
res[(i + 3) * resIncr] += alpha * cc3;
|
|
382
|
+
res[(i + 4) * resIncr] += alpha * cc4;
|
|
383
|
+
res[(i + 5) * resIncr] += alpha * cc5;
|
|
384
|
+
res[(i + 6) * resIncr] += alpha * cc6;
|
|
385
|
+
res[(i + 7) * resIncr] += alpha * cc7;
|
|
458
386
|
}
|
|
387
|
+
for (; i < n4; i += 4) {
|
|
388
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
|
|
389
|
+
c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0));
|
|
459
390
|
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
|
|
464
|
-
for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
|
|
465
|
-
{
|
|
466
|
-
// FIXME: what is the purpose of this EIGEN_ALIGN_DEFAULT ??
|
|
467
|
-
EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
|
|
468
|
-
ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
|
|
469
|
-
|
|
470
|
-
// this helps the compiler generating good binary code
|
|
471
|
-
const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0), lhs1 = lhs.getVectorMapper(i+offset1, 0),
|
|
472
|
-
lhs2 = lhs.getVectorMapper(i+2, 0), lhs3 = lhs.getVectorMapper(i+offset3, 0);
|
|
473
|
-
|
|
474
|
-
if (Vectorizable)
|
|
475
|
-
{
|
|
476
|
-
/* explicit vectorization */
|
|
477
|
-
ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
|
|
478
|
-
ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));
|
|
479
|
-
|
|
480
|
-
// process initial unaligned coeffs
|
|
481
|
-
// FIXME this loop get vectorized by the compiler !
|
|
482
|
-
for (Index j=0; j<alignedStart; ++j)
|
|
483
|
-
{
|
|
484
|
-
RhsScalar b = rhs(j, 0);
|
|
485
|
-
tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
|
|
486
|
-
tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
|
|
487
|
-
}
|
|
391
|
+
for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
|
|
392
|
+
RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
|
|
488
393
|
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
/* Here we process 4 rows with two peeled iterations to hide
|
|
507
|
-
* the overhead of unaligned loads. Moreover unaligned loads are handled
|
|
508
|
-
* using special shift/move operations between the two aligned packets
|
|
509
|
-
* overlapping the desired unaligned packet. This is *much* more efficient
|
|
510
|
-
* than basic unaligned loads.
|
|
511
|
-
*/
|
|
512
|
-
LhsPacket A01, A02, A03, A11, A12, A13;
|
|
513
|
-
A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
|
|
514
|
-
A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
|
|
515
|
-
A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
|
|
516
|
-
|
|
517
|
-
for (; j<peeledSize; j+=peels*RhsPacketSize)
|
|
518
|
-
{
|
|
519
|
-
RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);
|
|
520
|
-
A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
|
|
521
|
-
A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
|
|
522
|
-
A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
|
|
523
|
-
|
|
524
|
-
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0);
|
|
525
|
-
ptmp1 = pcj.pmadd(A01, b, ptmp1);
|
|
526
|
-
A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
|
|
527
|
-
ptmp2 = pcj.pmadd(A02, b, ptmp2);
|
|
528
|
-
A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
|
|
529
|
-
ptmp3 = pcj.pmadd(A03, b, ptmp3);
|
|
530
|
-
A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
|
|
531
|
-
|
|
532
|
-
b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0);
|
|
533
|
-
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0);
|
|
534
|
-
ptmp1 = pcj.pmadd(A11, b, ptmp1);
|
|
535
|
-
ptmp2 = pcj.pmadd(A12, b, ptmp2);
|
|
536
|
-
ptmp3 = pcj.pmadd(A13, b, ptmp3);
|
|
537
|
-
}
|
|
538
|
-
}
|
|
539
|
-
for (; j<alignedSize; j+=RhsPacketSize)
|
|
540
|
-
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
|
|
541
|
-
break;
|
|
542
|
-
}
|
|
543
|
-
default:
|
|
544
|
-
for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
|
|
545
|
-
_EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
|
|
546
|
-
break;
|
|
547
|
-
}
|
|
548
|
-
tmp0 += predux(ptmp0);
|
|
549
|
-
tmp1 += predux(ptmp1);
|
|
550
|
-
tmp2 += predux(ptmp2);
|
|
551
|
-
tmp3 += predux(ptmp3);
|
|
552
|
-
}
|
|
553
|
-
} // end explicit vectorization
|
|
554
|
-
|
|
555
|
-
// process remaining coeffs (or all if no explicit vectorization)
|
|
556
|
-
// FIXME this loop get vectorized by the compiler !
|
|
557
|
-
for (Index j=alignedSize; j<depth; ++j)
|
|
558
|
-
{
|
|
559
|
-
RhsScalar b = rhs(j, 0);
|
|
560
|
-
tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
|
|
561
|
-
tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
|
|
394
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
|
|
395
|
+
c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 1, j), b0, c1);
|
|
396
|
+
c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 2, j), b0, c2);
|
|
397
|
+
c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 3, j), b0, c3);
|
|
398
|
+
}
|
|
399
|
+
ResScalar cc0 = predux(c0);
|
|
400
|
+
ResScalar cc1 = predux(c1);
|
|
401
|
+
ResScalar cc2 = predux(c2);
|
|
402
|
+
ResScalar cc3 = predux(c3);
|
|
403
|
+
|
|
404
|
+
for (Index j = fullColBlockEnd; j < cols; ++j) {
|
|
405
|
+
RhsScalar b0 = rhs(j, 0);
|
|
406
|
+
|
|
407
|
+
cc0 += cj.pmul(lhs(i + 0, j), b0);
|
|
408
|
+
cc1 += cj.pmul(lhs(i + 1, j), b0);
|
|
409
|
+
cc2 += cj.pmul(lhs(i + 2, j), b0);
|
|
410
|
+
cc3 += cj.pmul(lhs(i + 3, j), b0);
|
|
562
411
|
}
|
|
563
|
-
res[i*resIncr]
|
|
564
|
-
res[(i+
|
|
565
|
-
res[(i+2)*resIncr]
|
|
566
|
-
res[(i+
|
|
412
|
+
res[(i + 0) * resIncr] += alpha * cc0;
|
|
413
|
+
res[(i + 1) * resIncr] += alpha * cc1;
|
|
414
|
+
res[(i + 2) * resIncr] += alpha * cc2;
|
|
415
|
+
res[(i + 3) * resIncr] += alpha * cc3;
|
|
567
416
|
}
|
|
417
|
+
for (; i < n2; i += 2) {
|
|
418
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0));
|
|
568
419
|
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
Index start = rowBound;
|
|
572
|
-
do
|
|
573
|
-
{
|
|
574
|
-
for (Index i=start; i<end; ++i)
|
|
575
|
-
{
|
|
576
|
-
EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
|
|
577
|
-
ResPacket ptmp0 = pset1<ResPacket>(tmp0);
|
|
578
|
-
const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
|
|
579
|
-
// process first unaligned result's coeffs
|
|
580
|
-
// FIXME this loop get vectorized by the compiler !
|
|
581
|
-
for (Index j=0; j<alignedStart; ++j)
|
|
582
|
-
tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
|
|
583
|
-
|
|
584
|
-
if (alignedSize>alignedStart)
|
|
585
|
-
{
|
|
586
|
-
// process aligned rhs coeffs
|
|
587
|
-
if (lhs0.template aligned<LhsPacket>(alignedStart))
|
|
588
|
-
for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
|
|
589
|
-
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
|
|
590
|
-
else
|
|
591
|
-
for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
|
|
592
|
-
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
|
|
593
|
-
tmp0 += predux(ptmp0);
|
|
594
|
-
}
|
|
420
|
+
for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
|
|
421
|
+
RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
|
|
595
422
|
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
for (Index j=alignedSize; j<depth; ++j)
|
|
599
|
-
tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
|
|
600
|
-
res[i*resIncr] += alpha*tmp0;
|
|
601
|
-
}
|
|
602
|
-
if (skipRows)
|
|
603
|
-
{
|
|
604
|
-
start = 0;
|
|
605
|
-
end = skipRows;
|
|
606
|
-
skipRows = 0;
|
|
423
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
|
|
424
|
+
c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 1, j), b0, c1);
|
|
607
425
|
}
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
} while(Vectorizable);
|
|
426
|
+
ResScalar cc0 = predux(c0);
|
|
427
|
+
ResScalar cc1 = predux(c1);
|
|
611
428
|
|
|
612
|
-
|
|
429
|
+
for (Index j = fullColBlockEnd; j < cols; ++j) {
|
|
430
|
+
RhsScalar b0 = rhs(j, 0);
|
|
431
|
+
|
|
432
|
+
cc0 += cj.pmul(lhs(i + 0, j), b0);
|
|
433
|
+
cc1 += cj.pmul(lhs(i + 1, j), b0);
|
|
434
|
+
}
|
|
435
|
+
res[(i + 0) * resIncr] += alpha * cc0;
|
|
436
|
+
res[(i + 1) * resIncr] += alpha * cc1;
|
|
437
|
+
}
|
|
438
|
+
for (; i < rows; ++i) {
|
|
439
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0));
|
|
440
|
+
ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
|
|
441
|
+
ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
|
|
442
|
+
|
|
443
|
+
for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
|
|
444
|
+
RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
|
|
445
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i, j), b0, c0);
|
|
446
|
+
}
|
|
447
|
+
ResScalar cc0 = predux(c0);
|
|
448
|
+
if (HasHalf) {
|
|
449
|
+
for (Index j = fullColBlockEnd; j < halfColBlockEnd; j += LhsPacketSizeHalf) {
|
|
450
|
+
RhsPacketHalf b0 = rhs.template load<RhsPacketHalf, Unaligned>(j, 0);
|
|
451
|
+
c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf, LhsAlignment>(i, j), b0, c0_h);
|
|
452
|
+
}
|
|
453
|
+
cc0 += predux(c0_h);
|
|
454
|
+
}
|
|
455
|
+
if (HasQuarter) {
|
|
456
|
+
for (Index j = halfColBlockEnd; j < quarterColBlockEnd; j += LhsPacketSizeQuarter) {
|
|
457
|
+
RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter, Unaligned>(j, 0);
|
|
458
|
+
c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter, LhsAlignment>(i, j), b0, c0_q);
|
|
459
|
+
}
|
|
460
|
+
cc0 += predux(c0_q);
|
|
461
|
+
}
|
|
462
|
+
for (Index j = quarterColBlockEnd; j < cols; ++j) {
|
|
463
|
+
cc0 += cj.pmul(lhs(i, j), rhs(j, 0));
|
|
464
|
+
}
|
|
465
|
+
res[i * resIncr] += alpha * cc0;
|
|
466
|
+
}
|
|
613
467
|
}
|
|
614
468
|
|
|
615
|
-
}
|
|
469
|
+
} // end namespace internal
|
|
616
470
|
|
|
617
|
-
}
|
|
471
|
+
} // end namespace Eigen
|
|
618
472
|
|
|
619
|
-
#endif
|
|
473
|
+
#endif // EIGEN_GENERAL_MATRIX_VECTOR_H
|