@smake/eigen 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/eigen/Eigen/AccelerateSupport +52 -0
- package/eigen/Eigen/Cholesky +18 -20
- package/eigen/Eigen/CholmodSupport +28 -28
- package/eigen/Eigen/Core +187 -120
- package/eigen/Eigen/Eigenvalues +16 -13
- package/eigen/Eigen/Geometry +18 -18
- package/eigen/Eigen/Householder +9 -7
- package/eigen/Eigen/IterativeLinearSolvers +8 -4
- package/eigen/Eigen/Jacobi +14 -13
- package/eigen/Eigen/KLUSupport +23 -21
- package/eigen/Eigen/LU +15 -16
- package/eigen/Eigen/MetisSupport +12 -12
- package/eigen/Eigen/OrderingMethods +54 -51
- package/eigen/Eigen/PaStiXSupport +23 -21
- package/eigen/Eigen/PardisoSupport +17 -14
- package/eigen/Eigen/QR +18 -20
- package/eigen/Eigen/QtAlignedMalloc +5 -12
- package/eigen/Eigen/SPQRSupport +21 -14
- package/eigen/Eigen/SVD +23 -17
- package/eigen/Eigen/Sparse +1 -2
- package/eigen/Eigen/SparseCholesky +18 -15
- package/eigen/Eigen/SparseCore +18 -17
- package/eigen/Eigen/SparseLU +9 -9
- package/eigen/Eigen/SparseQR +16 -14
- package/eigen/Eigen/StdDeque +5 -2
- package/eigen/Eigen/StdList +5 -2
- package/eigen/Eigen/StdVector +5 -2
- package/eigen/Eigen/SuperLUSupport +30 -24
- package/eigen/Eigen/ThreadPool +80 -0
- package/eigen/Eigen/UmfPackSupport +19 -17
- package/eigen/Eigen/Version +14 -0
- package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
- package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/LDLT.h +366 -405
- package/eigen/Eigen/src/Cholesky/LLT.h +323 -367
- package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
- package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +585 -529
- package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/ArithmeticSequence.h +143 -317
- package/eigen/Eigen/src/Core/Array.h +329 -370
- package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
- package/eigen/Eigen/src/Core/ArrayWrapper.h +126 -170
- package/eigen/Eigen/src/Core/Assign.h +30 -40
- package/eigen/Eigen/src/Core/AssignEvaluator.h +651 -604
- package/eigen/Eigen/src/Core/Assign_MKL.h +125 -120
- package/eigen/Eigen/src/Core/BandMatrix.h +267 -282
- package/eigen/Eigen/src/Core/Block.h +371 -390
- package/eigen/Eigen/src/Core/CommaInitializer.h +85 -100
- package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
- package/eigen/Eigen/src/Core/CoreEvaluators.h +1214 -937
- package/eigen/Eigen/src/Core/CoreIterators.h +72 -63
- package/eigen/Eigen/src/Core/CwiseBinaryOp.h +112 -129
- package/eigen/Eigen/src/Core/CwiseNullaryOp.h +676 -702
- package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
- package/eigen/Eigen/src/Core/CwiseUnaryOp.h +55 -67
- package/eigen/Eigen/src/Core/CwiseUnaryView.h +127 -92
- package/eigen/Eigen/src/Core/DenseBase.h +630 -658
- package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -628
- package/eigen/Eigen/src/Core/DenseStorage.h +511 -590
- package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
- package/eigen/Eigen/src/Core/Diagonal.h +168 -207
- package/eigen/Eigen/src/Core/DiagonalMatrix.h +346 -317
- package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
- package/eigen/Eigen/src/Core/Dot.h +167 -217
- package/eigen/Eigen/src/Core/EigenBase.h +74 -85
- package/eigen/Eigen/src/Core/Fill.h +138 -0
- package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
- package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -113
- package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
- package/eigen/Eigen/src/Core/GeneralProduct.h +315 -261
- package/eigen/Eigen/src/Core/GenericPacketMath.h +1182 -520
- package/eigen/Eigen/src/Core/GlobalFunctions.h +193 -157
- package/eigen/Eigen/src/Core/IO.h +131 -156
- package/eigen/Eigen/src/Core/IndexedView.h +209 -125
- package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
- package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/Inverse.h +50 -59
- package/eigen/Eigen/src/Core/Map.h +123 -141
- package/eigen/Eigen/src/Core/MapBase.h +255 -282
- package/eigen/Eigen/src/Core/MathFunctions.h +1247 -1201
- package/eigen/Eigen/src/Core/MathFunctionsImpl.h +162 -99
- package/eigen/Eigen/src/Core/Matrix.h +463 -494
- package/eigen/Eigen/src/Core/MatrixBase.h +468 -470
- package/eigen/Eigen/src/Core/NestByValue.h +58 -52
- package/eigen/Eigen/src/Core/NoAlias.h +79 -86
- package/eigen/Eigen/src/Core/NumTraits.h +206 -206
- package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +163 -142
- package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
- package/eigen/Eigen/src/Core/PlainObjectBase.h +858 -972
- package/eigen/Eigen/src/Core/Product.h +246 -130
- package/eigen/Eigen/src/Core/ProductEvaluators.h +779 -671
- package/eigen/Eigen/src/Core/Random.h +153 -164
- package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
- package/eigen/Eigen/src/Core/RealView.h +250 -0
- package/eigen/Eigen/src/Core/Redux.h +334 -314
- package/eigen/Eigen/src/Core/Ref.h +259 -257
- package/eigen/Eigen/src/Core/Replicate.h +92 -104
- package/eigen/Eigen/src/Core/Reshaped.h +215 -271
- package/eigen/Eigen/src/Core/ReturnByValue.h +47 -55
- package/eigen/Eigen/src/Core/Reverse.h +133 -148
- package/eigen/Eigen/src/Core/Select.h +68 -140
- package/eigen/Eigen/src/Core/SelfAdjointView.h +254 -290
- package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
- package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
- package/eigen/Eigen/src/Core/Solve.h +88 -102
- package/eigen/Eigen/src/Core/SolveTriangular.h +126 -124
- package/eigen/Eigen/src/Core/SolverBase.h +132 -133
- package/eigen/Eigen/src/Core/StableNorm.h +113 -147
- package/eigen/Eigen/src/Core/StlIterators.h +404 -248
- package/eigen/Eigen/src/Core/Stride.h +90 -92
- package/eigen/Eigen/src/Core/Swap.h +70 -39
- package/eigen/Eigen/src/Core/Transpose.h +258 -295
- package/eigen/Eigen/src/Core/Transpositions.h +270 -333
- package/eigen/Eigen/src/Core/TriangularMatrix.h +642 -743
- package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
- package/eigen/Eigen/src/Core/VectorwiseOp.h +653 -704
- package/eigen/Eigen/src/Core/Visitor.h +464 -308
- package/eigen/Eigen/src/Core/arch/AVX/Complex.h +380 -187
- package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +65 -163
- package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2145 -638
- package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
- package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +253 -60
- package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +278 -228
- package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +48 -269
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1597 -754
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
- package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +229 -41
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +420 -184
- package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +40 -49
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2962 -2213
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +196 -212
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +713 -441
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2380 -1362
- package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
- package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +390 -224
- package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +78 -67
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1784 -799
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +167 -50
- package/eigen/Eigen/src/Core/arch/Default/Half.h +528 -379
- package/eigen/Eigen/src/Core/arch/Default/Settings.h +10 -12
- package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
- package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +41 -40
- package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +550 -523
- package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
- package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +27 -30
- package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +8 -8
- package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
- package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
- package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
- package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
- package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
- package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
- package/eigen/Eigen/src/Core/arch/MSA/Complex.h +54 -82
- package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +84 -92
- package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +51 -47
- package/eigen/Eigen/src/Core/arch/NEON/Complex.h +454 -306
- package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +175 -115
- package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +23 -30
- package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4366 -2857
- package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +616 -393
- package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
- package/eigen/Eigen/src/Core/arch/SSE/Complex.h +350 -198
- package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +38 -149
- package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +1791 -912
- package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
- package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +128 -40
- package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +10 -6
- package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +156 -234
- package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +6 -3
- package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +27 -32
- package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +119 -117
- package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +325 -419
- package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +15 -17
- package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +325 -181
- package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +94 -83
- package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +811 -458
- package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +121 -124
- package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +576 -370
- package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +194 -109
- package/eigen/Eigen/src/Core/functors/StlFunctors.h +95 -112
- package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
- package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1038 -749
- package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1883 -1375
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +312 -370
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +189 -176
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +84 -81
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +292 -337
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
- package/eigen/Eigen/src/Core/products/Parallelizer.h +207 -105
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +327 -388
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +138 -147
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
- package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
- package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -47
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -277
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
- package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +68 -94
- package/eigen/Eigen/src/Core/util/Assert.h +158 -0
- package/eigen/Eigen/src/Core/util/BlasUtil.h +342 -303
- package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +348 -317
- package/eigen/Eigen/src/Core/util/Constants.h +297 -262
- package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -90
- package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
- package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +449 -247
- package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
- package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
- package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +417 -116
- package/eigen/Eigen/src/Core/util/IntegralConstant.h +211 -204
- package/eigen/Eigen/src/Core/util/MKL_support.h +39 -37
- package/eigen/Eigen/src/Core/util/Macros.h +655 -773
- package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
- package/eigen/Eigen/src/Core/util/Memory.h +970 -748
- package/eigen/Eigen/src/Core/util/Meta.h +581 -633
- package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
- package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
- package/eigen/Eigen/src/Core/util/ReshapedHelper.h +17 -17
- package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
- package/eigen/Eigen/src/Core/util/StaticAssert.h +50 -166
- package/eigen/Eigen/src/Core/util/SymbolicIndex.h +377 -225
- package/eigen/Eigen/src/Core/util/XprHelper.h +784 -547
- package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
- package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
- package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
- package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
- package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
- package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +89 -105
- package/eigen/Eigen/src/Eigenvalues/RealQZ.h +537 -607
- package/eigen/Eigen/src/Eigenvalues/RealSchur.h +342 -381
- package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +541 -595
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
- package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +430 -462
- package/eigen/Eigen/src/Geometry/AlignedBox.h +226 -227
- package/eigen/Eigen/src/Geometry/AngleAxis.h +131 -133
- package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
- package/eigen/Eigen/src/Geometry/Homogeneous.h +285 -333
- package/eigen/Eigen/src/Geometry/Hyperplane.h +151 -160
- package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -146
- package/eigen/Eigen/src/Geometry/ParametrizedLine.h +127 -127
- package/eigen/Eigen/src/Geometry/Quaternion.h +566 -506
- package/eigen/Eigen/src/Geometry/Rotation2D.h +107 -105
- package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
- package/eigen/Eigen/src/Geometry/Scaling.h +113 -106
- package/eigen/Eigen/src/Geometry/Transform.h +858 -936
- package/eigen/Eigen/src/Geometry/Translation.h +94 -92
- package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
- package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +90 -104
- package/eigen/Eigen/src/Householder/BlockHouseholder.h +51 -46
- package/eigen/Eigen/src/Householder/Householder.h +102 -124
- package/eigen/Eigen/src/Householder/HouseholderSequence.h +412 -453
- package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +149 -162
- package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +124 -119
- package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +92 -104
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +251 -243
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +224 -228
- package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +178 -227
- package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +79 -84
- package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +54 -60
- package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Jacobi/Jacobi.h +252 -308
- package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/KLUSupport/KLUSupport.h +208 -227
- package/eigen/Eigen/src/LU/Determinant.h +50 -69
- package/eigen/Eigen/src/LU/FullPivLU.h +545 -596
- package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/LU/InverseImpl.h +206 -285
- package/eigen/Eigen/src/LU/PartialPivLU.h +390 -428
- package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
- package/eigen/Eigen/src/LU/arch/InverseSize4.h +72 -70
- package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
- package/eigen/Eigen/src/OrderingMethods/Amd.h +243 -265
- package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +831 -1004
- package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/OrderingMethods/Ordering.h +112 -119
- package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
- package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -430
- package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +479 -479
- package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
- package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +166 -153
- package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +495 -475
- package/eigen/Eigen/src/QR/HouseholderQR.h +394 -285
- package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
- package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +244 -264
- package/eigen/Eigen/src/SVD/BDCSVD.h +817 -713
- package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
- package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SVD/JacobiSVD.h +577 -543
- package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
- package/eigen/Eigen/src/SVD/SVDBase.h +242 -182
- package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +200 -235
- package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +765 -594
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +308 -94
- package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
- package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -252
- package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +134 -178
- package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCore/SparseAssign.h +149 -140
- package/eigen/Eigen/src/SparseCore/SparseBlock.h +403 -440
- package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
- package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +525 -303
- package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +555 -339
- package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
- package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +169 -197
- package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
- package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
- package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
- package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
- package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1603 -1245
- package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -350
- package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
- package/eigen/Eigen/src/SparseCore/SparseProduct.h +94 -97
- package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
- package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
- package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +370 -416
- package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
- package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
- package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
- package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
- package/eigen/Eigen/src/SparseCore/SparseUtil.h +138 -115
- package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
- package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
- package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
- package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseLU/SparseLU.h +756 -710
- package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
- package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
- package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
- package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +245 -301
- package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
- package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
- package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +89 -100
- package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
- package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
- package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +124 -132
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
- package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
- package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
- package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
- package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseQR/SparseQR.h +450 -502
- package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -93
- package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
- package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
- package/eigen/Eigen/src/StlSupport/details.h +48 -50
- package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -730
- package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
- package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
- package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
- package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
- package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
- package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
- package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
- package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
- package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
- package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
- package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
- package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
- package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +428 -464
- package/eigen/Eigen/src/misc/Image.h +41 -43
- package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/misc/Kernel.h +39 -41
- package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
- package/eigen/Eigen/src/misc/blas.h +83 -426
- package/eigen/Eigen/src/misc/lapacke.h +9972 -16179
- package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
- package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
- package/eigen/Eigen/src/plugins/{BlockMethods.h → BlockMethods.inc} +434 -506
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
- package/eigen/Eigen/src/plugins/{CommonCwiseUnaryOps.h → CommonCwiseUnaryOps.inc} +58 -68
- package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
- package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
- package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
- package/package.json +1 -1
- package/eigen/COPYING.APACHE +0 -203
- package/eigen/COPYING.BSD +0 -26
- package/eigen/COPYING.GPL +0 -674
- package/eigen/COPYING.LGPL +0 -502
- package/eigen/COPYING.MINPACK +0 -51
- package/eigen/COPYING.MPL2 +0 -373
- package/eigen/COPYING.README +0 -18
- package/eigen/Eigen/src/Core/BooleanRedux.h +0 -162
- package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -258
- package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +0 -120
- package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +0 -694
- package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
- package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
- package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
- package/eigen/Eigen/src/misc/lapack.h +0 -152
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -358
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -696
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
- package/eigen/Eigen/src/plugins/IndexedViewMethods.h +0 -262
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -95
- package/eigen/Eigen/src/plugins/ReshapedMethods.h +0 -149
- package/eigen/README.md +0 -5
|
@@ -10,61 +10,61 @@
|
|
|
10
10
|
#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
|
|
11
11
|
#define EIGEN_GENERAL_MATRIX_VECTOR_H
|
|
12
12
|
|
|
13
|
+
// IWYU pragma: private
|
|
14
|
+
#include "../InternalHeaderCheck.h"
|
|
15
|
+
|
|
13
16
|
namespace Eigen {
|
|
14
17
|
|
|
15
18
|
namespace internal {
|
|
16
19
|
|
|
17
|
-
enum GEMVPacketSizeType {
|
|
18
|
-
GEMVPacketFull = 0,
|
|
19
|
-
GEMVPacketHalf,
|
|
20
|
-
GEMVPacketQuarter
|
|
21
|
-
};
|
|
20
|
+
enum GEMVPacketSizeType { GEMVPacketFull = 0, GEMVPacketHalf, GEMVPacketQuarter };
|
|
22
21
|
|
|
23
22
|
template <int N, typename T1, typename T2, typename T3>
|
|
24
|
-
struct gemv_packet_cond {
|
|
23
|
+
struct gemv_packet_cond {
|
|
24
|
+
typedef T3 type;
|
|
25
|
+
};
|
|
25
26
|
|
|
26
27
|
template <typename T1, typename T2, typename T3>
|
|
27
|
-
struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> {
|
|
28
|
+
struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> {
|
|
29
|
+
typedef T1 type;
|
|
30
|
+
};
|
|
28
31
|
|
|
29
32
|
template <typename T1, typename T2, typename T3>
|
|
30
|
-
struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> {
|
|
33
|
+
struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> {
|
|
34
|
+
typedef T2 type;
|
|
35
|
+
};
|
|
31
36
|
|
|
32
|
-
template<typename LhsScalar, typename RhsScalar, int
|
|
33
|
-
class gemv_traits
|
|
34
|
-
{
|
|
37
|
+
template <typename LhsScalar, typename RhsScalar, int PacketSize_ = GEMVPacketFull>
|
|
38
|
+
class gemv_traits {
|
|
35
39
|
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
|
36
40
|
|
|
37
|
-
#define
|
|
38
|
-
typedef typename gemv_packet_cond<
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
|
|
42
|
-
prefix ## name ## Packet
|
|
41
|
+
#define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size) \
|
|
42
|
+
typedef typename gemv_packet_cond< \
|
|
43
|
+
packet_size, typename packet_traits<name##Scalar>::type, typename packet_traits<name##Scalar>::half, \
|
|
44
|
+
typename unpacket_traits<typename packet_traits<name##Scalar>::half>::half>::type name##Packet##postfix
|
|
43
45
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
#undef
|
|
46
|
+
PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
|
|
47
|
+
PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
|
|
48
|
+
PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
|
|
49
|
+
#undef PACKET_DECL_COND_POSTFIX
|
|
48
50
|
|
|
49
|
-
public:
|
|
51
|
+
public:
|
|
50
52
|
enum {
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1
|
|
53
|
+
Vectorizable = unpacket_traits<LhsPacket_>::vectorizable && unpacket_traits<RhsPacket_>::vectorizable &&
|
|
54
|
+
int(unpacket_traits<LhsPacket_>::size) == int(unpacket_traits<RhsPacket_>::size),
|
|
55
|
+
LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
|
|
56
|
+
RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
|
|
57
|
+
ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1
|
|
57
58
|
};
|
|
58
59
|
|
|
59
|
-
typedef
|
|
60
|
-
typedef
|
|
61
|
-
typedef
|
|
60
|
+
typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
|
|
61
|
+
typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
|
|
62
|
+
typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
|
|
62
63
|
};
|
|
63
64
|
|
|
64
|
-
|
|
65
65
|
/* Optimized col-major matrix * vector product:
|
|
66
66
|
* This algorithm processes the matrix per vertical panels,
|
|
67
|
-
* which are then processed
|
|
67
|
+
* which are then processed horizontally per chunk of 8*PacketSize x 1 vertical segments.
|
|
68
68
|
*
|
|
69
69
|
* Mixing type logic: C += alpha * A * B
|
|
70
70
|
* | A | B |alpha| comments
|
|
@@ -75,12 +75,13 @@ public:
|
|
|
75
75
|
*
|
|
76
76
|
* The same reasoning apply for the transposed case.
|
|
77
77
|
*/
|
|
78
|
-
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
typedef gemv_traits<LhsScalar,RhsScalar
|
|
83
|
-
typedef gemv_traits<LhsScalar,RhsScalar,
|
|
78
|
+
template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
|
|
79
|
+
typename RhsMapper, bool ConjugateRhs, int Version>
|
|
80
|
+
struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper,
|
|
81
|
+
ConjugateRhs, Version> {
|
|
82
|
+
typedef gemv_traits<LhsScalar, RhsScalar> Traits;
|
|
83
|
+
typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketHalf> HalfTraits;
|
|
84
|
+
typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketQuarter> QuarterTraits;
|
|
84
85
|
|
|
85
86
|
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
|
86
87
|
|
|
@@ -96,190 +97,163 @@ struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugat
|
|
|
96
97
|
typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
|
|
97
98
|
typedef typename QuarterTraits::ResPacket ResPacketQuarter;
|
|
98
99
|
|
|
99
|
-
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
const RhsMapper& rhs,
|
|
103
|
-
ResScalar* res, Index resIncr,
|
|
104
|
-
RhsScalar alpha);
|
|
100
|
+
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,
|
|
101
|
+
const RhsMapper& rhs, ResScalar* res, Index resIncr,
|
|
102
|
+
RhsScalar alpha);
|
|
105
103
|
};
|
|
106
104
|
|
|
107
|
-
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
RhsScalar alpha)
|
|
114
|
-
{
|
|
105
|
+
template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
|
|
106
|
+
typename RhsMapper, bool ConjugateRhs, int Version>
|
|
107
|
+
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void
|
|
108
|
+
general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs,
|
|
109
|
+
Version>::run(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
|
|
110
|
+
ResScalar* res, Index resIncr, RhsScalar alpha) {
|
|
115
111
|
EIGEN_UNUSED_VARIABLE(resIncr);
|
|
116
|
-
eigen_internal_assert(resIncr==1);
|
|
112
|
+
eigen_internal_assert(resIncr == 1);
|
|
117
113
|
|
|
118
114
|
// The following copy tells the compiler that lhs's attributes are not modified outside this function
|
|
119
|
-
// This helps GCC to generate
|
|
115
|
+
// This helps GCC to generate proper code.
|
|
120
116
|
LhsMapper lhs(alhs);
|
|
121
117
|
|
|
122
|
-
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
|
|
123
|
-
conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
|
|
124
|
-
conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
|
|
125
|
-
conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
|
|
118
|
+
conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
|
|
119
|
+
conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
|
|
120
|
+
conj_helper<LhsPacketHalf, RhsPacketHalf, ConjugateLhs, ConjugateRhs> pcj_half;
|
|
121
|
+
conj_helper<LhsPacketQuarter, RhsPacketQuarter, ConjugateLhs, ConjugateRhs> pcj_quarter;
|
|
126
122
|
|
|
127
123
|
const Index lhsStride = lhs.stride();
|
|
128
124
|
// TODO: for padded aligned inputs, we could enable aligned reads
|
|
129
|
-
enum {
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
125
|
+
enum {
|
|
126
|
+
LhsAlignment = Unaligned,
|
|
127
|
+
ResPacketSize = Traits::ResPacketSize,
|
|
128
|
+
ResPacketSizeHalf = HalfTraits::ResPacketSize,
|
|
129
|
+
ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
|
|
130
|
+
LhsPacketSize = Traits::LhsPacketSize,
|
|
131
|
+
HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
|
|
132
|
+
HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
|
|
136
133
|
};
|
|
137
134
|
|
|
138
|
-
const Index n8 = rows-8*ResPacketSize+1;
|
|
139
|
-
const Index n4 = rows-4*ResPacketSize+1;
|
|
140
|
-
const Index n3 = rows-3*ResPacketSize+1;
|
|
141
|
-
const Index n2 = rows-2*ResPacketSize+1;
|
|
142
|
-
const Index n1 = rows-1*ResPacketSize+1;
|
|
143
|
-
const Index n_half = rows-1*ResPacketSizeHalf+1;
|
|
144
|
-
const Index n_quarter = rows-1*ResPacketSizeQuarter+1;
|
|
135
|
+
const Index n8 = rows - 8 * ResPacketSize + 1;
|
|
136
|
+
const Index n4 = rows - 4 * ResPacketSize + 1;
|
|
137
|
+
const Index n3 = rows - 3 * ResPacketSize + 1;
|
|
138
|
+
const Index n2 = rows - 2 * ResPacketSize + 1;
|
|
139
|
+
const Index n1 = rows - 1 * ResPacketSize + 1;
|
|
140
|
+
const Index n_half = rows - 1 * ResPacketSizeHalf + 1;
|
|
141
|
+
const Index n_quarter = rows - 1 * ResPacketSizeQuarter + 1;
|
|
145
142
|
|
|
146
143
|
// TODO: improve the following heuristic:
|
|
147
|
-
const Index block_cols = cols<128 ? cols : (lhsStride*sizeof(LhsScalar)<32000?16:4);
|
|
144
|
+
const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 32000 ? 16 : 4);
|
|
148
145
|
ResPacket palpha = pset1<ResPacket>(alpha);
|
|
149
146
|
ResPacketHalf palpha_half = pset1<ResPacketHalf>(alpha);
|
|
150
147
|
ResPacketQuarter palpha_quarter = pset1<ResPacketQuarter>(alpha);
|
|
151
148
|
|
|
152
|
-
for(Index j2=0; j2<cols; j2+=block_cols)
|
|
153
|
-
|
|
154
|
-
Index
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
|
|
172
|
-
c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
|
|
173
|
-
c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
|
|
174
|
-
c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*4,j),b0,c4);
|
|
175
|
-
c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*5,j),b0,c5);
|
|
176
|
-
c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*6,j),b0,c6);
|
|
177
|
-
c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*7,j),b0,c7);
|
|
149
|
+
for (Index j2 = 0; j2 < cols; j2 += block_cols) {
|
|
150
|
+
Index jend = numext::mini(j2 + block_cols, cols);
|
|
151
|
+
Index i = 0;
|
|
152
|
+
for (; i < n8; i += ResPacketSize * 8) {
|
|
153
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
|
|
154
|
+
c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0)),
|
|
155
|
+
c4 = pset1<ResPacket>(ResScalar(0)), c5 = pset1<ResPacket>(ResScalar(0)),
|
|
156
|
+
c6 = pset1<ResPacket>(ResScalar(0)), c7 = pset1<ResPacket>(ResScalar(0));
|
|
157
|
+
|
|
158
|
+
for (Index j = j2; j < jend; j += 1) {
|
|
159
|
+
RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
|
|
160
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
|
|
161
|
+
c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
|
|
162
|
+
c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 2, j), b0, c2);
|
|
163
|
+
c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 3, j), b0, c3);
|
|
164
|
+
c4 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 4, j), b0, c4);
|
|
165
|
+
c5 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 5, j), b0, c5);
|
|
166
|
+
c6 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 6, j), b0, c6);
|
|
167
|
+
c7 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 7, j), b0, c7);
|
|
178
168
|
}
|
|
179
|
-
pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
|
|
180
|
-
pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
|
|
181
|
-
pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
|
|
182
|
-
pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
|
|
183
|
-
pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu<ResPacket>(res+i+ResPacketSize*4)));
|
|
184
|
-
pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu<ResPacket>(res+i+ResPacketSize*5)));
|
|
185
|
-
pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu<ResPacket>(res+i+ResPacketSize*6)));
|
|
186
|
-
pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu<ResPacket>(res+i+ResPacketSize*7)));
|
|
169
|
+
pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
|
|
170
|
+
pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
|
|
171
|
+
pstoreu(res + i + ResPacketSize * 2, pmadd(c2, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 2)));
|
|
172
|
+
pstoreu(res + i + ResPacketSize * 3, pmadd(c3, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 3)));
|
|
173
|
+
pstoreu(res + i + ResPacketSize * 4, pmadd(c4, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 4)));
|
|
174
|
+
pstoreu(res + i + ResPacketSize * 5, pmadd(c5, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 5)));
|
|
175
|
+
pstoreu(res + i + ResPacketSize * 6, pmadd(c6, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 6)));
|
|
176
|
+
pstoreu(res + i + ResPacketSize * 7, pmadd(c7, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 7)));
|
|
187
177
|
}
|
|
188
|
-
if(i<n4)
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
|
|
199
|
-
c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
|
|
200
|
-
c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
|
|
201
|
-
c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
|
|
178
|
+
if (i < n4) {
|
|
179
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
|
|
180
|
+
c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0));
|
|
181
|
+
|
|
182
|
+
for (Index j = j2; j < jend; j += 1) {
|
|
183
|
+
RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
|
|
184
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
|
|
185
|
+
c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
|
|
186
|
+
c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 2, j), b0, c2);
|
|
187
|
+
c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 3, j), b0, c3);
|
|
202
188
|
}
|
|
203
|
-
pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
|
|
204
|
-
pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
|
|
205
|
-
pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
|
|
206
|
-
pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
|
|
189
|
+
pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
|
|
190
|
+
pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
|
|
191
|
+
pstoreu(res + i + ResPacketSize * 2, pmadd(c2, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 2)));
|
|
192
|
+
pstoreu(res + i + ResPacketSize * 3, pmadd(c3, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 3)));
|
|
207
193
|
|
|
208
|
-
i+=ResPacketSize*4;
|
|
194
|
+
i += ResPacketSize * 4;
|
|
209
195
|
}
|
|
210
|
-
if(i<n3)
|
|
211
|
-
|
|
212
|
-
ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
|
|
213
|
-
c1 = pset1<ResPacket>(ResScalar(0)),
|
|
196
|
+
if (i < n3) {
|
|
197
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
|
|
214
198
|
c2 = pset1<ResPacket>(ResScalar(0));
|
|
215
199
|
|
|
216
|
-
for(Index j=j2; j<jend; j+=1)
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
|
|
200
|
+
for (Index j = j2; j < jend; j += 1) {
|
|
201
|
+
RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
|
|
202
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
|
|
203
|
+
c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
|
|
204
|
+
c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 2, j), b0, c2);
|
|
222
205
|
}
|
|
223
|
-
pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
|
|
224
|
-
pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
|
|
225
|
-
pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
|
|
206
|
+
pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
|
|
207
|
+
pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
|
|
208
|
+
pstoreu(res + i + ResPacketSize * 2, pmadd(c2, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 2)));
|
|
226
209
|
|
|
227
|
-
i+=ResPacketSize*3;
|
|
210
|
+
i += ResPacketSize * 3;
|
|
228
211
|
}
|
|
229
|
-
if(i<n2)
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
|
|
237
|
-
c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
|
|
238
|
-
c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
|
|
212
|
+
if (i < n2) {
|
|
213
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0));
|
|
214
|
+
|
|
215
|
+
for (Index j = j2; j < jend; j += 1) {
|
|
216
|
+
RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
|
|
217
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
|
|
218
|
+
c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
|
|
239
219
|
}
|
|
240
|
-
pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
|
|
241
|
-
pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
|
|
242
|
-
i+=ResPacketSize*2;
|
|
220
|
+
pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
|
|
221
|
+
pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
|
|
222
|
+
i += ResPacketSize * 2;
|
|
243
223
|
}
|
|
244
|
-
if(i<n1)
|
|
245
|
-
{
|
|
224
|
+
if (i < n1) {
|
|
246
225
|
ResPacket c0 = pset1<ResPacket>(ResScalar(0));
|
|
247
|
-
for(Index j=j2; j<jend; j+=1)
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
|
|
226
|
+
for (Index j = j2; j < jend; j += 1) {
|
|
227
|
+
RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
|
|
228
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
|
|
251
229
|
}
|
|
252
|
-
pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
|
|
253
|
-
i+=ResPacketSize;
|
|
230
|
+
pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
|
|
231
|
+
i += ResPacketSize;
|
|
254
232
|
}
|
|
255
|
-
if(HasHalf && i<n_half)
|
|
256
|
-
{
|
|
233
|
+
if (HasHalf && i < n_half) {
|
|
257
234
|
ResPacketHalf c0 = pset1<ResPacketHalf>(ResScalar(0));
|
|
258
|
-
for(Index j=j2; j<jend; j+=1)
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i+0,j),b0,c0);
|
|
235
|
+
for (Index j = j2; j < jend; j += 1) {
|
|
236
|
+
RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j, 0));
|
|
237
|
+
c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf, LhsAlignment>(i + 0, j), b0, c0);
|
|
262
238
|
}
|
|
263
|
-
pstoreu(res+i+ResPacketSizeHalf*0,
|
|
264
|
-
|
|
239
|
+
pstoreu(res + i + ResPacketSizeHalf * 0,
|
|
240
|
+
pmadd(c0, palpha_half, ploadu<ResPacketHalf>(res + i + ResPacketSizeHalf * 0)));
|
|
241
|
+
i += ResPacketSizeHalf;
|
|
265
242
|
}
|
|
266
|
-
if(HasQuarter && i<n_quarter)
|
|
267
|
-
{
|
|
243
|
+
if (HasQuarter && i < n_quarter) {
|
|
268
244
|
ResPacketQuarter c0 = pset1<ResPacketQuarter>(ResScalar(0));
|
|
269
|
-
for(Index j=j2; j<jend; j+=1)
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i+0,j),b0,c0);
|
|
245
|
+
for (Index j = j2; j < jend; j += 1) {
|
|
246
|
+
RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j, 0));
|
|
247
|
+
c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter, LhsAlignment>(i + 0, j), b0, c0);
|
|
273
248
|
}
|
|
274
|
-
pstoreu(res+i+ResPacketSizeQuarter*0,
|
|
275
|
-
|
|
249
|
+
pstoreu(res + i + ResPacketSizeQuarter * 0,
|
|
250
|
+
pmadd(c0, palpha_quarter, ploadu<ResPacketQuarter>(res + i + ResPacketSizeQuarter * 0)));
|
|
251
|
+
i += ResPacketSizeQuarter;
|
|
276
252
|
}
|
|
277
|
-
for(;i<rows
|
|
278
|
-
{
|
|
253
|
+
for (; i < rows; ++i) {
|
|
279
254
|
ResScalar c0(0);
|
|
280
|
-
for(Index j=j2; j<jend; j+=1)
|
|
281
|
-
|
|
282
|
-
res[i] += alpha*c0;
|
|
255
|
+
for (Index j = j2; j < jend; j += 1) c0 += cj.pmul(lhs(i, j), rhs(j, 0));
|
|
256
|
+
res[i] += alpha * c0;
|
|
283
257
|
}
|
|
284
258
|
}
|
|
285
259
|
}
|
|
@@ -294,12 +268,13 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
|
|
|
294
268
|
* - alpha is always a complex (or converted to a complex)
|
|
295
269
|
* - no vectorization
|
|
296
270
|
*/
|
|
297
|
-
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
typedef gemv_traits<LhsScalar,RhsScalar
|
|
302
|
-
typedef gemv_traits<LhsScalar,RhsScalar,
|
|
271
|
+
template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
|
|
272
|
+
typename RhsMapper, bool ConjugateRhs, int Version>
|
|
273
|
+
struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper,
|
|
274
|
+
ConjugateRhs, Version> {
|
|
275
|
+
typedef gemv_traits<LhsScalar, RhsScalar> Traits;
|
|
276
|
+
typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketHalf> HalfTraits;
|
|
277
|
+
typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketQuarter> QuarterTraits;
|
|
303
278
|
|
|
304
279
|
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
|
305
280
|
|
|
@@ -315,75 +290,69 @@ struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,Conjugat
|
|
|
315
290
|
typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
|
|
316
291
|
typedef typename QuarterTraits::ResPacket ResPacketQuarter;
|
|
317
292
|
|
|
318
|
-
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
const RhsMapper& rhs,
|
|
322
|
-
ResScalar* res, Index resIncr,
|
|
323
|
-
ResScalar alpha);
|
|
293
|
+
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,
|
|
294
|
+
const RhsMapper& rhs, ResScalar* res, Index resIncr,
|
|
295
|
+
ResScalar alpha);
|
|
324
296
|
};
|
|
325
297
|
|
|
326
|
-
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
ResScalar alpha)
|
|
333
|
-
{
|
|
298
|
+
template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
|
|
299
|
+
typename RhsMapper, bool ConjugateRhs, int Version>
|
|
300
|
+
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void
|
|
301
|
+
general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs,
|
|
302
|
+
Version>::run(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
|
|
303
|
+
ResScalar* res, Index resIncr, ResScalar alpha) {
|
|
334
304
|
// The following copy tells the compiler that lhs's attributes are not modified outside this function
|
|
335
|
-
// This helps GCC to generate
|
|
305
|
+
// This helps GCC to generate proper code.
|
|
336
306
|
LhsMapper lhs(alhs);
|
|
337
307
|
|
|
338
|
-
eigen_internal_assert(rhs.stride()==1);
|
|
339
|
-
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
|
|
340
|
-
conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
|
|
341
|
-
conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
|
|
342
|
-
conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
|
|
308
|
+
eigen_internal_assert(rhs.stride() == 1);
|
|
309
|
+
conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
|
|
310
|
+
conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
|
|
311
|
+
conj_helper<LhsPacketHalf, RhsPacketHalf, ConjugateLhs, ConjugateRhs> pcj_half;
|
|
312
|
+
conj_helper<LhsPacketQuarter, RhsPacketQuarter, ConjugateLhs, ConjugateRhs> pcj_quarter;
|
|
343
313
|
|
|
344
314
|
// TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
|
|
345
315
|
// processing 8 rows at once might be counter productive wrt cache.
|
|
346
|
-
const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7;
|
|
347
|
-
const Index n4 = rows-3;
|
|
348
|
-
const Index n2 = rows-1;
|
|
316
|
+
const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? 0 : rows - 7;
|
|
317
|
+
const Index n4 = rows - 3;
|
|
318
|
+
const Index n2 = rows - 1;
|
|
349
319
|
|
|
350
320
|
// TODO: for padded aligned inputs, we could enable aligned reads
|
|
351
|
-
enum {
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
321
|
+
enum {
|
|
322
|
+
LhsAlignment = Unaligned,
|
|
323
|
+
ResPacketSize = Traits::ResPacketSize,
|
|
324
|
+
ResPacketSizeHalf = HalfTraits::ResPacketSize,
|
|
325
|
+
ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
|
|
326
|
+
LhsPacketSize = Traits::LhsPacketSize,
|
|
327
|
+
LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
|
|
328
|
+
LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
|
|
329
|
+
HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
|
|
330
|
+
HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
|
|
360
331
|
};
|
|
361
332
|
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
c7 = pset1<ResPacket>(ResScalar(0));
|
|
373
|
-
|
|
374
|
-
Index j=0;
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+6,j),b0,c6);
|
|
386
|
-
c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+7,j),b0,c7);
|
|
333
|
+
using UnsignedIndex = typename make_unsigned<Index>::type;
|
|
334
|
+
const Index fullColBlockEnd = LhsPacketSize * (UnsignedIndex(cols) / LhsPacketSize);
|
|
335
|
+
const Index halfColBlockEnd = LhsPacketSizeHalf * (UnsignedIndex(cols) / LhsPacketSizeHalf);
|
|
336
|
+
const Index quarterColBlockEnd = LhsPacketSizeQuarter * (UnsignedIndex(cols) / LhsPacketSizeQuarter);
|
|
337
|
+
|
|
338
|
+
Index i = 0;
|
|
339
|
+
for (; i < n8; i += 8) {
|
|
340
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
|
|
341
|
+
c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0)),
|
|
342
|
+
c4 = pset1<ResPacket>(ResScalar(0)), c5 = pset1<ResPacket>(ResScalar(0)),
|
|
343
|
+
c6 = pset1<ResPacket>(ResScalar(0)), c7 = pset1<ResPacket>(ResScalar(0));
|
|
344
|
+
|
|
345
|
+
for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
|
|
346
|
+
RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
|
|
347
|
+
|
|
348
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
|
|
349
|
+
c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 1, j), b0, c1);
|
|
350
|
+
c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 2, j), b0, c2);
|
|
351
|
+
c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 3, j), b0, c3);
|
|
352
|
+
c4 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 4, j), b0, c4);
|
|
353
|
+
c5 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 5, j), b0, c5);
|
|
354
|
+
c6 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 6, j), b0, c6);
|
|
355
|
+
c7 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 7, j), b0, c7);
|
|
387
356
|
}
|
|
388
357
|
ResScalar cc0 = predux(c0);
|
|
389
358
|
ResScalar cc1 = predux(c1);
|
|
@@ -393,126 +362,112 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
|
|
|
393
362
|
ResScalar cc5 = predux(c5);
|
|
394
363
|
ResScalar cc6 = predux(c6);
|
|
395
364
|
ResScalar cc7 = predux(c7);
|
|
396
|
-
|
|
397
|
-
{
|
|
398
|
-
RhsScalar b0 = rhs(j,0);
|
|
399
|
-
|
|
400
|
-
cc0 += cj.pmul(lhs(i+0,j), b0);
|
|
401
|
-
cc1 += cj.pmul(lhs(i+1,j), b0);
|
|
402
|
-
cc2 += cj.pmul(lhs(i+2,j), b0);
|
|
403
|
-
cc3 += cj.pmul(lhs(i+3,j), b0);
|
|
404
|
-
cc4 += cj.pmul(lhs(i+4,j), b0);
|
|
405
|
-
cc5 += cj.pmul(lhs(i+5,j), b0);
|
|
406
|
-
cc6 += cj.pmul(lhs(i+6,j), b0);
|
|
407
|
-
cc7 += cj.pmul(lhs(i+7,j), b0);
|
|
365
|
+
|
|
366
|
+
for (Index j = fullColBlockEnd; j < cols; ++j) {
|
|
367
|
+
RhsScalar b0 = rhs(j, 0);
|
|
368
|
+
|
|
369
|
+
cc0 += cj.pmul(lhs(i + 0, j), b0);
|
|
370
|
+
cc1 += cj.pmul(lhs(i + 1, j), b0);
|
|
371
|
+
cc2 += cj.pmul(lhs(i + 2, j), b0);
|
|
372
|
+
cc3 += cj.pmul(lhs(i + 3, j), b0);
|
|
373
|
+
cc4 += cj.pmul(lhs(i + 4, j), b0);
|
|
374
|
+
cc5 += cj.pmul(lhs(i + 5, j), b0);
|
|
375
|
+
cc6 += cj.pmul(lhs(i + 6, j), b0);
|
|
376
|
+
cc7 += cj.pmul(lhs(i + 7, j), b0);
|
|
408
377
|
}
|
|
409
|
-
res[(i+0)*resIncr] += alpha*cc0;
|
|
410
|
-
res[(i+1)*resIncr] += alpha*cc1;
|
|
411
|
-
res[(i+2)*resIncr] += alpha*cc2;
|
|
412
|
-
res[(i+3)*resIncr] += alpha*cc3;
|
|
413
|
-
res[(i+4)*resIncr] += alpha*cc4;
|
|
414
|
-
res[(i+5)*resIncr] += alpha*cc5;
|
|
415
|
-
res[(i+6)*resIncr] += alpha*cc6;
|
|
416
|
-
res[(i+7)*resIncr] += alpha*cc7;
|
|
378
|
+
res[(i + 0) * resIncr] += alpha * cc0;
|
|
379
|
+
res[(i + 1) * resIncr] += alpha * cc1;
|
|
380
|
+
res[(i + 2) * resIncr] += alpha * cc2;
|
|
381
|
+
res[(i + 3) * resIncr] += alpha * cc3;
|
|
382
|
+
res[(i + 4) * resIncr] += alpha * cc4;
|
|
383
|
+
res[(i + 5) * resIncr] += alpha * cc5;
|
|
384
|
+
res[(i + 6) * resIncr] += alpha * cc6;
|
|
385
|
+
res[(i + 7) * resIncr] += alpha * cc7;
|
|
417
386
|
}
|
|
418
|
-
for(; i<n4; i+=4)
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
|
|
431
|
-
c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
|
|
432
|
-
c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
|
|
433
|
-
c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
|
|
387
|
+
for (; i < n4; i += 4) {
|
|
388
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
|
|
389
|
+
c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0));
|
|
390
|
+
|
|
391
|
+
for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
|
|
392
|
+
RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
|
|
393
|
+
|
|
394
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
|
|
395
|
+
c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 1, j), b0, c1);
|
|
396
|
+
c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 2, j), b0, c2);
|
|
397
|
+
c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 3, j), b0, c3);
|
|
434
398
|
}
|
|
435
399
|
ResScalar cc0 = predux(c0);
|
|
436
400
|
ResScalar cc1 = predux(c1);
|
|
437
401
|
ResScalar cc2 = predux(c2);
|
|
438
402
|
ResScalar cc3 = predux(c3);
|
|
439
|
-
|
|
440
|
-
{
|
|
441
|
-
RhsScalar b0 = rhs(j,0);
|
|
442
|
-
|
|
443
|
-
cc0 += cj.pmul(lhs(i+0,j), b0);
|
|
444
|
-
cc1 += cj.pmul(lhs(i+1,j), b0);
|
|
445
|
-
cc2 += cj.pmul(lhs(i+2,j), b0);
|
|
446
|
-
cc3 += cj.pmul(lhs(i+3,j), b0);
|
|
403
|
+
|
|
404
|
+
for (Index j = fullColBlockEnd; j < cols; ++j) {
|
|
405
|
+
RhsScalar b0 = rhs(j, 0);
|
|
406
|
+
|
|
407
|
+
cc0 += cj.pmul(lhs(i + 0, j), b0);
|
|
408
|
+
cc1 += cj.pmul(lhs(i + 1, j), b0);
|
|
409
|
+
cc2 += cj.pmul(lhs(i + 2, j), b0);
|
|
410
|
+
cc3 += cj.pmul(lhs(i + 3, j), b0);
|
|
447
411
|
}
|
|
448
|
-
res[(i+0)*resIncr] += alpha*cc0;
|
|
449
|
-
res[(i+1)*resIncr] += alpha*cc1;
|
|
450
|
-
res[(i+2)*resIncr] += alpha*cc2;
|
|
451
|
-
res[(i+3)*resIncr] += alpha*cc3;
|
|
412
|
+
res[(i + 0) * resIncr] += alpha * cc0;
|
|
413
|
+
res[(i + 1) * resIncr] += alpha * cc1;
|
|
414
|
+
res[(i + 2) * resIncr] += alpha * cc2;
|
|
415
|
+
res[(i + 3) * resIncr] += alpha * cc3;
|
|
452
416
|
}
|
|
453
|
-
for(; i<n2; i+=2)
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
|
|
462
|
-
|
|
463
|
-
c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
|
|
464
|
-
c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
|
|
417
|
+
for (; i < n2; i += 2) {
|
|
418
|
+
ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0));
|
|
419
|
+
|
|
420
|
+
for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
|
|
421
|
+
RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
|
|
422
|
+
|
|
423
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
|
|
424
|
+
c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 1, j), b0, c1);
|
|
465
425
|
}
|
|
466
426
|
ResScalar cc0 = predux(c0);
|
|
467
427
|
ResScalar cc1 = predux(c1);
|
|
468
|
-
for(; j<cols; ++j)
|
|
469
|
-
{
|
|
470
|
-
RhsScalar b0 = rhs(j,0);
|
|
471
428
|
|
|
472
|
-
|
|
473
|
-
|
|
429
|
+
for (Index j = fullColBlockEnd; j < cols; ++j) {
|
|
430
|
+
RhsScalar b0 = rhs(j, 0);
|
|
431
|
+
|
|
432
|
+
cc0 += cj.pmul(lhs(i + 0, j), b0);
|
|
433
|
+
cc1 += cj.pmul(lhs(i + 1, j), b0);
|
|
474
434
|
}
|
|
475
|
-
res[(i+0)*resIncr] += alpha*cc0;
|
|
476
|
-
res[(i+1)*resIncr] += alpha*cc1;
|
|
435
|
+
res[(i + 0) * resIncr] += alpha * cc0;
|
|
436
|
+
res[(i + 1) * resIncr] += alpha * cc1;
|
|
477
437
|
}
|
|
478
|
-
for(; i<rows; ++i)
|
|
479
|
-
{
|
|
438
|
+
for (; i < rows; ++i) {
|
|
480
439
|
ResPacket c0 = pset1<ResPacket>(ResScalar(0));
|
|
481
440
|
ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
|
|
482
441
|
ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
|
|
483
|
-
|
|
484
|
-
for(; j
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i,j),b0,c0);
|
|
442
|
+
|
|
443
|
+
for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
|
|
444
|
+
RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
|
|
445
|
+
c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i, j), b0, c0);
|
|
488
446
|
}
|
|
489
447
|
ResScalar cc0 = predux(c0);
|
|
490
448
|
if (HasHalf) {
|
|
491
|
-
for(; j
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
}
|
|
449
|
+
for (Index j = fullColBlockEnd; j < halfColBlockEnd; j += LhsPacketSizeHalf) {
|
|
450
|
+
RhsPacketHalf b0 = rhs.template load<RhsPacketHalf, Unaligned>(j, 0);
|
|
451
|
+
c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf, LhsAlignment>(i, j), b0, c0_h);
|
|
452
|
+
}
|
|
496
453
|
cc0 += predux(c0_h);
|
|
497
454
|
}
|
|
498
455
|
if (HasQuarter) {
|
|
499
|
-
for(; j
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
}
|
|
456
|
+
for (Index j = halfColBlockEnd; j < quarterColBlockEnd; j += LhsPacketSizeQuarter) {
|
|
457
|
+
RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter, Unaligned>(j, 0);
|
|
458
|
+
c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter, LhsAlignment>(i, j), b0, c0_q);
|
|
459
|
+
}
|
|
504
460
|
cc0 += predux(c0_q);
|
|
505
461
|
}
|
|
506
|
-
for(; j<cols; ++j)
|
|
507
|
-
|
|
508
|
-
cc0 += cj.pmul(lhs(i,j), rhs(j,0));
|
|
462
|
+
for (Index j = quarterColBlockEnd; j < cols; ++j) {
|
|
463
|
+
cc0 += cj.pmul(lhs(i, j), rhs(j, 0));
|
|
509
464
|
}
|
|
510
|
-
res[i*resIncr] += alpha*cc0;
|
|
465
|
+
res[i * resIncr] += alpha * cc0;
|
|
511
466
|
}
|
|
512
467
|
}
|
|
513
468
|
|
|
514
|
-
}
|
|
469
|
+
} // end namespace internal
|
|
515
470
|
|
|
516
|
-
}
|
|
471
|
+
} // end namespace Eigen
|
|
517
472
|
|
|
518
|
-
#endif
|
|
473
|
+
#endif // EIGEN_GENERAL_MATRIX_VECTOR_H
|