@smake/eigen 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/eigen/COPYING.APACHE +203 -0
- package/eigen/COPYING.BSD +1 -1
- package/eigen/COPYING.MINPACK +51 -52
- package/eigen/Eigen/Cholesky +0 -1
- package/eigen/Eigen/Core +108 -266
- package/eigen/Eigen/Eigenvalues +0 -1
- package/eigen/Eigen/Geometry +3 -6
- package/eigen/Eigen/Householder +0 -1
- package/eigen/Eigen/Jacobi +0 -1
- package/eigen/Eigen/KLUSupport +41 -0
- package/eigen/Eigen/LU +2 -5
- package/eigen/Eigen/OrderingMethods +0 -3
- package/eigen/Eigen/PaStiXSupport +1 -0
- package/eigen/Eigen/PardisoSupport +0 -0
- package/eigen/Eigen/QR +0 -1
- package/eigen/Eigen/QtAlignedMalloc +0 -1
- package/eigen/Eigen/SVD +0 -1
- package/eigen/Eigen/Sparse +0 -2
- package/eigen/Eigen/SparseCholesky +0 -8
- package/eigen/Eigen/SparseLU +4 -0
- package/eigen/Eigen/src/Cholesky/LDLT.h +42 -27
- package/eigen/Eigen/src/Cholesky/LLT.h +39 -23
- package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +90 -47
- package/eigen/Eigen/src/Core/ArithmeticSequence.h +413 -0
- package/eigen/Eigen/src/Core/Array.h +99 -11
- package/eigen/Eigen/src/Core/ArrayBase.h +1 -1
- package/eigen/Eigen/src/Core/ArrayWrapper.h +21 -21
- package/eigen/Eigen/src/Core/Assign.h +1 -1
- package/eigen/Eigen/src/Core/AssignEvaluator.h +125 -50
- package/eigen/Eigen/src/Core/Assign_MKL.h +10 -10
- package/eigen/Eigen/src/Core/BandMatrix.h +16 -16
- package/eigen/Eigen/src/Core/Block.h +56 -60
- package/eigen/Eigen/src/Core/BooleanRedux.h +29 -31
- package/eigen/Eigen/src/Core/CommaInitializer.h +7 -3
- package/eigen/Eigen/src/Core/CoreEvaluators.h +325 -272
- package/eigen/Eigen/src/Core/CoreIterators.h +5 -0
- package/eigen/Eigen/src/Core/CwiseBinaryOp.h +21 -22
- package/eigen/Eigen/src/Core/CwiseNullaryOp.h +153 -18
- package/eigen/Eigen/src/Core/CwiseUnaryOp.h +6 -6
- package/eigen/Eigen/src/Core/CwiseUnaryView.h +12 -10
- package/eigen/Eigen/src/Core/DenseBase.h +128 -39
- package/eigen/Eigen/src/Core/DenseCoeffsBase.h +25 -21
- package/eigen/Eigen/src/Core/DenseStorage.h +150 -68
- package/eigen/Eigen/src/Core/Diagonal.h +21 -23
- package/eigen/Eigen/src/Core/DiagonalMatrix.h +50 -2
- package/eigen/Eigen/src/Core/DiagonalProduct.h +1 -1
- package/eigen/Eigen/src/Core/Dot.h +10 -10
- package/eigen/Eigen/src/Core/EigenBase.h +10 -9
- package/eigen/Eigen/src/Core/ForceAlignedAccess.h +8 -4
- package/eigen/Eigen/src/Core/Fuzzy.h +3 -3
- package/eigen/Eigen/src/Core/GeneralProduct.h +20 -10
- package/eigen/Eigen/src/Core/GenericPacketMath.h +597 -147
- package/eigen/Eigen/src/Core/GlobalFunctions.h +40 -33
- package/eigen/Eigen/src/Core/IO.h +40 -7
- package/eigen/Eigen/src/Core/IndexedView.h +237 -0
- package/eigen/Eigen/src/Core/Inverse.h +9 -10
- package/eigen/Eigen/src/Core/Map.h +7 -7
- package/eigen/Eigen/src/Core/MapBase.h +5 -3
- package/eigen/Eigen/src/Core/MathFunctions.h +756 -120
- package/eigen/Eigen/src/Core/MathFunctionsImpl.h +118 -19
- package/eigen/Eigen/src/Core/Matrix.h +131 -25
- package/eigen/Eigen/src/Core/MatrixBase.h +19 -2
- package/eigen/Eigen/src/Core/NestByValue.h +25 -50
- package/eigen/Eigen/src/Core/NoAlias.h +4 -3
- package/eigen/Eigen/src/Core/NumTraits.h +107 -20
- package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +232 -0
- package/eigen/Eigen/src/Core/PermutationMatrix.h +3 -3
- package/eigen/Eigen/src/Core/PlainObjectBase.h +145 -54
- package/eigen/Eigen/src/Core/Product.h +30 -25
- package/eigen/Eigen/src/Core/ProductEvaluators.h +183 -142
- package/eigen/Eigen/src/Core/Random.h +37 -1
- package/eigen/Eigen/src/Core/Redux.h +180 -170
- package/eigen/Eigen/src/Core/Ref.h +118 -21
- package/eigen/Eigen/src/Core/Replicate.h +8 -8
- package/eigen/Eigen/src/Core/Reshaped.h +454 -0
- package/eigen/Eigen/src/Core/ReturnByValue.h +7 -5
- package/eigen/Eigen/src/Core/Reverse.h +18 -12
- package/eigen/Eigen/src/Core/Select.h +8 -6
- package/eigen/Eigen/src/Core/SelfAdjointView.h +33 -20
- package/eigen/Eigen/src/Core/Solve.h +14 -14
- package/eigen/Eigen/src/Core/SolveTriangular.h +13 -13
- package/eigen/Eigen/src/Core/SolverBase.h +41 -3
- package/eigen/Eigen/src/Core/StableNorm.h +100 -70
- package/eigen/Eigen/src/Core/StlIterators.h +463 -0
- package/eigen/Eigen/src/Core/Stride.h +9 -4
- package/eigen/Eigen/src/Core/Swap.h +5 -4
- package/eigen/Eigen/src/Core/Transpose.h +86 -27
- package/eigen/Eigen/src/Core/Transpositions.h +26 -8
- package/eigen/Eigen/src/Core/TriangularMatrix.h +88 -72
- package/eigen/Eigen/src/Core/VectorBlock.h +5 -5
- package/eigen/Eigen/src/Core/VectorwiseOp.h +159 -70
- package/eigen/Eigen/src/Core/Visitor.h +137 -29
- package/eigen/Eigen/src/Core/arch/AVX/Complex.h +50 -129
- package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +126 -337
- package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +1092 -155
- package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +65 -1
- package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +422 -0
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +186 -213
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1250 -252
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +89 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +152 -165
- package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +19 -251
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2937 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +221 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +629 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2042 -392
- package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +235 -80
- package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +700 -0
- package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +102 -14
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1649 -0
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +110 -0
- package/eigen/Eigen/src/Core/arch/Default/Half.h +942 -0
- package/eigen/Eigen/src/Core/arch/Default/Settings.h +1 -1
- package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +120 -0
- package/eigen/Eigen/src/Core/arch/{CUDA → GPU}/MathFunctions.h +16 -4
- package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1685 -0
- package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +80 -0
- package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
- package/eigen/Eigen/src/Core/arch/MSA/Complex.h +648 -0
- package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +387 -0
- package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1233 -0
- package/eigen/Eigen/src/Core/arch/NEON/Complex.h +313 -219
- package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +183 -0
- package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +54 -70
- package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4376 -549
- package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1419 -0
- package/eigen/Eigen/src/Core/arch/SSE/Complex.h +59 -179
- package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +65 -428
- package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +893 -283
- package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +65 -0
- package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +44 -0
- package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +752 -0
- package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +49 -0
- package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +232 -0
- package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +301 -0
- package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +670 -0
- package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +694 -0
- package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +85 -0
- package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +212 -183
- package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +101 -5
- package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +510 -395
- package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +11 -2
- package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +112 -46
- package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +31 -30
- package/eigen/Eigen/src/Core/functors/StlFunctors.h +32 -2
- package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +354 -15
- package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1073 -585
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +29 -7
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +4 -4
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +1 -1
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +382 -483
- package/eigen/Eigen/src/Core/products/Parallelizer.h +23 -9
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +23 -6
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +8 -6
- package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +2 -2
- package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +5 -4
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +3 -3
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +5 -3
- package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +13 -10
- package/eigen/Eigen/src/Core/util/BlasUtil.h +208 -124
- package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +512 -0
- package/eigen/Eigen/src/Core/util/Constants.h +25 -9
- package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +14 -2
- package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +28 -4
- package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +186 -0
- package/eigen/Eigen/src/Core/util/IntegralConstant.h +272 -0
- package/eigen/Eigen/src/Core/util/MKL_support.h +8 -1
- package/eigen/Eigen/src/Core/util/Macros.h +661 -250
- package/eigen/Eigen/src/Core/util/Memory.h +222 -52
- package/eigen/Eigen/src/Core/util/Meta.h +349 -105
- package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
- package/eigen/Eigen/src/Core/util/StaticAssert.h +8 -5
- package/eigen/Eigen/src/Core/util/SymbolicIndex.h +293 -0
- package/eigen/Eigen/src/Core/util/XprHelper.h +48 -30
- package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +1 -1
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +1 -1
- package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +2 -2
- package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +1 -1
- package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +2 -2
- package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +2 -2
- package/eigen/Eigen/src/Eigenvalues/RealQZ.h +9 -6
- package/eigen/Eigen/src/Eigenvalues/RealSchur.h +10 -5
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +75 -42
- package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +20 -15
- package/eigen/Eigen/src/Geometry/AlignedBox.h +99 -5
- package/eigen/Eigen/src/Geometry/AngleAxis.h +4 -4
- package/eigen/Eigen/src/Geometry/EulerAngles.h +3 -3
- package/eigen/Eigen/src/Geometry/Homogeneous.h +15 -11
- package/eigen/Eigen/src/Geometry/Hyperplane.h +1 -1
- package/eigen/Eigen/src/Geometry/OrthoMethods.h +3 -2
- package/eigen/Eigen/src/Geometry/ParametrizedLine.h +39 -2
- package/eigen/Eigen/src/Geometry/Quaternion.h +52 -14
- package/eigen/Eigen/src/Geometry/Rotation2D.h +3 -3
- package/eigen/Eigen/src/Geometry/Scaling.h +22 -4
- package/eigen/Eigen/src/Geometry/Transform.h +86 -65
- package/eigen/Eigen/src/Geometry/Translation.h +6 -6
- package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +168 -0
- package/eigen/Eigen/src/Householder/BlockHouseholder.h +9 -2
- package/eigen/Eigen/src/Householder/Householder.h +8 -4
- package/eigen/Eigen/src/Householder/HouseholderSequence.h +123 -48
- package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +15 -15
- package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +7 -23
- package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +5 -22
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +41 -47
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +51 -60
- package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +70 -20
- package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +2 -20
- package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +11 -9
- package/eigen/Eigen/src/Jacobi/Jacobi.h +31 -10
- package/eigen/Eigen/src/KLUSupport/KLUSupport.h +358 -0
- package/eigen/Eigen/src/LU/Determinant.h +35 -19
- package/eigen/Eigen/src/LU/FullPivLU.h +29 -43
- package/eigen/Eigen/src/LU/InverseImpl.h +25 -8
- package/eigen/Eigen/src/LU/PartialPivLU.h +67 -57
- package/eigen/Eigen/src/LU/arch/InverseSize4.h +351 -0
- package/eigen/Eigen/src/OrderingMethods/Amd.h +7 -17
- package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +297 -277
- package/eigen/Eigen/src/OrderingMethods/Ordering.h +6 -10
- package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +1 -1
- package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +10 -9
- package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +41 -20
- package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +100 -27
- package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +59 -22
- package/eigen/Eigen/src/QR/HouseholderQR.h +48 -23
- package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +25 -3
- package/eigen/Eigen/src/SVD/BDCSVD.h +137 -48
- package/eigen/Eigen/src/SVD/JacobiSVD.h +22 -14
- package/eigen/Eigen/src/SVD/SVDBase.h +82 -21
- package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +3 -3
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +16 -8
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +11 -36
- package/eigen/Eigen/src/SparseCore/CompressedStorage.h +16 -0
- package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +6 -6
- package/eigen/Eigen/src/SparseCore/SparseAssign.h +81 -27
- package/eigen/Eigen/src/SparseCore/SparseBlock.h +25 -57
- package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +40 -11
- package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +11 -15
- package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +2 -2
- package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +30 -8
- package/eigen/Eigen/src/SparseCore/SparseMatrix.h +124 -10
- package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +5 -12
- package/eigen/Eigen/src/SparseCore/SparseProduct.h +13 -1
- package/eigen/Eigen/src/SparseCore/SparseRef.h +7 -7
- package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +3 -0
- package/eigen/Eigen/src/SparseCore/SparseUtil.h +8 -0
- package/eigen/Eigen/src/SparseCore/SparseVector.h +1 -1
- package/eigen/Eigen/src/SparseLU/SparseLU.h +160 -10
- package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +1 -1
- package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +76 -2
- package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +2 -2
- package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +1 -1
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +1 -1
- package/eigen/Eigen/src/SparseQR/SparseQR.h +19 -6
- package/eigen/Eigen/src/StlSupport/StdDeque.h +2 -14
- package/eigen/Eigen/src/StlSupport/StdList.h +2 -2
- package/eigen/Eigen/src/StlSupport/StdVector.h +2 -2
- package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +6 -8
- package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +175 -39
- package/eigen/Eigen/src/misc/lapacke.h +5 -4
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +27 -1
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +155 -11
- package/eigen/Eigen/src/plugins/BlockMethods.h +626 -242
- package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +14 -0
- package/eigen/Eigen/src/plugins/IndexedViewMethods.h +262 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +4 -4
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +10 -0
- package/eigen/Eigen/src/plugins/ReshapedMethods.h +149 -0
- package/eigen/README.md +2 -0
- package/lib/LibEigen.d.ts +4 -0
- package/lib/LibEigen.js +14 -0
- package/lib/index.d.ts +1 -1
- package/lib/index.js +7 -3
- package/package.json +2 -10
- package/eigen/Eigen/CMakeLists.txt +0 -19
- package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
- package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
- package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
- package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
- package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
- package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
- package/lib/eigen.d.ts +0 -2
- package/lib/eigen.js +0 -15
|
@@ -10,6 +10,10 @@
|
|
|
10
10
|
#ifndef EIGEN_PARALLELIZER_H
|
|
11
11
|
#define EIGEN_PARALLELIZER_H
|
|
12
12
|
|
|
13
|
+
#if EIGEN_HAS_CXX11_ATOMIC
|
|
14
|
+
#include <atomic>
|
|
15
|
+
#endif
|
|
16
|
+
|
|
13
17
|
namespace Eigen {
|
|
14
18
|
|
|
15
19
|
namespace internal {
|
|
@@ -18,7 +22,7 @@ namespace internal {
|
|
|
18
22
|
inline void manage_multi_threading(Action action, int* v)
|
|
19
23
|
{
|
|
20
24
|
static int m_maxThreads = -1;
|
|
21
|
-
EIGEN_UNUSED_VARIABLE(m_maxThreads)
|
|
25
|
+
EIGEN_UNUSED_VARIABLE(m_maxThreads)
|
|
22
26
|
|
|
23
27
|
if(action==SetAction)
|
|
24
28
|
{
|
|
@@ -76,8 +80,17 @@ template<typename Index> struct GemmParallelInfo
|
|
|
76
80
|
{
|
|
77
81
|
GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
|
|
78
82
|
|
|
83
|
+
// volatile is not enough on all architectures (see bug 1572)
|
|
84
|
+
// to guarantee that when thread A says to thread B that it is
|
|
85
|
+
// done with packing a block, then all writes have been really
|
|
86
|
+
// carried out... C++11 memory model+atomic guarantees this.
|
|
87
|
+
#if EIGEN_HAS_CXX11_ATOMIC
|
|
88
|
+
std::atomic<Index> sync;
|
|
89
|
+
std::atomic<int> users;
|
|
90
|
+
#else
|
|
79
91
|
Index volatile sync;
|
|
80
92
|
int volatile users;
|
|
93
|
+
#endif
|
|
81
94
|
|
|
82
95
|
Index lhs_start;
|
|
83
96
|
Index lhs_length;
|
|
@@ -88,11 +101,14 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
|
|
|
88
101
|
{
|
|
89
102
|
// TODO when EIGEN_USE_BLAS is defined,
|
|
90
103
|
// we should still enable OMP for other scalar types
|
|
91
|
-
|
|
104
|
+
// Without C++11, we have to disable GEMM's parallelization on
|
|
105
|
+
// non x86 architectures because there volatile is not enough for our purpose.
|
|
106
|
+
// See bug 1572.
|
|
107
|
+
#if (! defined(EIGEN_HAS_OPENMP)) || defined(EIGEN_USE_BLAS) || ((!EIGEN_HAS_CXX11_ATOMIC) && !(EIGEN_ARCH_i386_OR_x86_64))
|
|
92
108
|
// FIXME the transpose variable is only needed to properly split
|
|
93
109
|
// the matrix product when multithreading is enabled. This is a temporary
|
|
94
110
|
// fix to support row-major destination matrices. This whole
|
|
95
|
-
// parallelizer mechanism has to be
|
|
111
|
+
// parallelizer mechanism has to be redesigned anyway.
|
|
96
112
|
EIGEN_UNUSED_VARIABLE(depth);
|
|
97
113
|
EIGEN_UNUSED_VARIABLE(transpose);
|
|
98
114
|
func(0,rows, 0,cols);
|
|
@@ -113,12 +129,12 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
|
|
|
113
129
|
double work = static_cast<double>(rows) * static_cast<double>(cols) *
|
|
114
130
|
static_cast<double>(depth);
|
|
115
131
|
double kMinTaskSize = 50000; // FIXME improve this heuristic.
|
|
116
|
-
pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, work / kMinTaskSize));
|
|
132
|
+
pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, static_cast<Index>( work / kMinTaskSize ) ));
|
|
117
133
|
|
|
118
134
|
// compute the number of threads we are going to use
|
|
119
135
|
Index threads = std::min<Index>(nbThreads(), pb_max_threads);
|
|
120
136
|
|
|
121
|
-
// if multi-threading is
|
|
137
|
+
// if multi-threading is explicitly disabled, not useful, or if we already are in a parallel session,
|
|
122
138
|
// then abort multi-threading
|
|
123
139
|
// FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp?
|
|
124
140
|
if((!Condition) || (threads==1) || (omp_get_num_threads()>1))
|
|
@@ -151,10 +167,8 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
|
|
|
151
167
|
info[i].lhs_start = r0;
|
|
152
168
|
info[i].lhs_length = actualBlockRows;
|
|
153
169
|
|
|
154
|
-
if(transpose)
|
|
155
|
-
|
|
156
|
-
else
|
|
157
|
-
func(0, rows, c0, actualBlockCols, info);
|
|
170
|
+
if(transpose) func(c0, actualBlockCols, 0, rows, info);
|
|
171
|
+
else func(0, rows, c0, actualBlockCols, info);
|
|
158
172
|
}
|
|
159
173
|
#endif
|
|
160
174
|
}
|
|
@@ -45,14 +45,23 @@ struct symm_pack_lhs
|
|
|
45
45
|
}
|
|
46
46
|
void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
|
|
47
47
|
{
|
|
48
|
-
|
|
48
|
+
typedef typename unpacket_traits<typename packet_traits<Scalar>::type>::half HalfPacket;
|
|
49
|
+
typedef typename unpacket_traits<typename unpacket_traits<typename packet_traits<Scalar>::type>::half>::half QuarterPacket;
|
|
50
|
+
enum { PacketSize = packet_traits<Scalar>::size,
|
|
51
|
+
HalfPacketSize = unpacket_traits<HalfPacket>::size,
|
|
52
|
+
QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
|
|
53
|
+
HasHalf = (int)HalfPacketSize < (int)PacketSize,
|
|
54
|
+
HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
|
|
55
|
+
|
|
49
56
|
const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
|
|
50
57
|
Index count = 0;
|
|
51
58
|
//Index peeled_mc3 = (rows/Pack1)*Pack1;
|
|
52
59
|
|
|
53
60
|
const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
|
|
54
61
|
const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
|
|
55
|
-
const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
|
|
62
|
+
const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
|
|
63
|
+
const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
|
|
64
|
+
const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? peeled_mc_half+((rows-peeled_mc_half)/(QuarterPacketSize))*(QuarterPacketSize) : 0;
|
|
56
65
|
|
|
57
66
|
if(Pack1>=3*PacketSize)
|
|
58
67
|
for(Index i=0; i<peeled_mc3; i+=3*PacketSize)
|
|
@@ -66,8 +75,16 @@ struct symm_pack_lhs
|
|
|
66
75
|
for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize)
|
|
67
76
|
pack<1*PacketSize>(blockA, lhs, cols, i, count);
|
|
68
77
|
|
|
78
|
+
if(HasHalf && Pack1>=HalfPacketSize)
|
|
79
|
+
for(Index i=peeled_mc1; i<peeled_mc_half; i+=HalfPacketSize)
|
|
80
|
+
pack<HalfPacketSize>(blockA, lhs, cols, i, count);
|
|
81
|
+
|
|
82
|
+
if(HasQuarter && Pack1>=QuarterPacketSize)
|
|
83
|
+
for(Index i=peeled_mc_half; i<peeled_mc_quarter; i+=QuarterPacketSize)
|
|
84
|
+
pack<QuarterPacketSize>(blockA, lhs, cols, i, count);
|
|
85
|
+
|
|
69
86
|
// do the same with mr==1
|
|
70
|
-
for(Index i=
|
|
87
|
+
for(Index i=peeled_mc_quarter; i<rows; i++)
|
|
71
88
|
{
|
|
72
89
|
for(Index k=0; k<i; k++)
|
|
73
90
|
blockA[count++] = lhs(i, k); // normal
|
|
@@ -355,7 +372,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
|
|
|
355
372
|
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
|
356
373
|
symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
|
357
374
|
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
|
|
358
|
-
gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
|
|
375
|
+
gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
|
|
359
376
|
|
|
360
377
|
for(Index k2=0; k2<size; k2+=kc)
|
|
361
378
|
{
|
|
@@ -390,7 +407,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
|
|
|
390
407
|
for(Index i2=k2+kc; i2<size; i2+=mc)
|
|
391
408
|
{
|
|
392
409
|
const Index actual_mc = (std::min)(i2+mc,size)-i2;
|
|
393
|
-
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
|
|
410
|
+
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder,false>()
|
|
394
411
|
(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
|
|
395
412
|
|
|
396
413
|
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
|
|
@@ -442,7 +459,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
|
|
|
442
459
|
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
|
|
443
460
|
|
|
444
461
|
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
|
445
|
-
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
|
462
|
+
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
|
|
446
463
|
symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
|
|
447
464
|
|
|
448
465
|
for(Index k2=0; k2<size; k2+=kc)
|
|
@@ -15,7 +15,7 @@ namespace Eigen {
|
|
|
15
15
|
namespace internal {
|
|
16
16
|
|
|
17
17
|
/* Optimized selfadjoint matrix * vector product:
|
|
18
|
-
* This algorithm processes 2 columns at
|
|
18
|
+
* This algorithm processes 2 columns at once that allows to both reduce
|
|
19
19
|
* the number of load/stores of the result by a factor 2 and to reduce
|
|
20
20
|
* the instruction dependency.
|
|
21
21
|
*/
|
|
@@ -27,7 +27,8 @@ template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool Conju
|
|
|
27
27
|
struct selfadjoint_matrix_vector_product
|
|
28
28
|
|
|
29
29
|
{
|
|
30
|
-
static EIGEN_DONT_INLINE
|
|
30
|
+
static EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
|
|
31
|
+
void run(
|
|
31
32
|
Index size,
|
|
32
33
|
const Scalar* lhs, Index lhsStride,
|
|
33
34
|
const Scalar* rhs,
|
|
@@ -36,7 +37,8 @@ static EIGEN_DONT_INLINE void run(
|
|
|
36
37
|
};
|
|
37
38
|
|
|
38
39
|
template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
|
|
39
|
-
EIGEN_DONT_INLINE
|
|
40
|
+
EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
|
|
41
|
+
void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
|
|
40
42
|
Index size,
|
|
41
43
|
const Scalar* lhs, Index lhsStride,
|
|
42
44
|
const Scalar* rhs,
|
|
@@ -62,8 +64,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
|
|
|
62
64
|
|
|
63
65
|
Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;
|
|
64
66
|
|
|
65
|
-
|
|
66
|
-
Index bound = (std::max)(Index(0),size-8) & 0xfffffffe;
|
|
67
|
+
Index bound = numext::maxi(Index(0), size-8) & 0xfffffffe;
|
|
67
68
|
if (FirstTriangular)
|
|
68
69
|
bound = size - bound;
|
|
69
70
|
|
|
@@ -175,7 +176,8 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
|
|
|
175
176
|
enum { LhsUpLo = LhsMode&(Upper|Lower) };
|
|
176
177
|
|
|
177
178
|
template<typename Dest>
|
|
178
|
-
static
|
|
179
|
+
static EIGEN_DEVICE_FUNC
|
|
180
|
+
void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
|
|
179
181
|
{
|
|
180
182
|
typedef typename Dest::Scalar ResScalar;
|
|
181
183
|
typedef typename Rhs::Scalar RhsScalar;
|
|
@@ -111,7 +111,7 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
|
|
|
111
111
|
Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex,
|
|
112
112
|
IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo>
|
|
113
113
|
::run(size, depth,
|
|
114
|
-
|
|
114
|
+
actualOther.data(), actualOther.outerStride(), actualOther.data(), actualOther.outerStride(),
|
|
115
115
|
mat.data(), mat.innerStride(), mat.outerStride(), actualAlpha, blocking);
|
|
116
116
|
}
|
|
117
117
|
};
|
|
@@ -120,7 +120,7 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
|
|
|
120
120
|
|
|
121
121
|
template<typename MatrixType, unsigned int UpLo>
|
|
122
122
|
template<typename DerivedU>
|
|
123
|
-
SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
|
|
123
|
+
EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
|
|
124
124
|
::rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha)
|
|
125
125
|
{
|
|
126
126
|
selfadjoint_product_selector<MatrixType,DerivedU,UpLo>::run(_expression().const_cast_derived(), u.derived(), alpha);
|
|
@@ -24,7 +24,8 @@ struct selfadjoint_rank2_update_selector;
|
|
|
24
24
|
template<typename Scalar, typename Index, typename UType, typename VType>
|
|
25
25
|
struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Lower>
|
|
26
26
|
{
|
|
27
|
-
static
|
|
27
|
+
static EIGEN_DEVICE_FUNC
|
|
28
|
+
void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
|
|
28
29
|
{
|
|
29
30
|
const Index size = u.size();
|
|
30
31
|
for (Index i=0; i<size; ++i)
|
|
@@ -57,7 +58,7 @@ template<bool Cond, typename T> struct conj_expr_if
|
|
|
57
58
|
|
|
58
59
|
template<typename MatrixType, unsigned int UpLo>
|
|
59
60
|
template<typename DerivedU, typename DerivedV>
|
|
60
|
-
SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
|
|
61
|
+
EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
|
|
61
62
|
::rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha)
|
|
62
63
|
{
|
|
63
64
|
typedef internal::blas_traits<DerivedU> UBlasTraits;
|
|
@@ -79,8 +80,8 @@ SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
|
|
|
79
80
|
if (IsRowMajor)
|
|
80
81
|
actualAlpha = numext::conj(actualAlpha);
|
|
81
82
|
|
|
82
|
-
typedef typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ UBlasTraits::NeedToConjugate,_ActualUType>::type>::type UType;
|
|
83
|
-
typedef typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ VBlasTraits::NeedToConjugate,_ActualVType>::type>::type VType;
|
|
83
|
+
typedef typename internal::remove_all<typename internal::conj_expr_if<int(IsRowMajor) ^ int(UBlasTraits::NeedToConjugate), _ActualUType>::type>::type UType;
|
|
84
|
+
typedef typename internal::remove_all<typename internal::conj_expr_if<int(IsRowMajor) ^ int(VBlasTraits::NeedToConjugate), _ActualVType>::type>::type VType;
|
|
84
85
|
internal::selfadjoint_rank2_update_selector<Scalar, Index, UType, VType,
|
|
85
86
|
(IsRowMajor ? int(UpLo==Upper ? Lower : Upper) : UpLo)>
|
|
86
87
|
::run(_expression().const_cast_derived().data(),_expression().outerStride(),UType(actualU),VType(actualV),actualAlpha);
|
|
@@ -155,7 +155,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
|
|
155
155
|
triangularBuffer.diagonal().setOnes();
|
|
156
156
|
|
|
157
157
|
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
|
158
|
-
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
|
158
|
+
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
|
|
159
159
|
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
|
|
160
160
|
|
|
161
161
|
for(Index k2=IsLower ? depth : 0;
|
|
@@ -226,7 +226,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
|
|
226
226
|
for(Index i2=start; i2<end; i2+=mc)
|
|
227
227
|
{
|
|
228
228
|
const Index actual_mc = (std::min)(i2+mc,end)-i2;
|
|
229
|
-
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
|
|
229
|
+
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder,false>()
|
|
230
230
|
(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
|
|
231
231
|
|
|
232
232
|
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc,
|
|
@@ -305,7 +305,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
|
|
305
305
|
triangularBuffer.diagonal().setOnes();
|
|
306
306
|
|
|
307
307
|
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
|
308
|
-
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
|
308
|
+
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
|
|
309
309
|
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
|
|
310
310
|
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
|
|
311
311
|
|
|
@@ -76,7 +76,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
|
|
|
76
76
|
|
|
77
77
|
conj_if<Conjugate> conj;
|
|
78
78
|
gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
|
|
79
|
-
gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
|
|
79
|
+
gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, TriStorageOrder> pack_lhs;
|
|
80
80
|
gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs;
|
|
81
81
|
|
|
82
82
|
// the goal here is to subdivise the Rhs panels such that we keep some cache
|
|
@@ -136,7 +136,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
|
|
|
136
136
|
}
|
|
137
137
|
else
|
|
138
138
|
{
|
|
139
|
-
Scalar b = (other(i,j) *= a);
|
|
139
|
+
Scalar& otherij = other(i,j);
|
|
140
|
+
otherij *= a;
|
|
141
|
+
Scalar b = otherij;
|
|
140
142
|
typename OtherMapper::LinearMapper r = other.getLinearMapper(s,j);
|
|
141
143
|
typename TriMapper::LinearMapper l = tri.getLinearMapper(s,i);
|
|
142
144
|
for (Index i3=0;i3<rs;++i3)
|
|
@@ -229,7 +231,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
|
|
|
229
231
|
gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
|
|
230
232
|
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
|
|
231
233
|
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel;
|
|
232
|
-
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
|
|
234
|
+
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor, false, true> pack_lhs_panel;
|
|
233
235
|
|
|
234
236
|
for(Index k2=IsLower ? size : 0;
|
|
235
237
|
IsLower ? k2>0 : k2<size;
|
|
@@ -58,7 +58,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
|
|
|
58
58
|
{
|
|
59
59
|
// let's directly call the low level product function because:
|
|
60
60
|
// 1 - it is faster to compile
|
|
61
|
-
// 2 - it is
|
|
61
|
+
// 2 - it is slightly faster at runtime
|
|
62
62
|
Index startRow = IsLower ? pi : pi-actualPanelWidth;
|
|
63
63
|
Index startCol = IsLower ? 0 : pi;
|
|
64
64
|
|
|
@@ -77,7 +77,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
|
|
|
77
77
|
if (k>0)
|
|
78
78
|
rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
|
|
79
79
|
|
|
80
|
-
if(!(Mode & UnitDiag))
|
|
80
|
+
if((!(Mode & UnitDiag)) && numext::not_equal_strict(rhs[i],RhsScalar(0)))
|
|
81
81
|
rhs[i] /= cjLhs(i,i);
|
|
82
82
|
}
|
|
83
83
|
}
|
|
@@ -114,20 +114,23 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
|
|
|
114
114
|
for(Index k=0; k<actualPanelWidth; ++k)
|
|
115
115
|
{
|
|
116
116
|
Index i = IsLower ? pi+k : pi-k-1;
|
|
117
|
-
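if(!(Mode & UnitDiag))
|
|
118
|
-
rhs[i] /= cjLhs.coeff(i,i);
|
|
119
|
-
|
|
120
|
-
Index r = actualPanelWidth - k - 1; // remaining size
|
|
121
|
-
Index s = IsLower ? i+1 : i-r;
|
|
122
|
-
if (r>0)
|
|
123
|
-
Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);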
|
|
117
|
+
if(numext::not_equal_strict(rhs[i],RhsScalar(0)))
|
|
118
|
+
{
|
|
119
|
+
if(!(Mode & UnitDiag))
|
|
120
|
+
rhs[i] /= cjLhs.coeff(i,i);
|
|
121
|
+
|
|
122
|
+
Index r = actualPanelWidth - k - 1; // remaining size
|
|
123
|
+
Index s = IsLower ? i+1 : i-r;
|
|
124
|
+
if (r>0)
|
|
125
|
+
Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
|
|
126
|
+
}
|
|
124
127
|
}
|
|
125
128
|
Index r = IsLower ? size - endBlock : startBlock; // remaining size
|
|
126
129
|
if (r > 0)
|
|
127
130
|
{
|
|
128
131
|
// let's directly call the low level product function because:
|
|
129
132
|
// 1 - it is faster to compile
|
|
130
|
-
// 2 - it is
|
|
133
|
+
// 2 - it is slightly faster at runtime
|
|
131
134
|
general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
|
|
132
135
|
r, actualPanelWidth,
|
|
133
136
|
LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride),
|