npm - @smake/eigen - Versions diffs - 1.0.2 → 1.1.1 - Mend

@smake/eigen 1.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (435) hide show

package/README.md +1 -1
package/eigen/Eigen/AccelerateSupport +52 -0
package/eigen/Eigen/Cholesky +18 -21
package/eigen/Eigen/CholmodSupport +28 -28
package/eigen/Eigen/Core +235 -326
package/eigen/Eigen/Eigenvalues +16 -14
package/eigen/Eigen/Geometry +21 -24
package/eigen/Eigen/Householder +9 -8
package/eigen/Eigen/IterativeLinearSolvers +8 -4
package/eigen/Eigen/Jacobi +14 -14
package/eigen/Eigen/KLUSupport +43 -0
package/eigen/Eigen/LU +16 -20
package/eigen/Eigen/MetisSupport +12 -12
package/eigen/Eigen/OrderingMethods +54 -54
package/eigen/Eigen/PaStiXSupport +23 -20
package/eigen/Eigen/PardisoSupport +17 -14
package/eigen/Eigen/QR +18 -21
package/eigen/Eigen/QtAlignedMalloc +5 -13
package/eigen/Eigen/SPQRSupport +21 -14
package/eigen/Eigen/SVD +23 -18
package/eigen/Eigen/Sparse +1 -4
package/eigen/Eigen/SparseCholesky +18 -23
package/eigen/Eigen/SparseCore +18 -17
package/eigen/Eigen/SparseLU +12 -8
package/eigen/Eigen/SparseQR +16 -14
package/eigen/Eigen/StdDeque +5 -2
package/eigen/Eigen/StdList +5 -2
package/eigen/Eigen/StdVector +5 -2
package/eigen/Eigen/SuperLUSupport +30 -24
package/eigen/Eigen/ThreadPool +80 -0
package/eigen/Eigen/UmfPackSupport +19 -17
package/eigen/Eigen/Version +14 -0
package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/Cholesky/LDLT.h +377 -401
package/eigen/Eigen/src/Cholesky/LLT.h +332 -360
package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +620 -521
package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/Core/ArithmeticSequence.h +239 -0
package/eigen/Eigen/src/Core/Array.h +341 -294
package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
package/eigen/Eigen/src/Core/ArrayWrapper.h +127 -171
package/eigen/Eigen/src/Core/Assign.h +30 -40
package/eigen/Eigen/src/Core/AssignEvaluator.h +711 -589
package/eigen/Eigen/src/Core/Assign_MKL.h +130 -125
package/eigen/Eigen/src/Core/BandMatrix.h +268 -283
package/eigen/Eigen/src/Core/Block.h +375 -398
package/eigen/Eigen/src/Core/CommaInitializer.h +86 -97
package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
package/eigen/Eigen/src/Core/CoreEvaluators.h +1356 -1026
package/eigen/Eigen/src/Core/CoreIterators.h +73 -59
package/eigen/Eigen/src/Core/CwiseBinaryOp.h +114 -132
package/eigen/Eigen/src/Core/CwiseNullaryOp.h +726 -617
package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
package/eigen/Eigen/src/Core/CwiseUnaryOp.h +56 -68
package/eigen/Eigen/src/Core/CwiseUnaryView.h +132 -95
package/eigen/Eigen/src/Core/DenseBase.h +632 -571
package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -624
package/eigen/Eigen/src/Core/DenseStorage.h +512 -509
package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
package/eigen/Eigen/src/Core/Diagonal.h +169 -210
package/eigen/Eigen/src/Core/DiagonalMatrix.h +351 -274
package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
package/eigen/Eigen/src/Core/Dot.h +172 -222
package/eigen/Eigen/src/Core/EigenBase.h +75 -85
package/eigen/Eigen/src/Core/Fill.h +138 -0
package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -109
package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
package/eigen/Eigen/src/Core/GeneralProduct.h +327 -263
package/eigen/Eigen/src/Core/GenericPacketMath.h +1472 -360
package/eigen/Eigen/src/Core/GlobalFunctions.h +194 -151
package/eigen/Eigen/src/Core/IO.h +147 -139
package/eigen/Eigen/src/Core/IndexedView.h +321 -0
package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/Core/Inverse.h +56 -66
package/eigen/Eigen/src/Core/Map.h +124 -142
package/eigen/Eigen/src/Core/MapBase.h +256 -281
package/eigen/Eigen/src/Core/MathFunctions.h +1620 -938
package/eigen/Eigen/src/Core/MathFunctionsImpl.h +233 -71
package/eigen/Eigen/src/Core/Matrix.h +491 -416
package/eigen/Eigen/src/Core/MatrixBase.h +468 -453
package/eigen/Eigen/src/Core/NestByValue.h +66 -85
package/eigen/Eigen/src/Core/NoAlias.h +79 -85
package/eigen/Eigen/src/Core/NumTraits.h +235 -148
package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +253 -0
package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
package/eigen/Eigen/src/Core/PlainObjectBase.h +871 -894
package/eigen/Eigen/src/Core/Product.h +260 -139
package/eigen/Eigen/src/Core/ProductEvaluators.h +863 -714
package/eigen/Eigen/src/Core/Random.h +161 -136
package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
package/eigen/Eigen/src/Core/RealView.h +250 -0
package/eigen/Eigen/src/Core/Redux.h +366 -336
package/eigen/Eigen/src/Core/Ref.h +308 -209
package/eigen/Eigen/src/Core/Replicate.h +94 -106
package/eigen/Eigen/src/Core/Reshaped.h +398 -0
package/eigen/Eigen/src/Core/ReturnByValue.h +49 -55
package/eigen/Eigen/src/Core/Reverse.h +136 -145
package/eigen/Eigen/src/Core/Select.h +70 -140
package/eigen/Eigen/src/Core/SelfAdjointView.h +262 -285
package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
package/eigen/Eigen/src/Core/Solve.h +97 -111
package/eigen/Eigen/src/Core/SolveTriangular.h +131 -129
package/eigen/Eigen/src/Core/SolverBase.h +138 -101
package/eigen/Eigen/src/Core/StableNorm.h +156 -160
package/eigen/Eigen/src/Core/StlIterators.h +619 -0
package/eigen/Eigen/src/Core/Stride.h +91 -88
package/eigen/Eigen/src/Core/Swap.h +70 -38
package/eigen/Eigen/src/Core/Transpose.h +295 -273
package/eigen/Eigen/src/Core/Transpositions.h +272 -317
package/eigen/Eigen/src/Core/TriangularMatrix.h +670 -755
package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
package/eigen/Eigen/src/Core/VectorwiseOp.h +668 -630
package/eigen/Eigen/src/Core/Visitor.h +480 -216
package/eigen/Eigen/src/Core/arch/AVX/Complex.h +407 -293
package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +79 -388
package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2935 -491
package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +279 -22
package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +472 -0
package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +85 -333
package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +2490 -649
package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +277 -0
package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +521 -298
package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +39 -280
package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +3686 -0
package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +205 -0
package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +901 -0
package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +3391 -723
package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +866 -0
package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +113 -14
package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +2634 -0
package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +227 -0
package/eigen/Eigen/src/Core/arch/Default/Half.h +1091 -0
package/eigen/Eigen/src/Core/arch/Default/Settings.h +11 -13
package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +104 -0
package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1712 -0
package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +77 -0
package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
package/eigen/Eigen/src/Core/arch/MSA/Complex.h +620 -0
package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +379 -0
package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1237 -0
package/eigen/Eigen/src/Core/arch/NEON/Complex.h +531 -289
package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +243 -0
package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +50 -73
package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +5915 -579
package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1642 -0
package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
package/eigen/Eigen/src/Core/arch/SSE/Complex.h +366 -334
package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +40 -514
package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +2164 -675
package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +188 -35
package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +48 -0
package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +674 -0
package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +52 -0
package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +227 -0
package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +303 -0
package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +576 -0
package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +83 -0
package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +434 -261
package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +160 -53
package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +1073 -605
package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +123 -117
package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +594 -322
package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +204 -118
package/eigen/Eigen/src/Core/functors/StlFunctors.h +110 -97
package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1158 -530
package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2329 -1333
package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +328 -364
package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +191 -178
package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +85 -82
package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +396 -542
package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
package/eigen/Eigen/src/Core/products/Parallelizer.h +208 -92
package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +331 -375
package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +139 -146
package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -46
package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -275
package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +70 -93
package/eigen/Eigen/src/Core/util/Assert.h +158 -0
package/eigen/Eigen/src/Core/util/BlasUtil.h +413 -290
package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +543 -0
package/eigen/Eigen/src/Core/util/Constants.h +314 -263
package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -78
package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +450 -224
package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +487 -0
package/eigen/Eigen/src/Core/util/IntegralConstant.h +279 -0
package/eigen/Eigen/src/Core/util/MKL_support.h +39 -30
package/eigen/Eigen/src/Core/util/Macros.h +939 -646
package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
package/eigen/Eigen/src/Core/util/Memory.h +1042 -650
package/eigen/Eigen/src/Core/util/Meta.h +618 -426
package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
package/eigen/Eigen/src/Core/util/StaticAssert.h +51 -164
package/eigen/Eigen/src/Core/util/SymbolicIndex.h +445 -0
package/eigen/Eigen/src/Core/util/XprHelper.h +793 -538
package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +91 -107
package/eigen/Eigen/src/Eigenvalues/RealQZ.h +539 -606
package/eigen/Eigen/src/Eigenvalues/RealSchur.h +348 -382
package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +579 -600
package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +434 -461
package/eigen/Eigen/src/Geometry/AlignedBox.h +307 -214
package/eigen/Eigen/src/Geometry/AngleAxis.h +135 -137
package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
package/eigen/Eigen/src/Geometry/Homogeneous.h +289 -333
package/eigen/Eigen/src/Geometry/Hyperplane.h +152 -161
package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -145
package/eigen/Eigen/src/Geometry/ParametrizedLine.h +141 -104
package/eigen/Eigen/src/Geometry/Quaternion.h +595 -497
package/eigen/Eigen/src/Geometry/Rotation2D.h +110 -108
package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
package/eigen/Eigen/src/Geometry/Scaling.h +115 -90
package/eigen/Eigen/src/Geometry/Transform.h +896 -953
package/eigen/Eigen/src/Geometry/Translation.h +100 -98
package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +154 -0
package/eigen/Eigen/src/Householder/BlockHouseholder.h +54 -42
package/eigen/Eigen/src/Householder/Householder.h +104 -122
package/eigen/Eigen/src/Householder/HouseholderSequence.h +416 -382
package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +153 -166
package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +127 -138
package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +95 -124
package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +269 -267
package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +246 -259
package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +218 -217
package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +80 -103
package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +59 -63
package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/Jacobi/Jacobi.h +256 -291
package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/KLUSupport/KLUSupport.h +339 -0
package/eigen/Eigen/src/LU/Determinant.h +60 -63
package/eigen/Eigen/src/LU/FullPivLU.h +561 -626
package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/LU/InverseImpl.h +213 -275
package/eigen/Eigen/src/LU/PartialPivLU.h +407 -435
package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
package/eigen/Eigen/src/LU/arch/InverseSize4.h +353 -0
package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
package/eigen/Eigen/src/OrderingMethods/Amd.h +250 -282
package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +950 -1103
package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/OrderingMethods/Ordering.h +111 -122
package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -429
package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +494 -473
package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +223 -137
package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +517 -460
package/eigen/Eigen/src/QR/HouseholderQR.h +412 -278
package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +263 -261
package/eigen/Eigen/src/SVD/BDCSVD.h +872 -679
package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SVD/JacobiSVD.h +585 -543
package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
package/eigen/Eigen/src/SVD/SVDBase.h +281 -160
package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +202 -237
package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +769 -590
package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +318 -129
package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -236
package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +140 -184
package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SparseCore/SparseAssign.h +174 -111
package/eigen/Eigen/src/SparseCore/SparseBlock.h +408 -477
package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +531 -280
package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +559 -347
package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +185 -191
package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1614 -1142
package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -357
package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
package/eigen/Eigen/src/SparseCore/SparseProduct.h +100 -91
package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +371 -414
package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
package/eigen/Eigen/src/SparseCore/SparseUtil.h +146 -115
package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SparseLU/SparseLU.h +814 -618
package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +273 -255
package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +90 -101
package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +125 -133
package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SparseQR/SparseQR.h +451 -490
package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -105
package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
package/eigen/Eigen/src/StlSupport/details.h +48 -50
package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -732
package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +480 -380
package/eigen/Eigen/src/misc/Image.h +41 -43
package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/misc/Kernel.h +39 -41
package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
package/eigen/Eigen/src/misc/blas.h +83 -426
package/eigen/Eigen/src/misc/lapacke.h +9976 -16182
package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
package/eigen/Eigen/src/plugins/BlockMethods.inc +1370 -0
package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.inc +167 -0
package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
package/lib/LibEigen.d.ts +4 -0
package/lib/LibEigen.js +14 -0
package/lib/index.d.ts +1 -1
package/lib/index.js +7 -3
package/package.json +2 -10
package/eigen/Eigen/CMakeLists.txt +0 -19
package/eigen/Eigen/src/Core/BooleanRedux.h +0 -164
package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -103
package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
package/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +0 -91
package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
package/eigen/Eigen/src/misc/lapack.h +0 -152
package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -332
package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -552
package/eigen/Eigen/src/plugins/BlockMethods.h +0 -1058
package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -163
package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -85
package/lib/eigen.d.ts +0 -2
package/lib/eigen.js +0 -15

package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h CHANGED Viewed

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,15 +10,61 @@
 #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
 #define EIGEN_GENERAL_MATRIX_VECTOR_H
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
 namespace Eigen {
 namespace internal {
+enum GEMVPacketSizeType { GEMVPacketFull = 0, GEMVPacketHalf, GEMVPacketQuarter };
+template <int N, typename T1, typename T2, typename T3>
+struct gemv_packet_cond {
+  typedef T3 type;
+};
+template <typename T1, typename T2, typename T3>
+struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> {
+  typedef T1 type;
+};
+template <typename T1, typename T2, typename T3>
+struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> {
+  typedef T2 type;
+};
+template <typename LhsScalar, typename RhsScalar, int PacketSize_ = GEMVPacketFull>
+class gemv_traits {
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+#define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size)                                               \
+  typedef typename gemv_packet_cond<                                                                       \
+      packet_size, typename packet_traits<name##Scalar>::type, typename packet_traits<name##Scalar>::half, \
+      typename unpacket_traits<typename packet_traits<name##Scalar>::half>::half>::type name##Packet##postfix
+  PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
+  PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
+  PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
+#undef PACKET_DECL_COND_POSTFIX
+ public:
+  enum {
+    Vectorizable = unpacket_traits<LhsPacket_>::vectorizable && unpacket_traits<RhsPacket_>::vectorizable &&
+                   int(unpacket_traits<LhsPacket_>::size) == int(unpacket_traits<RhsPacket_>::size),
+    LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
+    ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1
+  };
+  typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
+  typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
+  typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
+};
 /* Optimized col-major matrix * vector product:
- * This algorithm processes 4 columns at onces that allows to both reduce
- * the number of load/stores of the result by a factor 4 and to reduce
- * the instruction dependency. Moreover, we know that all bands have the
- * same alignment pattern.
+ * This algorithm processes the matrix per vertical panels,
+ * which are then processed horizontally per chunk of 8*PacketSize x 1 vertical segments.
  *
  * Mixing type logic: C += alpha * A * B
  *  |  A  |  B  |alpha| comments
@@ -27,302 +73,193 @@ namespace internal {
  *  |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
  *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
  *
- * Accesses to the matrix coefficients follow the following logic:
- *
- * - if all columns have the same alignment then
- *   - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
- *   - otherwise perform unaligned loads only (-> NoneAligned case)
- * - otherwise
- *   - if even columns have the same alignment then
- *     // odd columns are guaranteed to have the same alignment too
- *     - if even or odd columns have the same alignment as the result, then
- *       // for a register size of 2 scalars, this is guarantee to be the case (e.g., SSE with double)
- *       - perform half aligned and half unaligned loads (-> EvenAligned case)
- *     - otherwise perform unaligned loads only (-> NoneAligned case)
- *   - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
- *     - one over 4 consecutive columns is guaranteed to be aligned with the result vector,
- *       perform simple aligned loads for this column and aligned loads plus re-alignment for the other. (-> FirstAligned case)
- *       // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
- *   - otherwise,
- *     // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
- *     // we currently fall back to the NoneAligned case
- *
  * The same reasoning apply for the transposed case.
- *
- * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
- * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
- * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow
- * compared to unaligned loads on a 4 byte boundary.
- *
  */
-template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
-{
+template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper,
+                                     ConjugateRhs, Version> {
+  typedef gemv_traits<LhsScalar, RhsScalar> Traits;
+  typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketHalf> HalfTraits;
+  typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketQuarter> QuarterTraits;
   typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
-enum {
-  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
-              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
-  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
-};
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
-typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
-typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
-typedef typename packet_traits<ResScalar>::type  _ResPacket;
+  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
+  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
+  typedef typename HalfTraits::ResPacket ResPacketHalf;
-typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
-typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
-typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
+  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
+  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
-EIGEN_DONT_INLINE static void run(
-  Index rows, Index cols,
-  const LhsMapper& lhs,
-  const RhsMapper& rhs,
-        ResScalar* res, Index resIncr,
-  RhsScalar alpha);
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,
+                                                      const RhsMapper& rhs, ResScalar* res, Index resIncr,
+                                                      RhsScalar alpha);
 };
-template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
-  Index rows, Index cols,
-  const LhsMapper& lhs,
-  const RhsMapper& rhs,
-        ResScalar* res, Index resIncr,
-  RhsScalar alpha)
-{
+template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void
+general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs,
+                              Version>::run(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
+                                            ResScalar* res, Index resIncr, RhsScalar alpha) {
   EIGEN_UNUSED_VARIABLE(resIncr);
-  eigen_internal_assert(resIncr==1);
-  #ifdef _EIGEN_ACCUMULATE_PACKETS
-  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
-  #endif
-  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
-    pstore(&res[j], \
-      padd(pload<ResPacket>(&res[j]), \
-        padd( \
-      padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j),    ptmp0), \
-      pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j),   ptmp1)),   \
-      padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j),    ptmp2), \
-      pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j),   ptmp3)) )))
-  typedef typename LhsMapper::VectorMapper LhsScalars;
-  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
-  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
-  if(ConjugateRhs)
-    alpha = numext::conj(alpha);
-  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
-  const Index columnsAtOnce = 4;
-  const Index peels = 2;
-  const Index LhsPacketAlignedMask = LhsPacketSize-1;
-  const Index ResPacketAlignedMask = ResPacketSize-1;
-//  const Index PeelAlignedMask = ResPacketSize*peels-1;
-  const Index size = rows;
+  eigen_internal_assert(resIncr == 1);
-  const Index lhsStride = lhs.stride();
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate proper code.
+  LhsMapper lhs(alhs);
-  // How many coeffs of the result do we have to skip to be aligned.
-  // Here we assume data are at least aligned on the base scalar type.
-  Index alignedStart = internal::first_default_aligned(res,size);
-  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
-  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
-  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
-  Index alignmentPattern = alignmentStep==0 ? AllAligned
-                       : alignmentStep==(LhsPacketSize/2) ? EvenAligned
-                       : FirstAligned;
-  // we cannot assume the first element is aligned because of sub-matrices
-  const Index lhsAlignmentOffset = lhs.firstAligned(size);
-  // find how many columns do we have to skip to be aligned with the result (if possible)
-  Index skipColumns = 0;
-  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
-  if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) )
-  {
-    alignedSize = 0;
-    alignedStart = 0;
-    alignmentPattern = NoneAligned;
-  }
-  else if(LhsPacketSize > 4)
-  {
-    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
-    // Currently, it seems to be better to perform unaligned loads anyway
-    alignmentPattern = NoneAligned;
-  }
-  else if (LhsPacketSize>1)
-  {
-  //    eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
-    while (skipColumns<LhsPacketSize &&
-          alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
-      ++skipColumns;
-    if (skipColumns==LhsPacketSize)
-    {
-      // nothing can be aligned, no need to skip any column
-      alignmentPattern = NoneAligned;
-      skipColumns = 0;
+  conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
+  conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
+  conj_helper<LhsPacketHalf, RhsPacketHalf, ConjugateLhs, ConjugateRhs> pcj_half;
+  conj_helper<LhsPacketQuarter, RhsPacketQuarter, ConjugateLhs, ConjugateRhs> pcj_quarter;
+  const Index lhsStride = lhs.stride();
+  // TODO: for padded aligned inputs, we could enable aligned reads
+  enum {
+    LhsAlignment = Unaligned,
+    ResPacketSize = Traits::ResPacketSize,
+    ResPacketSizeHalf = HalfTraits::ResPacketSize,
+    ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
+    LhsPacketSize = Traits::LhsPacketSize,
+    HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
+    HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
+  };
+  const Index n8 = rows - 8 * ResPacketSize + 1;
+  const Index n4 = rows - 4 * ResPacketSize + 1;
+  const Index n3 = rows - 3 * ResPacketSize + 1;
+  const Index n2 = rows - 2 * ResPacketSize + 1;
+  const Index n1 = rows - 1 * ResPacketSize + 1;
+  const Index n_half = rows - 1 * ResPacketSizeHalf + 1;
+  const Index n_quarter = rows - 1 * ResPacketSizeQuarter + 1;
+  // TODO: improve the following heuristic:
+  const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 32000 ? 16 : 4);
+  ResPacket palpha = pset1<ResPacket>(alpha);
+  ResPacketHalf palpha_half = pset1<ResPacketHalf>(alpha);
+  ResPacketQuarter palpha_quarter = pset1<ResPacketQuarter>(alpha);
+  for (Index j2 = 0; j2 < cols; j2 += block_cols) {
+    Index jend = numext::mini(j2 + block_cols, cols);
+    Index i = 0;
+    for (; i < n8; i += ResPacketSize * 8) {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
+                c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0)),
+                c4 = pset1<ResPacket>(ResScalar(0)), c5 = pset1<ResPacket>(ResScalar(0)),
+                c6 = pset1<ResPacket>(ResScalar(0)), c7 = pset1<ResPacket>(ResScalar(0));
+      for (Index j = j2; j < jend; j += 1) {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
+        c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 2, j), b0, c2);
+        c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 3, j), b0, c3);
+        c4 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 4, j), b0, c4);
+        c5 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 5, j), b0, c5);
+        c6 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 6, j), b0, c6);
+        c7 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 7, j), b0, c7);
+      }
+      pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
+      pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
+      pstoreu(res + i + ResPacketSize * 2, pmadd(c2, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 2)));
+      pstoreu(res + i + ResPacketSize * 3, pmadd(c3, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 3)));
+      pstoreu(res + i + ResPacketSize * 4, pmadd(c4, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 4)));
+      pstoreu(res + i + ResPacketSize * 5, pmadd(c5, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 5)));
+      pstoreu(res + i + ResPacketSize * 6, pmadd(c6, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 6)));
+      pstoreu(res + i + ResPacketSize * 7, pmadd(c7, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 7)));
     }
-    else
-    {
-      skipColumns = (std::min)(skipColumns,cols);
-      // note that the skiped columns are processed later.
+    if (i < n4) {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
+                c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0));
+      for (Index j = j2; j < jend; j += 1) {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
+        c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 2, j), b0, c2);
+        c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 3, j), b0, c3);
+      }
+      pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
+      pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
+      pstoreu(res + i + ResPacketSize * 2, pmadd(c2, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 2)));
+      pstoreu(res + i + ResPacketSize * 3, pmadd(c3, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 3)));
+      i += ResPacketSize * 4;
     }
+    if (i < n3) {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
+                c2 = pset1<ResPacket>(ResScalar(0));
+      for (Index j = j2; j < jend; j += 1) {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
+        c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 2, j), b0, c2);
+      }
+      pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
+      pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
+      pstoreu(res + i + ResPacketSize * 2, pmadd(c2, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 2)));
-    /*    eigen_internal_assert(  (alignmentPattern==NoneAligned)
-                      || (skipColumns + columnsAtOnce >= cols)
-                      || LhsPacketSize > size
-                      || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/
-  }
-  else if(Vectorizable)
-  {
-    alignedStart = 0;
-    alignedSize = size;
-    alignmentPattern = AllAligned;
-  }
+      i += ResPacketSize * 3;
+    }
+    if (i < n2) {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0));
-  const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
-  const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;
-  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
-  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
-  {
-    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
-              ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
-              ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
-              ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));
-    // this helps a lot generating better binary code
-    const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0),   lhs1 = lhs.getVectorMapper(0, i+offset1),
-                     lhs2 = lhs.getVectorMapper(0, i+2),   lhs3 = lhs.getVectorMapper(0, i+offset3);
-    if (Vectorizable)
-    {
-      /* explicit vectorization */
-      // process initial unaligned coeffs
-      for (Index j=0; j<alignedStart; ++j)
-      {
-        res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
-        res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
-        res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
-        res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
+      for (Index j = j2; j < jend; j += 1) {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
       }
-      if (alignedSize>alignedStart)
-      {
-        switch(alignmentPattern)
-        {
-          case AllAligned:
-            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
-            break;
-          case EvenAligned:
-            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
-            break;
-          case FirstAligned:
-          {
-            Index j = alignedStart;
-            if(peels>1)
-            {
-              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
-              ResPacket T0, T1;
-              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
-              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
-              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
-              for (; j<peeledSize; j+=peels*ResPacketSize)
-              {
-                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
-                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
-                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);
-                A00 = lhs0.template load<LhsPacket, Aligned>(j);
-                A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
-                T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
-                T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
-                T0  = pcj.pmadd(A01, ptmp1, T0);
-                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
-                T0  = pcj.pmadd(A02, ptmp2, T0);
-                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
-                T0  = pcj.pmadd(A03, ptmp3, T0);
-                pstore(&res[j],T0);
-                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);
-                T1  = pcj.pmadd(A11, ptmp1, T1);
-                T1  = pcj.pmadd(A12, ptmp2, T1);
-                T1  = pcj.pmadd(A13, ptmp3, T1);
-                pstore(&res[j+ResPacketSize],T1);
-              }
-            }
-            for (; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
-            break;
-          }
-          default:
-            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
-            break;
-        }
+      pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
+      pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
+      i += ResPacketSize * 2;
+    }
+    if (i < n1) {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0));
+      for (Index j = j2; j < jend; j += 1) {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
       }
-    } // end explicit vectorization
-    /* process remaining coeffs (or all if there is no explicit vectorization) */
-    for (Index j=alignedSize; j<size; ++j)
-    {
-      res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
-      res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
-      res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
-      res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
+      pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
+      i += ResPacketSize;
     }
-  }
-  // process remaining first and last columns (at most columnsAtOnce-1)
-  Index end = cols;
-  Index start = columnBound;
-  do
-  {
-    for (Index k=start; k<end; ++k)
-    {
-      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
-      const LhsScalars lhs0 = lhs.getVectorMapper(0, k);
-      if (Vectorizable)
-      {
-        /* explicit vectorization */
-        // process first unaligned result's coeffs
-        for (Index j=0; j<alignedStart; ++j)
-          res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
-        // process aligned result's coeffs
-        if (lhs0.template aligned<LhsPacket>(alignedStart))
-          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
-            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
-        else
-          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
-            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
+    if (HasHalf && i < n_half) {
+      ResPacketHalf c0 = pset1<ResPacketHalf>(ResScalar(0));
+      for (Index j = j2; j < jend; j += 1) {
+        RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j, 0));
+        c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf, LhsAlignment>(i + 0, j), b0, c0);
       }
-      // process remaining scalars (or all if no explicit vectorization)
-      for (Index i=alignedSize; i<size; ++i)
-        res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
+      pstoreu(res + i + ResPacketSizeHalf * 0,
+              pmadd(c0, palpha_half, ploadu<ResPacketHalf>(res + i + ResPacketSizeHalf * 0)));
+      i += ResPacketSizeHalf;
+    }
+    if (HasQuarter && i < n_quarter) {
+      ResPacketQuarter c0 = pset1<ResPacketQuarter>(ResScalar(0));
+      for (Index j = j2; j < jend; j += 1) {
+        RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j, 0));
+        c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter, LhsAlignment>(i + 0, j), b0, c0);
+      }
+      pstoreu(res + i + ResPacketSizeQuarter * 0,
+              pmadd(c0, palpha_quarter, ploadu<ResPacketQuarter>(res + i + ResPacketSizeQuarter * 0)));
+      i += ResPacketSizeQuarter;
     }
-    if (skipColumns)
-    {
-      start = 0;
-      end = skipColumns;
-      skipColumns = 0;
+    for (; i < rows; ++i) {
+      ResScalar c0(0);
+      for (Index j = j2; j < jend; j += 1) c0 += cj.pmul(lhs(i, j), rhs(j, 0));
+      res[i] += alpha * c0;
     }
-    else
-      break;
-  } while(Vectorizable);
-  #undef _EIGEN_ACCUMULATE_PACKETS
+  }
 }
 /* Optimized row-major matrix * vector product:
- * This algorithm processes 4 rows at onces that allows to both reduce
+ * This algorithm processes 4 rows at once that allows to both reduce
  * the number of load/stores of the result by a factor 4 and to reduce
  * the instruction dependency. Moreover, we know that all bands have the
  * same alignment pattern.
@@ -331,289 +268,206 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C
  *  - alpha is always a complex (or converted to a complex)
  *  - no vectorization
  */
-template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
-{
-typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
-enum {
-  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
-              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
-  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
-};
+template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper,
+                                     ConjugateRhs, Version> {
+  typedef gemv_traits<LhsScalar, RhsScalar> Traits;
+  typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketHalf> HalfTraits;
+  typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketQuarter> QuarterTraits;
-typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
-typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
-typedef typename packet_traits<ResScalar>::type  _ResPacket;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
-typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
-typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
-typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
-EIGEN_DONT_INLINE static void run(
-  Index rows, Index cols,
-  const LhsMapper& lhs,
-  const RhsMapper& rhs,
-        ResScalar* res, Index resIncr,
-  ResScalar alpha);
-};
+  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
+  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
+  typedef typename HalfTraits::ResPacket ResPacketHalf;
-template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
-  Index rows, Index cols,
-  const LhsMapper& lhs,
-  const RhsMapper& rhs,
-  ResScalar* res, Index resIncr,
-  ResScalar alpha)
-{
-  eigen_internal_assert(rhs.stride()==1);
-  #ifdef _EIGEN_ACCUMULATE_PACKETS
-  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
-  #endif
-  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
-    RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);  \
-    ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
-    ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
-    ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
-    ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }
-  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
-  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
-  typedef typename LhsMapper::VectorMapper LhsScalars;
-  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
-  const Index rowsAtOnce = 4;
-  const Index peels = 2;
-  const Index RhsPacketAlignedMask = RhsPacketSize-1;
-  const Index LhsPacketAlignedMask = LhsPacketSize-1;
-  const Index depth = cols;
-  const Index lhsStride = lhs.stride();
+  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
+  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
+  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
-  // How many coeffs of the result do we have to skip to be aligned.
-  // Here we assume data are at least aligned on the base scalar type
-  // if that's not the case then vectorization is discarded, see below.
-  Index alignedStart = rhs.firstAligned(depth);
-  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
-  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
-  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
-  Index alignmentPattern = alignmentStep==0 ? AllAligned
-                           : alignmentStep==(LhsPacketSize/2) ? EvenAligned
-                           : FirstAligned;
-  // we cannot assume the first element is aligned because of sub-matrices
-  const Index lhsAlignmentOffset = lhs.firstAligned(depth);
-  const Index rhsAlignmentOffset = rhs.firstAligned(rows);
-  // find how many rows do we have to skip to be aligned with rhs (if possible)
-  Index skipRows = 0;
-  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
-  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) ||
-      (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) ||
-      (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) )
-  {
-    alignedSize = 0;
-    alignedStart = 0;
-    alignmentPattern = NoneAligned;
-  }
-  else if(LhsPacketSize > 4)
-  {
-    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
-    alignmentPattern = NoneAligned;
-  }
-  else if (LhsPacketSize>1)
-  {
-  //    eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0  || depth<LhsPacketSize);
-    while (skipRows<LhsPacketSize &&
-           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
-      ++skipRows;
-    if (skipRows==LhsPacketSize)
-    {
-      // nothing can be aligned, no need to skip any column
-      alignmentPattern = NoneAligned;
-      skipRows = 0;
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,
+                                                      const RhsMapper& rhs, ResScalar* res, Index resIncr,
+                                                      ResScalar alpha);
+};
+template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void
+general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs,
+                              Version>::run(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
+                                            ResScalar* res, Index resIncr, ResScalar alpha) {
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate proper code.
+  LhsMapper lhs(alhs);
+  eigen_internal_assert(rhs.stride() == 1);
+  conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
+  conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
+  conj_helper<LhsPacketHalf, RhsPacketHalf, ConjugateLhs, ConjugateRhs> pcj_half;
+  conj_helper<LhsPacketQuarter, RhsPacketQuarter, ConjugateLhs, ConjugateRhs> pcj_quarter;
+  // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
+  //       processing 8 rows at once might be counter productive wrt cache.
+  const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? 0 : rows - 7;
+  const Index n4 = rows - 3;
+  const Index n2 = rows - 1;
+  // TODO: for padded aligned inputs, we could enable aligned reads
+  enum {
+    LhsAlignment = Unaligned,
+    ResPacketSize = Traits::ResPacketSize,
+    ResPacketSizeHalf = HalfTraits::ResPacketSize,
+    ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
+    LhsPacketSize = Traits::LhsPacketSize,
+    LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
+    LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
+    HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
+    HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
+  };
+  using UnsignedIndex = typename make_unsigned<Index>::type;
+  const Index fullColBlockEnd = LhsPacketSize * (UnsignedIndex(cols) / LhsPacketSize);
+  const Index halfColBlockEnd = LhsPacketSizeHalf * (UnsignedIndex(cols) / LhsPacketSizeHalf);
+  const Index quarterColBlockEnd = LhsPacketSizeQuarter * (UnsignedIndex(cols) / LhsPacketSizeQuarter);
+  Index i = 0;
+  for (; i < n8; i += 8) {
+    ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
+              c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0)),
+              c4 = pset1<ResPacket>(ResScalar(0)), c5 = pset1<ResPacket>(ResScalar(0)),
+              c6 = pset1<ResPacket>(ResScalar(0)), c7 = pset1<ResPacket>(ResScalar(0));
+    for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
+      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
+      c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
+      c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 1, j), b0, c1);
+      c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 2, j), b0, c2);
+      c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 3, j), b0, c3);
+      c4 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 4, j), b0, c4);
+      c5 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 5, j), b0, c5);
+      c6 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 6, j), b0, c6);
+      c7 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 7, j), b0, c7);
     }
-    else
-    {
-      skipRows = (std::min)(skipRows,Index(rows));
-      // note that the skiped columns are processed later.
+    ResScalar cc0 = predux(c0);
+    ResScalar cc1 = predux(c1);
+    ResScalar cc2 = predux(c2);
+    ResScalar cc3 = predux(c3);
+    ResScalar cc4 = predux(c4);
+    ResScalar cc5 = predux(c5);
+    ResScalar cc6 = predux(c6);
+    ResScalar cc7 = predux(c7);
+    for (Index j = fullColBlockEnd; j < cols; ++j) {
+      RhsScalar b0 = rhs(j, 0);
+      cc0 += cj.pmul(lhs(i + 0, j), b0);
+      cc1 += cj.pmul(lhs(i + 1, j), b0);
+      cc2 += cj.pmul(lhs(i + 2, j), b0);
+      cc3 += cj.pmul(lhs(i + 3, j), b0);
+      cc4 += cj.pmul(lhs(i + 4, j), b0);
+      cc5 += cj.pmul(lhs(i + 5, j), b0);
+      cc6 += cj.pmul(lhs(i + 6, j), b0);
+      cc7 += cj.pmul(lhs(i + 7, j), b0);
     }
-    /*    eigen_internal_assert(  alignmentPattern==NoneAligned
-                      || LhsPacketSize==1
-                      || (skipRows + rowsAtOnce >= rows)
-                      || LhsPacketSize > depth
-                      || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/
-  }
-  else if(Vectorizable)
-  {
-    alignedStart = 0;
-    alignedSize = depth;
-    alignmentPattern = AllAligned;
+    res[(i + 0) * resIncr] += alpha * cc0;
+    res[(i + 1) * resIncr] += alpha * cc1;
+    res[(i + 2) * resIncr] += alpha * cc2;
+    res[(i + 3) * resIncr] += alpha * cc3;
+    res[(i + 4) * resIncr] += alpha * cc4;
+    res[(i + 5) * resIncr] += alpha * cc5;
+    res[(i + 6) * resIncr] += alpha * cc6;
+    res[(i + 7) * resIncr] += alpha * cc7;
   }
+  for (; i < n4; i += 4) {
+    ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
+              c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0));
-  const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
-  const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;
-  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
-  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
-  {
-    // FIXME: what is the purpose of this EIGEN_ALIGN_DEFAULT ??
-    EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
-    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
-    // this helps the compiler generating good binary code
-    const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0),    lhs1 = lhs.getVectorMapper(i+offset1, 0),
-                     lhs2 = lhs.getVectorMapper(i+2, 0),    lhs3 = lhs.getVectorMapper(i+offset3, 0);
-    if (Vectorizable)
-    {
-      /* explicit vectorization */
-      ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
-                ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));
-      // process initial unaligned coeffs
-      // FIXME this loop get vectorized by the compiler !
-      for (Index j=0; j<alignedStart; ++j)
-      {
-        RhsScalar b = rhs(j, 0);
-        tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
-        tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
-      }
+    for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
+      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
-      if (alignedSize>alignedStart)
-      {
-        switch(alignmentPattern)
-        {
-          case AllAligned:
-            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
-            break;
-          case EvenAligned:
-            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
-            break;
-          case FirstAligned:
-          {
-            Index j = alignedStart;
-            if (peels>1)
-            {
-              /* Here we proccess 4 rows with with two peeled iterations to hide
-               * the overhead of unaligned loads. Moreover unaligned loads are handled
-               * using special shift/move operations between the two aligned packets
-               * overlaping the desired unaligned packet. This is *much* more efficient
-               * than basic unaligned loads.
-               */
-              LhsPacket A01, A02, A03, A11, A12, A13;
-              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
-              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
-              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
-              for (; j<peeledSize; j+=peels*RhsPacketSize)
-              {
-                RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);
-                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
-                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
-                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);
-                ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0);
-                ptmp1 = pcj.pmadd(A01, b, ptmp1);
-                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
-                ptmp2 = pcj.pmadd(A02, b, ptmp2);
-                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
-                ptmp3 = pcj.pmadd(A03, b, ptmp3);
-                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);
-                b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0);
-                ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0);
-                ptmp1 = pcj.pmadd(A11, b, ptmp1);
-                ptmp2 = pcj.pmadd(A12, b, ptmp2);
-                ptmp3 = pcj.pmadd(A13, b, ptmp3);
-              }
-            }
-            for (; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
-            break;
-          }
-          default:
-            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
-            break;
-        }
-        tmp0 += predux(ptmp0);
-        tmp1 += predux(ptmp1);
-        tmp2 += predux(ptmp2);
-        tmp3 += predux(ptmp3);
-      }
-    } // end explicit vectorization
-    // process remaining coeffs (or all if no explicit vectorization)
-    // FIXME this loop get vectorized by the compiler !
-    for (Index j=alignedSize; j<depth; ++j)
-    {
-      RhsScalar b = rhs(j, 0);
-      tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
-      tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
+      c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
+      c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 1, j), b0, c1);
+      c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 2, j), b0, c2);
+      c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 3, j), b0, c3);
+    }
+    ResScalar cc0 = predux(c0);
+    ResScalar cc1 = predux(c1);
+    ResScalar cc2 = predux(c2);
+    ResScalar cc3 = predux(c3);
+    for (Index j = fullColBlockEnd; j < cols; ++j) {
+      RhsScalar b0 = rhs(j, 0);
+      cc0 += cj.pmul(lhs(i + 0, j), b0);
+      cc1 += cj.pmul(lhs(i + 1, j), b0);
+      cc2 += cj.pmul(lhs(i + 2, j), b0);
+      cc3 += cj.pmul(lhs(i + 3, j), b0);
     }
-    res[i*resIncr]            += alpha*tmp0;
-    res[(i+offset1)*resIncr]  += alpha*tmp1;
-    res[(i+2)*resIncr]        += alpha*tmp2;
-    res[(i+offset3)*resIncr]  += alpha*tmp3;
+    res[(i + 0) * resIncr] += alpha * cc0;
+    res[(i + 1) * resIncr] += alpha * cc1;
+    res[(i + 2) * resIncr] += alpha * cc2;
+    res[(i + 3) * resIncr] += alpha * cc3;
   }
+  for (; i < n2; i += 2) {
+    ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0));
-  // process remaining first and last rows (at most columnsAtOnce-1)
-  Index end = rows;
-  Index start = rowBound;
-  do
-  {
-    for (Index i=start; i<end; ++i)
-    {
-      EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
-      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
-      const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
-      // process first unaligned result's coeffs
-      // FIXME this loop get vectorized by the compiler !
-      for (Index j=0; j<alignedStart; ++j)
-        tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
-      if (alignedSize>alignedStart)
-      {
-        // process aligned rhs coeffs
-        if (lhs0.template aligned<LhsPacket>(alignedStart))
-          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
-            ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
-        else
-          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
-            ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
-        tmp0 += predux(ptmp0);
-      }
+    for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
+      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
-      // process remaining scalars
-      // FIXME this loop get vectorized by the compiler !
-      for (Index j=alignedSize; j<depth; ++j)
-        tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
-      res[i*resIncr] += alpha*tmp0;
-    }
-    if (skipRows)
-    {
-      start = 0;
-      end = skipRows;
-      skipRows = 0;
+      c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
+      c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 1, j), b0, c1);
     }
-    else
-      break;
-  } while(Vectorizable);
+    ResScalar cc0 = predux(c0);
+    ResScalar cc1 = predux(c1);
-  #undef _EIGEN_ACCUMULATE_PACKETS
+    for (Index j = fullColBlockEnd; j < cols; ++j) {
+      RhsScalar b0 = rhs(j, 0);
+      cc0 += cj.pmul(lhs(i + 0, j), b0);
+      cc1 += cj.pmul(lhs(i + 1, j), b0);
+    }
+    res[(i + 0) * resIncr] += alpha * cc0;
+    res[(i + 1) * resIncr] += alpha * cc1;
+  }
+  for (; i < rows; ++i) {
+    ResPacket c0 = pset1<ResPacket>(ResScalar(0));
+    ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
+    ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
+    for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
+      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
+      c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i, j), b0, c0);
+    }
+    ResScalar cc0 = predux(c0);
+    if (HasHalf) {
+      for (Index j = fullColBlockEnd; j < halfColBlockEnd; j += LhsPacketSizeHalf) {
+        RhsPacketHalf b0 = rhs.template load<RhsPacketHalf, Unaligned>(j, 0);
+        c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf, LhsAlignment>(i, j), b0, c0_h);
+      }
+      cc0 += predux(c0_h);
+    }
+    if (HasQuarter) {
+      for (Index j = halfColBlockEnd; j < quarterColBlockEnd; j += LhsPacketSizeQuarter) {
+        RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter, Unaligned>(j, 0);
+        c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter, LhsAlignment>(i, j), b0, c0_q);
+      }
+      cc0 += predux(c0_q);
+    }
+    for (Index j = quarterColBlockEnd; j < cols; ++j) {
+      cc0 += cj.pmul(lhs(i, j), rhs(j, 0));
+    }
+    res[i * resIncr] += alpha * cc0;
+  }
 }
-} // end namespace internal
+}  // end namespace internal
-} // end namespace Eigen
+}  // end namespace Eigen
-#endif // EIGEN_GENERAL_MATRIX_VECTOR_H
+#endif  // EIGEN_GENERAL_MATRIX_VECTOR_H