npm - @smake/eigen - Versions diffs - 1.0.2 → 1.1.1 - Mend

@smake/eigen 1.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (435) hide show

package/README.md +1 -1
package/eigen/Eigen/AccelerateSupport +52 -0
package/eigen/Eigen/Cholesky +18 -21
package/eigen/Eigen/CholmodSupport +28 -28
package/eigen/Eigen/Core +235 -326
package/eigen/Eigen/Eigenvalues +16 -14
package/eigen/Eigen/Geometry +21 -24
package/eigen/Eigen/Householder +9 -8
package/eigen/Eigen/IterativeLinearSolvers +8 -4
package/eigen/Eigen/Jacobi +14 -14
package/eigen/Eigen/KLUSupport +43 -0
package/eigen/Eigen/LU +16 -20
package/eigen/Eigen/MetisSupport +12 -12
package/eigen/Eigen/OrderingMethods +54 -54
package/eigen/Eigen/PaStiXSupport +23 -20
package/eigen/Eigen/PardisoSupport +17 -14
package/eigen/Eigen/QR +18 -21
package/eigen/Eigen/QtAlignedMalloc +5 -13
package/eigen/Eigen/SPQRSupport +21 -14
package/eigen/Eigen/SVD +23 -18
package/eigen/Eigen/Sparse +1 -4
package/eigen/Eigen/SparseCholesky +18 -23
package/eigen/Eigen/SparseCore +18 -17
package/eigen/Eigen/SparseLU +12 -8
package/eigen/Eigen/SparseQR +16 -14
package/eigen/Eigen/StdDeque +5 -2
package/eigen/Eigen/StdList +5 -2
package/eigen/Eigen/StdVector +5 -2
package/eigen/Eigen/SuperLUSupport +30 -24
package/eigen/Eigen/ThreadPool +80 -0
package/eigen/Eigen/UmfPackSupport +19 -17
package/eigen/Eigen/Version +14 -0
package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/Cholesky/LDLT.h +377 -401
package/eigen/Eigen/src/Cholesky/LLT.h +332 -360
package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +620 -521
package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/Core/ArithmeticSequence.h +239 -0
package/eigen/Eigen/src/Core/Array.h +341 -294
package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
package/eigen/Eigen/src/Core/ArrayWrapper.h +127 -171
package/eigen/Eigen/src/Core/Assign.h +30 -40
package/eigen/Eigen/src/Core/AssignEvaluator.h +711 -589
package/eigen/Eigen/src/Core/Assign_MKL.h +130 -125
package/eigen/Eigen/src/Core/BandMatrix.h +268 -283
package/eigen/Eigen/src/Core/Block.h +375 -398
package/eigen/Eigen/src/Core/CommaInitializer.h +86 -97
package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
package/eigen/Eigen/src/Core/CoreEvaluators.h +1356 -1026
package/eigen/Eigen/src/Core/CoreIterators.h +73 -59
package/eigen/Eigen/src/Core/CwiseBinaryOp.h +114 -132
package/eigen/Eigen/src/Core/CwiseNullaryOp.h +726 -617
package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
package/eigen/Eigen/src/Core/CwiseUnaryOp.h +56 -68
package/eigen/Eigen/src/Core/CwiseUnaryView.h +132 -95
package/eigen/Eigen/src/Core/DenseBase.h +632 -571
package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -624
package/eigen/Eigen/src/Core/DenseStorage.h +512 -509
package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
package/eigen/Eigen/src/Core/Diagonal.h +169 -210
package/eigen/Eigen/src/Core/DiagonalMatrix.h +351 -274
package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
package/eigen/Eigen/src/Core/Dot.h +172 -222
package/eigen/Eigen/src/Core/EigenBase.h +75 -85
package/eigen/Eigen/src/Core/Fill.h +138 -0
package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -109
package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
package/eigen/Eigen/src/Core/GeneralProduct.h +327 -263
package/eigen/Eigen/src/Core/GenericPacketMath.h +1472 -360
package/eigen/Eigen/src/Core/GlobalFunctions.h +194 -151
package/eigen/Eigen/src/Core/IO.h +147 -139
package/eigen/Eigen/src/Core/IndexedView.h +321 -0
package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/Core/Inverse.h +56 -66
package/eigen/Eigen/src/Core/Map.h +124 -142
package/eigen/Eigen/src/Core/MapBase.h +256 -281
package/eigen/Eigen/src/Core/MathFunctions.h +1620 -938
package/eigen/Eigen/src/Core/MathFunctionsImpl.h +233 -71
package/eigen/Eigen/src/Core/Matrix.h +491 -416
package/eigen/Eigen/src/Core/MatrixBase.h +468 -453
package/eigen/Eigen/src/Core/NestByValue.h +66 -85
package/eigen/Eigen/src/Core/NoAlias.h +79 -85
package/eigen/Eigen/src/Core/NumTraits.h +235 -148
package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +253 -0
package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
package/eigen/Eigen/src/Core/PlainObjectBase.h +871 -894
package/eigen/Eigen/src/Core/Product.h +260 -139
package/eigen/Eigen/src/Core/ProductEvaluators.h +863 -714
package/eigen/Eigen/src/Core/Random.h +161 -136
package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
package/eigen/Eigen/src/Core/RealView.h +250 -0
package/eigen/Eigen/src/Core/Redux.h +366 -336
package/eigen/Eigen/src/Core/Ref.h +308 -209
package/eigen/Eigen/src/Core/Replicate.h +94 -106
package/eigen/Eigen/src/Core/Reshaped.h +398 -0
package/eigen/Eigen/src/Core/ReturnByValue.h +49 -55
package/eigen/Eigen/src/Core/Reverse.h +136 -145
package/eigen/Eigen/src/Core/Select.h +70 -140
package/eigen/Eigen/src/Core/SelfAdjointView.h +262 -285
package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
package/eigen/Eigen/src/Core/Solve.h +97 -111
package/eigen/Eigen/src/Core/SolveTriangular.h +131 -129
package/eigen/Eigen/src/Core/SolverBase.h +138 -101
package/eigen/Eigen/src/Core/StableNorm.h +156 -160
package/eigen/Eigen/src/Core/StlIterators.h +619 -0
package/eigen/Eigen/src/Core/Stride.h +91 -88
package/eigen/Eigen/src/Core/Swap.h +70 -38
package/eigen/Eigen/src/Core/Transpose.h +295 -273
package/eigen/Eigen/src/Core/Transpositions.h +272 -317
package/eigen/Eigen/src/Core/TriangularMatrix.h +670 -755
package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
package/eigen/Eigen/src/Core/VectorwiseOp.h +668 -630
package/eigen/Eigen/src/Core/Visitor.h +480 -216
package/eigen/Eigen/src/Core/arch/AVX/Complex.h +407 -293
package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +79 -388
package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2935 -491
package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +279 -22
package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +472 -0
package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +85 -333
package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +2490 -649
package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +277 -0
package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +521 -298
package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +39 -280
package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +3686 -0
package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +205 -0
package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +901 -0
package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +3391 -723
package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +866 -0
package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +113 -14
package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +2634 -0
package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +227 -0
package/eigen/Eigen/src/Core/arch/Default/Half.h +1091 -0
package/eigen/Eigen/src/Core/arch/Default/Settings.h +11 -13
package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +104 -0
package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1712 -0
package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +77 -0
package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
package/eigen/Eigen/src/Core/arch/MSA/Complex.h +620 -0
package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +379 -0
package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1237 -0
package/eigen/Eigen/src/Core/arch/NEON/Complex.h +531 -289
package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +243 -0
package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +50 -73
package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +5915 -579
package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1642 -0
package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
package/eigen/Eigen/src/Core/arch/SSE/Complex.h +366 -334
package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +40 -514
package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +2164 -675
package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +188 -35
package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +48 -0
package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +674 -0
package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +52 -0
package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +227 -0
package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +303 -0
package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +576 -0
package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +83 -0
package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +434 -261
package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +160 -53
package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +1073 -605
package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +123 -117
package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +594 -322
package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +204 -118
package/eigen/Eigen/src/Core/functors/StlFunctors.h +110 -97
package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1158 -530
package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2329 -1333
package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +328 -364
package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +191 -178
package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +85 -82
package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +396 -542
package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
package/eigen/Eigen/src/Core/products/Parallelizer.h +208 -92
package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +331 -375
package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +139 -146
package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -46
package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -275
package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +70 -93
package/eigen/Eigen/src/Core/util/Assert.h +158 -0
package/eigen/Eigen/src/Core/util/BlasUtil.h +413 -290
package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +543 -0
package/eigen/Eigen/src/Core/util/Constants.h +314 -263
package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -78
package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +450 -224
package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +487 -0
package/eigen/Eigen/src/Core/util/IntegralConstant.h +279 -0
package/eigen/Eigen/src/Core/util/MKL_support.h +39 -30
package/eigen/Eigen/src/Core/util/Macros.h +939 -646
package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
package/eigen/Eigen/src/Core/util/Memory.h +1042 -650
package/eigen/Eigen/src/Core/util/Meta.h +618 -426
package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
package/eigen/Eigen/src/Core/util/StaticAssert.h +51 -164
package/eigen/Eigen/src/Core/util/SymbolicIndex.h +445 -0
package/eigen/Eigen/src/Core/util/XprHelper.h +793 -538
package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +91 -107
package/eigen/Eigen/src/Eigenvalues/RealQZ.h +539 -606
package/eigen/Eigen/src/Eigenvalues/RealSchur.h +348 -382
package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +579 -600
package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +434 -461
package/eigen/Eigen/src/Geometry/AlignedBox.h +307 -214
package/eigen/Eigen/src/Geometry/AngleAxis.h +135 -137
package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
package/eigen/Eigen/src/Geometry/Homogeneous.h +289 -333
package/eigen/Eigen/src/Geometry/Hyperplane.h +152 -161
package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -145
package/eigen/Eigen/src/Geometry/ParametrizedLine.h +141 -104
package/eigen/Eigen/src/Geometry/Quaternion.h +595 -497
package/eigen/Eigen/src/Geometry/Rotation2D.h +110 -108
package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
package/eigen/Eigen/src/Geometry/Scaling.h +115 -90
package/eigen/Eigen/src/Geometry/Transform.h +896 -953
package/eigen/Eigen/src/Geometry/Translation.h +100 -98
package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +154 -0
package/eigen/Eigen/src/Householder/BlockHouseholder.h +54 -42
package/eigen/Eigen/src/Householder/Householder.h +104 -122
package/eigen/Eigen/src/Householder/HouseholderSequence.h +416 -382
package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +153 -166
package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +127 -138
package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +95 -124
package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +269 -267
package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +246 -259
package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +218 -217
package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +80 -103
package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +59 -63
package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/Jacobi/Jacobi.h +256 -291
package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/KLUSupport/KLUSupport.h +339 -0
package/eigen/Eigen/src/LU/Determinant.h +60 -63
package/eigen/Eigen/src/LU/FullPivLU.h +561 -626
package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/LU/InverseImpl.h +213 -275
package/eigen/Eigen/src/LU/PartialPivLU.h +407 -435
package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
package/eigen/Eigen/src/LU/arch/InverseSize4.h +353 -0
package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
package/eigen/Eigen/src/OrderingMethods/Amd.h +250 -282
package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +950 -1103
package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/OrderingMethods/Ordering.h +111 -122
package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -429
package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +494 -473
package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +223 -137
package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +517 -460
package/eigen/Eigen/src/QR/HouseholderQR.h +412 -278
package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +263 -261
package/eigen/Eigen/src/SVD/BDCSVD.h +872 -679
package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SVD/JacobiSVD.h +585 -543
package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
package/eigen/Eigen/src/SVD/SVDBase.h +281 -160
package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +202 -237
package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +769 -590
package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +318 -129
package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -236
package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +140 -184
package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SparseCore/SparseAssign.h +174 -111
package/eigen/Eigen/src/SparseCore/SparseBlock.h +408 -477
package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +531 -280
package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +559 -347
package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +185 -191
package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1614 -1142
package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -357
package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
package/eigen/Eigen/src/SparseCore/SparseProduct.h +100 -91
package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +371 -414
package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
package/eigen/Eigen/src/SparseCore/SparseUtil.h +146 -115
package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SparseLU/SparseLU.h +814 -618
package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +273 -255
package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +90 -101
package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +125 -133
package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SparseQR/SparseQR.h +451 -490
package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -105
package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
package/eigen/Eigen/src/StlSupport/details.h +48 -50
package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -732
package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +480 -380
package/eigen/Eigen/src/misc/Image.h +41 -43
package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/misc/Kernel.h +39 -41
package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
package/eigen/Eigen/src/misc/blas.h +83 -426
package/eigen/Eigen/src/misc/lapacke.h +9976 -16182
package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
package/eigen/Eigen/src/plugins/BlockMethods.inc +1370 -0
package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.inc +167 -0
package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
package/lib/LibEigen.d.ts +4 -0
package/lib/LibEigen.js +14 -0
package/lib/index.d.ts +1 -1
package/lib/index.js +7 -3
package/package.json +2 -10
package/eigen/Eigen/CMakeLists.txt +0 -19
package/eigen/Eigen/src/Core/BooleanRedux.h +0 -164
package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -103
package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
package/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +0 -91
package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
package/eigen/Eigen/src/misc/lapack.h +0 -152
package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -332
package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -552
package/eigen/Eigen/src/plugins/BlockMethods.h +0 -1058
package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -163
package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -85
package/lib/eigen.d.ts +0 -2
package/lib/eigen.js +0 -15

package/eigen/Eigen/src/Core/ProductEvaluators.h CHANGED Viewed

@@ -9,155 +9,138 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #ifndef EIGEN_PRODUCTEVALUATORS_H
 #define EIGEN_PRODUCTEVALUATORS_H
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 namespace Eigen {
 namespace internal {
 /** \internal
-  * Evaluator of a product expression.
-  * Since products require special treatments to handle all possible cases,
-  * we simply deffer the evaluation logic to a product_evaluator class
-  * which offers more partial specialization possibilities.
-  *
-  * \sa class product_evaluator
-  */
-template<typename Lhs, typename Rhs, int Options>
-struct evaluator<Product<Lhs, Rhs, Options> >
- : public product_evaluator<Product<Lhs, Rhs, Options> >
-{
+ * Evaluator of a product expression.
+ * Since products require special treatments to handle all possible cases,
+ * we simply defer the evaluation logic to a product_evaluator class
+ * which offers more partial specialization possibilities.
+ *
+ * \sa class product_evaluator
+ */
+template <typename Lhs, typename Rhs, int Options>
+struct evaluator<Product<Lhs, Rhs, Options>> : public product_evaluator<Product<Lhs, Rhs, Options>> {
   typedef Product<Lhs, Rhs, Options> XprType;
   typedef product_evaluator<XprType> Base;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
 // Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"
 // TODO we should apply that rule only if that's really helpful
-template<typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
-struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+template <typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
+struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_product_op<Scalar1, Scalar2>,
                                                const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
-                                               const Product<Lhs, Rhs, DefaultProduct> > >
-{
+                                               const Product<Lhs, Rhs, DefaultProduct>>> {
   static const bool value = true;
 };
-template<typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
-struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+template <typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
+struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1, Scalar2>,
                                const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
-                               const Product<Lhs, Rhs, DefaultProduct> > >
- : public evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> >
-{
-  typedef CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
-                               const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
-                               const Product<Lhs, Rhs, DefaultProduct> > XprType;
-  typedef evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> > Base;
+                               const Product<Lhs, Rhs, DefaultProduct>>>
+    : public evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1, Lhs, product), Rhs, DefaultProduct>> {
+  typedef CwiseBinaryOp<internal::scalar_product_op<Scalar1, Scalar2>,
+                        const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
+                        const Product<Lhs, Rhs, DefaultProduct>>
+      XprType;
+  typedef evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1, Lhs, product), Rhs, DefaultProduct>> Base;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
-    : Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs())
-  {}
+      : Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs()) {}
 };
-template<typename Lhs, typename Rhs, int DiagIndex>
-struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> >
- : public evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> >
-{
+template <typename Lhs, typename Rhs, int DiagIndex>
+struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex>>
+    : public evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>> {
   typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
-  typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base;
+  typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>> Base;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
-    : Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
-        Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
-        xpr.index() ))
-  {}
+      : Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
+            Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()), xpr.index())) {}
 };
 // Helper class to perform a matrix product with the destination at hand.
 // Depending on the sizes of the factors, there are different evaluation strategies
 // as controlled by internal::product_type.
-template< typename Lhs, typename Rhs,
-          typename LhsShape = typename evaluator_traits<Lhs>::Shape,
+template <typename Lhs, typename Rhs, typename LhsShape = typename evaluator_traits<Lhs>::Shape,
           typename RhsShape = typename evaluator_traits<Rhs>::Shape,
-          int ProductType = internal::product_type<Lhs,Rhs>::value>
+          int ProductType = internal::product_type<Lhs, Rhs>::value>
 struct generic_product_impl;
-template<typename Lhs, typename Rhs>
-struct evaluator_assume_aliasing<Product<Lhs, Rhs, DefaultProduct> > {
+template <typename Lhs, typename Rhs>
+struct evaluator_assume_aliasing<Product<Lhs, Rhs, DefaultProduct>> {
   static const bool value = true;
 };
 // This is the default evaluator implementation for products:
 // It creates a temporary and call generic_product_impl
-template<typename Lhs, typename Rhs, int Options, int ProductTag, typename LhsShape, typename RhsShape>
+template <typename Lhs, typename Rhs, int Options, int ProductTag, typename LhsShape, typename RhsShape>
 struct product_evaluator<Product<Lhs, Rhs, Options>, ProductTag, LhsShape, RhsShape>
-  : public evaluator<typename Product<Lhs, Rhs, Options>::PlainObject>
-{
+    : public evaluator<typename Product<Lhs, Rhs, Options>::PlainObject> {
   typedef Product<Lhs, Rhs, Options> XprType;
   typedef typename XprType::PlainObject PlainObject;
   typedef evaluator<PlainObject> Base;
-  enum {
-    Flags = Base::Flags | EvalBeforeNestingBit
-  };
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit product_evaluator(const XprType& xpr)
-    : m_result(xpr.rows(), xpr.cols())
-  {
-    ::new (static_cast<Base*>(this)) Base(m_result);
-// FIXME shall we handle nested_eval here?,
-// if so, then we must take care at removing the call to nested_eval in the specializations (e.g., in permutation_matrix_product, transposition_matrix_product, etc.)
-//     typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
-//     typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
-//     typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
-//     typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
-//
-//     const LhsNested lhs(xpr.lhs());
-//     const RhsNested rhs(xpr.rhs());
-//
-//     generic_product_impl<LhsNestedCleaned, RhsNestedCleaned>::evalTo(m_result, lhs, rhs);
+  enum { Flags = Base::Flags | EvalBeforeNestingBit };
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit product_evaluator(const XprType& xpr)
+      : m_result(xpr.rows(), xpr.cols()) {
+    internal::construct_at<Base>(this, m_result);
+    // FIXME shall we handle nested_eval here?,
+    // if so, then we must take care at removing the call to nested_eval in the specializations (e.g., in
+    // permutation_matrix_product, transposition_matrix_product, etc.)
+    //     typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
+    //     typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
+    //     typedef internal::remove_all_t<LhsNested> LhsNestedCleaned;
+    //     typedef internal::remove_all_t<RhsNested> RhsNestedCleaned;
+    //
+    //     const LhsNested lhs(xpr.lhs());
+    //     const RhsNested rhs(xpr.rhs());
+    //
+    //     generic_product_impl<LhsNestedCleaned, RhsNestedCleaned>::evalTo(m_result, lhs, rhs);
     generic_product_impl<Lhs, Rhs, LhsShape, RhsShape, ProductTag>::evalTo(m_result, xpr.lhs(), xpr.rhs());
   }
-protected:
+ protected:
   PlainObject m_result;
 };
-// The following three shortcuts are enabled only if the scalar types match excatly.
+// The following three shortcuts are enabled only if the scalar types match exactly.
 // TODO: we could enable them for different scalar types when the product is not vectorized.
 // Dense = Product
-template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scalar,Scalar>, Dense2Dense,
-  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
-{
-  typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static EIGEN_STRONG_INLINE
-  void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
-  {
+template <typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs, Rhs, Options>, internal::assign_op<Scalar, Scalar>, Dense2Dense,
+                  std::enable_if_t<(Options == DefaultProduct || Options == AliasFreeProduct)>> {
+  typedef Product<Lhs, Rhs, Options> SrcXprType;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src,
+                                                        const internal::assign_op<Scalar, Scalar>&) {
     Index dstRows = src.rows();
     Index dstCols = src.cols();
-    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
-      dst.resize(dstRows, dstCols);
+    if ((dst.rows() != dstRows) || (dst.cols() != dstCols)) dst.resize(dstRows, dstCols);
     // FIXME shall we handle nested_eval here?
     generic_product_impl<Lhs, Rhs>::evalTo(dst, src.lhs(), src.rhs());
   }
 };
 // Dense += Product
-template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<Scalar,Scalar>, Dense2Dense,
-  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
-{
-  typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static EIGEN_STRONG_INLINE
-  void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,Scalar> &)
-  {
+template <typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs, Rhs, Options>, internal::add_assign_op<Scalar, Scalar>, Dense2Dense,
+                  std::enable_if_t<(Options == DefaultProduct || Options == AliasFreeProduct)>> {
+  typedef Product<Lhs, Rhs, Options> SrcXprType;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src,
+                                                        const internal::add_assign_op<Scalar, Scalar>&) {
     eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
     // FIXME shall we handle nested_eval here?
     generic_product_impl<Lhs, Rhs>::addTo(dst, src.lhs(), src.rhs());
@@ -165,35 +148,35 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<
 };
 // Dense -= Product
-template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<Scalar,Scalar>, Dense2Dense,
-  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
-{
-  typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static EIGEN_STRONG_INLINE
-  void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,Scalar> &)
-  {
+template <typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs, Rhs, Options>, internal::sub_assign_op<Scalar, Scalar>, Dense2Dense,
+                  std::enable_if_t<(Options == DefaultProduct || Options == AliasFreeProduct)>> {
+  typedef Product<Lhs, Rhs, Options> SrcXprType;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src,
+                                                        const internal::sub_assign_op<Scalar, Scalar>&) {
     eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
     // FIXME shall we handle nested_eval here?
     generic_product_impl<Lhs, Rhs>::subTo(dst, src.lhs(), src.rhs());
   }
 };
 // Dense ?= scalar * Product
 // TODO we should apply that rule if that's really helpful
 // for instance, this is not good for inner products
-template< typename DstXprType, typename Lhs, typename Rhs, typename AssignFunc, typename Scalar, typename ScalarBis, typename Plain>
-struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>, const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
-                                           const Product<Lhs,Rhs,DefaultProduct> >, AssignFunc, Dense2Dense>
-{
-  typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>,
-                        const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
-                        const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
-  static EIGEN_STRONG_INLINE
-  void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
-  {
-    call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func);
+template <typename DstXprType, typename Lhs, typename Rhs, typename AssignFunc, typename Scalar, typename ScalarBis,
+          typename Plain>
+struct Assignment<DstXprType,
+                  CwiseBinaryOp<internal::scalar_product_op<ScalarBis, Scalar>,
+                                const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>, Plain>,
+                                const Product<Lhs, Rhs, DefaultProduct>>,
+                  AssignFunc, Dense2Dense> {
+  typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis, Scalar>,
+                        const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>, Plain>,
+                        const Product<Lhs, Rhs, DefaultProduct>>
+      SrcXprType;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src,
+                                                        const AssignFunc& func) {
+    call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs()) * src.rhs().rhs(), func);
   }
 };
@@ -201,251 +184,291 @@ struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_product_op<ScalarBi
 // Catch "Dense ?= xpr + Product<>" expression to save one temporary
 // FIXME we could probably enable these rules for any product, i.e., not only Dense and DefaultProduct
-template<typename OtherXpr, typename Lhs, typename Rhs>
-struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_sum_op<typename OtherXpr::Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, const OtherXpr,
-                                               const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > {
+template <typename OtherXpr, typename Lhs, typename Rhs>
+struct evaluator_assume_aliasing<
+    CwiseBinaryOp<
+        internal::scalar_sum_op<typename OtherXpr::Scalar, typename Product<Lhs, Rhs, DefaultProduct>::Scalar>,
+        const OtherXpr, const Product<Lhs, Rhs, DefaultProduct>>,
+    DenseShape> {
   static const bool value = true;
 };
-template<typename OtherXpr, typename Lhs, typename Rhs>
-struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_difference_op<typename OtherXpr::Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, const OtherXpr,
-                                               const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > {
+template <typename OtherXpr, typename Lhs, typename Rhs>
+struct evaluator_assume_aliasing<
+    CwiseBinaryOp<
+        internal::scalar_difference_op<typename OtherXpr::Scalar, typename Product<Lhs, Rhs, DefaultProduct>::Scalar>,
+        const OtherXpr, const Product<Lhs, Rhs, DefaultProduct>>,
+    DenseShape> {
   static const bool value = true;
 };
-template<typename DstXprType, typename OtherXpr, typename ProductType, typename Func1, typename Func2>
-struct assignment_from_xpr_op_product
-{
-  template<typename SrcXprType, typename InitialFunc>
-  static EIGEN_STRONG_INLINE
-  void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/)
-  {
+template <typename DstXprType, typename OtherXpr, typename ProductType, typename Func1, typename Func2>
+struct assignment_from_xpr_op_product {
+  template <typename SrcXprType, typename InitialFunc>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src,
+                                                        const InitialFunc& /*func*/) {
     call_assignment_no_alias(dst, src.lhs(), Func1());
     call_assignment_no_alias(dst, src.rhs(), Func2());
   }
 };
-#define EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(ASSIGN_OP,BINOP,ASSIGN_OP2) \
-  template< typename DstXprType, typename OtherXpr, typename Lhs, typename Rhs, typename DstScalar, typename SrcScalar, typename OtherScalar,typename ProdScalar> \
-  struct Assignment<DstXprType, CwiseBinaryOp<internal::BINOP<OtherScalar,ProdScalar>, const OtherXpr, \
-                                            const Product<Lhs,Rhs,DefaultProduct> >, internal::ASSIGN_OP<DstScalar,SrcScalar>, Dense2Dense> \
-    : assignment_from_xpr_op_product<DstXprType, OtherXpr, Product<Lhs,Rhs,DefaultProduct>, internal::ASSIGN_OP<DstScalar,OtherScalar>, internal::ASSIGN_OP2<DstScalar,ProdScalar> > \
-  {}
+#define EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(ASSIGN_OP, BINOP, ASSIGN_OP2)                             \
+  template <typename DstXprType, typename OtherXpr, typename Lhs, typename Rhs, typename DstScalar, \
+            typename SrcScalar, typename OtherScalar, typename ProdScalar>                          \
+  struct Assignment<DstXprType,                                                                     \
+                    CwiseBinaryOp<internal::BINOP<OtherScalar, ProdScalar>, const OtherXpr,         \
+                                  const Product<Lhs, Rhs, DefaultProduct>>,                         \
+                    internal::ASSIGN_OP<DstScalar, SrcScalar>, Dense2Dense>                         \
+      : assignment_from_xpr_op_product<DstXprType, OtherXpr, Product<Lhs, Rhs, DefaultProduct>,     \
+                                       internal::ASSIGN_OP<DstScalar, OtherScalar>,                 \
+                                       internal::ASSIGN_OP2<DstScalar, ProdScalar>> {}
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(assign_op, scalar_sum_op, add_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(add_assign_op, scalar_sum_op, add_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(sub_assign_op, scalar_sum_op, sub_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(assign_op, scalar_difference_op, sub_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(add_assign_op, scalar_difference_op, sub_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(sub_assign_op, scalar_difference_op, add_assign_op);
-EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(assign_op,    scalar_sum_op,add_assign_op);
-EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(add_assign_op,scalar_sum_op,add_assign_op);
-EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(sub_assign_op,scalar_sum_op,sub_assign_op);
+//----------------------------------------
-EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(assign_op,    scalar_difference_op,sub_assign_op);
-EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(add_assign_op,scalar_difference_op,sub_assign_op);
-EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(sub_assign_op,scalar_difference_op,add_assign_op);
+template <typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, InnerProduct> {
+  using impl = default_inner_product_impl<Lhs, Rhs, false>;
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+    dst.coeffRef(0, 0) = impl::run(lhs, rhs);
+  }
-//----------------------------------------
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+    dst.coeffRef(0, 0) += impl::run(lhs, rhs);
+  }
-template<typename Lhs, typename Rhs>
-struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
-{
-  template<typename Dst>
-  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  {
-    dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
-  }
-  template<typename Dst>
-  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  {
-    dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum();
-  }
-  template<typename Dst>
-  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); }
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+    dst.coeffRef(0, 0) -= impl::run(lhs, rhs);
+  }
 };
 /***********************************************************************
-*  Implementation of outer dense * dense vector product
-***********************************************************************/
+ *  Implementation of outer dense * dense vector product
+ ***********************************************************************/
 // Column major result
-template<typename Dst, typename Lhs, typename Rhs, typename Func>
-void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
-{
+template <typename Dst, typename Lhs, typename Rhs, typename Func>
+void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func& func,
+                                                  const false_type&) {
   evaluator<Rhs> rhsEval(rhs);
-  typename nested_eval<Lhs,Rhs::SizeAtCompileTime>::type actual_lhs(lhs);
+  ei_declare_local_nested_eval(Lhs, lhs, Rhs::SizeAtCompileTime, actual_lhs);
   // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored
   // FIXME not very good if rhs is real and lhs complex while alpha is real too
   const Index cols = dst.cols();
-  for (Index j=0; j<cols; ++j)
-    func(dst.col(j), rhsEval.coeff(Index(0),j) * actual_lhs);
+  for (Index j = 0; j < cols; ++j) func(dst.col(j), rhsEval.coeff(Index(0), j) * actual_lhs);
 }
 // Row major result
-template<typename Dst, typename Lhs, typename Rhs, typename Func>
-void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
-{
+template <typename Dst, typename Lhs, typename Rhs, typename Func>
+void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func& func,
+                                                  const true_type&) {
   evaluator<Lhs> lhsEval(lhs);
-  typename nested_eval<Rhs,Lhs::SizeAtCompileTime>::type actual_rhs(rhs);
+  ei_declare_local_nested_eval(Rhs, rhs, Lhs::SizeAtCompileTime, actual_rhs);
   // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored
   // FIXME not very good if lhs is real and rhs complex while alpha is real too
   const Index rows = dst.rows();
-  for (Index i=0; i<rows; ++i)
-    func(dst.row(i), lhsEval.coeff(i,Index(0)) * actual_rhs);
+  for (Index i = 0; i < rows; ++i) func(dst.row(i), lhsEval.coeff(i, Index(0)) * actual_rhs);
 }
-template<typename Lhs, typename Rhs>
-struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
-{
-  template<typename T> struct is_row_major : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
-  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+template <typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, OuterProduct> {
+  template <typename T>
+  struct is_row_major : bool_constant<(int(T::Flags) & RowMajorBit)> {};
+  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
   // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
-  struct set  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived()  = src; } };
-  struct add  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
-  struct sub  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
+  struct set {
+    template <typename Dst, typename Src>
+    EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const {
+      dst.const_cast_derived() = src;
+    }
+  };
+  struct add {
+    /** Add to dst. */
+    template <typename Dst, typename Src>
+    EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const {
+      dst.const_cast_derived() += src;
+    }
+  };
+  struct sub {
+    template <typename Dst, typename Src>
+    EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const {
+      dst.const_cast_derived() -= src;
+    }
+  };
+  /** Scaled add. */
   struct adds {
     Scalar m_scale;
+    /** Constructor */
     explicit adds(const Scalar& s) : m_scale(s) {}
-    template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const {
+    /** Scaled add to dst. */
+    template <typename Dst, typename Src>
+    void EIGEN_DEVICE_FUNC operator()(const Dst& dst, const Src& src) const {
       dst.const_cast_derived() += m_scale * src;
     }
   };
-  template<typename Dst>
-  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  {
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
     internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
   }
-  template<typename Dst>
-  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  {
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
     internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
   }
-  template<typename Dst>
-  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  {
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
     internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
   }
-  template<typename Dst>
-  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
-  {
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs,
+                                                                  const Scalar& alpha) {
     internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
   }
 };
 // This base class provides default implementations for evalTo, addTo, subTo, in terms of scaleAndAddTo
-template<typename Lhs, typename Rhs, typename Derived>
-struct generic_product_impl_base
-{
-  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  template<typename Dst>
-  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  { dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); }
-  template<typename Dst>
-  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  { scaleAndAddTo(dst,lhs, rhs, Scalar(1)); }
-  template<typename Dst>
-  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  { scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); }
-  template<typename Dst>
-  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
-  { Derived::scaleAndAddTo(dst,lhs,rhs,alpha); }
+template <typename Lhs, typename Rhs, typename Derived>
+struct generic_product_impl_base {
+  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+    dst.setZero();
+    scaleAndAddTo(dst, lhs, rhs, Scalar(1));
+  }
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+    scaleAndAddTo(dst, lhs, rhs, Scalar(1));
+  }
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+    scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
+  }
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs,
+                                                                  const Scalar& alpha) {
+    Derived::scaleAndAddTo(dst, lhs, rhs, alpha);
+  }
 };
-template<typename Lhs, typename Rhs>
-struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
-  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct> >
-{
-  typedef typename nested_eval<Lhs,1>::type LhsNested;
-  typedef typename nested_eval<Rhs,1>::type RhsNested;
-  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+template <typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, GemvProduct>
+    : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, GemvProduct>> {
+  typedef typename nested_eval<Lhs, 1>::type LhsNested;
+  typedef typename nested_eval<Rhs, 1>::type RhsNested;
+  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
   enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight };
-  typedef typename internal::remove_all<typename internal::conditional<int(Side)==OnTheRight,LhsNested,RhsNested>::type>::type MatrixType;
-  template<typename Dest>
-  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
-  {
+  typedef internal::remove_all_t<std::conditional_t<int(Side) == OnTheRight, LhsNested, RhsNested>> MatrixType;
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs,
+                                                                  const Scalar& alpha) {
+    // Fallback to inner product if both the lhs and rhs is a runtime vector.
+    if (lhs.rows() == 1 && rhs.cols() == 1) {
+      dst.coeffRef(0, 0) += alpha * lhs.row(0).conjugate().dot(rhs.col(0));
+      return;
+    }
     LhsNested actual_lhs(lhs);
     RhsNested actual_rhs(rhs);
-    internal::gemv_dense_selector<Side,
-                            (int(MatrixType::Flags)&RowMajorBit) ? RowMajor : ColMajor,
-                            bool(internal::blas_traits<MatrixType>::HasUsableDirectAccess)
-                           >::run(actual_lhs, actual_rhs, dst, alpha);
+    internal::gemv_dense_selector<Side, (int(MatrixType::Flags) & RowMajorBit) ? RowMajor : ColMajor,
+                                  bool(internal::blas_traits<MatrixType>::HasUsableDirectAccess)>::run(actual_lhs,
+                                                                                                       actual_rhs, dst,
+                                                                                                       alpha);
   }
 };
-template<typename Lhs, typename Rhs>
-struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
-{
-  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  template<typename Dst>
-  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  {
+template <typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, CoeffBasedProductMode> {
+  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
     // Same as: dst.noalias() = lhs.lazyProduct(rhs);
     // but easier on the compiler side
-    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar, Scalar>());
   }
-  template<typename Dst>
-  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  {
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
     // dst.noalias() += lhs.lazyProduct(rhs);
-    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>());
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar, Scalar>());
   }
-  template<typename Dst>
-  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-  {
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
     // dst.noalias() -= lhs.lazyProduct(rhs);
-    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
-  }
-  // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
-  //    dst {,+,-}= s * (A.lazyProduct(B))
-  // This is a huge benefit for heap-allocated matrix types as it save one costly allocation.
-  // For them, this strategy is also faster than simply by-passing the heap allocation through
-  // stack allocation.
-  // For fixed sizes matrices, this is less obvious, it is sometimes x2 faster, but sometimes x3 slower,
-  // and the behavior depends also a lot on the compiler... so let's be conservative and enable them for dynamic-size only,
-  // that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
-  template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
-                                           const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
-  {
-    call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
-  }
-  // Here, we we always have LhsT==Lhs, but we need to make it a template type to make the above
-  // overload more specialized.
-  template<typename Dst, typename LhsT, typename Func>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
-  {
-    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
-  }
-//   template<typename Dst>
-//   static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
-//   { dst.noalias() += alpha * lhs.lazyProduct(rhs); }
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar, Scalar>());
+  }
+  // This is a special evaluation path called from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
+  // This variant tries to extract scalar multiples from both the LHS and RHS and factor them out. For instance:
+  //   dst {,+,-}= (s1*A)*(B*s2)
+  // will be rewritten as:
+  //   dst {,+,-}= (s1*s2) * (A.lazyProduct(B))
+  // There are at least four benefits of doing so:
+  //  1 - huge performance gain for heap-allocated matrix types as it save costly allocations.
+  //  2 - it is faster than simply by-passing the heap allocation through stack allocation.
+  //  3 - it makes this fallback consistent with the heavy GEMM routine.
+  //  4 - it fully by-passes huge stack allocation attempts when multiplying huge fixed-size matrices.
+  //      (see https://stackoverflow.com/questions/54738495)
+  // For small fixed sizes matrices, however, the gains are less obvious, it is sometimes x2 faster, but sometimes x3
+  // slower, and the behavior depends also a lot on the compiler... This is why this re-writing strategy is currently
+  // enabled only when falling back from the main GEMM.
+  template <typename Dst, typename Func>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void eval_dynamic(Dst& dst, const Lhs& lhs, const Rhs& rhs,
+                                                                 const Func& func) {
+    enum {
+      HasScalarFactor = blas_traits<Lhs>::HasScalarFactor || blas_traits<Rhs>::HasScalarFactor,
+      ConjLhs = blas_traits<Lhs>::NeedToConjugate,
+      ConjRhs = blas_traits<Rhs>::NeedToConjugate
+    };
+    // FIXME: in c++11 this should be auto, and extractScalarFactor should also return auto
+    //        this is important for real*complex_mat
+    Scalar actualAlpha = combine_scalar_factors<Scalar>(lhs, rhs);
+    eval_dynamic_impl(dst, blas_traits<Lhs>::extract(lhs).template conjugateIf<ConjLhs>(),
+                      blas_traits<Rhs>::extract(rhs).template conjugateIf<ConjRhs>(), func, actualAlpha,
+                      bool_constant<HasScalarFactor>());
+  }
+ protected:
+  template <typename Dst, typename LhsT, typename RhsT, typename Func, typename Scalar>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs,
+                                                                      const Func& func, const Scalar& s /* == 1 */,
+                                                                      false_type) {
+    EIGEN_UNUSED_VARIABLE(s);
+    eigen_internal_assert(numext::is_exactly_one(s));
+    call_restricted_packet_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
+  }
+  template <typename Dst, typename LhsT, typename RhsT, typename Func, typename Scalar>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs,
+                                                                      const Func& func, const Scalar& s, true_type) {
+    call_restricted_packet_assignment_no_alias(dst, s * lhs.lazyProduct(rhs), func);
+  }
 };
 // This specialization enforces the use of a coefficient-based evaluation strategy
-template<typename Lhs, typename Rhs>
-struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,LazyCoeffBasedProductMode>
-  : generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> {};
+template <typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, LazyCoeffBasedProductMode>
+    : generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, CoeffBasedProductMode> {};
 // Case 2: Evaluate coeff by coeff
 //
@@ -453,29 +476,27 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,LazyCoeffBasedProductM
 // The main difference is that we add an extra argument to the etor_product_*_impl::run() function
 // for the inner dimension of the product, because evaluator object do not know their size.
-template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
+template <int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
 struct etor_product_coeff_impl;
-template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+template <int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl;
-template<typename Lhs, typename Rhs, int ProductTag>
+template <typename Lhs, typename Rhs, int ProductTag>
 struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, DenseShape>
-    : evaluator_base<Product<Lhs, Rhs, LazyProduct> >
-{
+    : evaluator_base<Product<Lhs, Rhs, LazyProduct>> {
   typedef Product<Lhs, Rhs, LazyProduct> XprType;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit product_evaluator(const XprType& xpr)
-    : m_lhs(xpr.lhs()),
-      m_rhs(xpr.rhs()),
-      m_lhsImpl(m_lhs),     // FIXME the creation of the evaluator objects should result in a no-op, but check that!
-      m_rhsImpl(m_rhs),     //       Moreover, they are only useful for the packet path, so we could completely disable them when not needed,
-                            //       or perhaps declare them on the fly on the packet method... We have experiment to check what's best.
-      m_innerDim(xpr.lhs().cols())
-  {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit product_evaluator(const XprType& xpr)
+      : m_lhs(xpr.lhs()),
+        m_rhs(xpr.rhs()),
+        m_lhsImpl(m_lhs),  // FIXME the creation of the evaluator objects should result in a no-op, but check that!
+        m_rhsImpl(m_rhs),  //       Moreover, they are only useful for the packet path, so we could completely disable
+                           //       them when not needed, or perhaps declare them on the fly on the packet method... We
+                           //       have experiment to check what's best.
+        m_innerDim(xpr.lhs().cols()) {
     EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::AddCost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
@@ -495,11 +516,11 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
   // Everything below here is taken from CoeffBasedProduct.h
-  typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
-  typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
-  typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
-  typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
+  typedef typename internal::nested_eval<Lhs, Rhs::ColsAtCompileTime>::type LhsNested;
+  typedef typename internal::nested_eval<Rhs, Lhs::RowsAtCompileTime>::type RhsNested;
+  typedef internal::remove_all_t<LhsNested> LhsNestedCleaned;
+  typedef internal::remove_all_t<RhsNested> RhsNestedCleaned;
   typedef evaluator<LhsNestedCleaned> LhsEtorType;
   typedef evaluator<RhsNestedCleaned> RhsEtorType;
@@ -507,28 +528,29 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
   enum {
     RowsAtCompileTime = LhsNestedCleaned::RowsAtCompileTime,
     ColsAtCompileTime = RhsNestedCleaned::ColsAtCompileTime,
-    InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsNestedCleaned::ColsAtCompileTime, RhsNestedCleaned::RowsAtCompileTime),
+    InnerSize = min_size_prefer_fixed(LhsNestedCleaned::ColsAtCompileTime, RhsNestedCleaned::RowsAtCompileTime),
     MaxRowsAtCompileTime = LhsNestedCleaned::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime
   };
-  typedef typename find_best_packet<Scalar,RowsAtCompileTime>::type LhsVecPacketType;
-  typedef typename find_best_packet<Scalar,ColsAtCompileTime>::type RhsVecPacketType;
+  typedef typename find_best_packet<Scalar, RowsAtCompileTime>::type LhsVecPacketType;
+  typedef typename find_best_packet<Scalar, ColsAtCompileTime>::type RhsVecPacketType;
   enum {
     LhsCoeffReadCost = LhsEtorType::CoeffReadCost,
     RhsCoeffReadCost = RhsEtorType::CoeffReadCost,
-    CoeffReadCost = InnerSize==0 ? NumTraits<Scalar>::ReadCost
-                  : InnerSize == Dynamic ? HugeCost
-                  : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
-                    + (InnerSize - 1) * NumTraits<Scalar>::AddCost,
+    CoeffReadCost = InnerSize == 0 ? NumTraits<Scalar>::ReadCost
+                    : InnerSize == Dynamic
+                        ? HugeCost
+                        : InnerSize * (NumTraits<Scalar>::MulCost + int(LhsCoeffReadCost) + int(RhsCoeffReadCost)) +
+                              (InnerSize - 1) * NumTraits<Scalar>::AddCost,
     Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
     LhsFlags = LhsEtorType::Flags,
     RhsFlags = RhsEtorType::Flags,
     LhsRowMajor = LhsFlags & RowMajorBit,
     RhsRowMajor = RhsFlags & RowMajorBit,
@@ -536,82 +558,105 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
     RhsVecPacketSize = unpacket_traits<RhsVecPacketType>::size,
     // Here, we don't care about alignment larger than the usable packet size.
-    LhsAlignment = EIGEN_PLAIN_ENUM_MIN(LhsEtorType::Alignment,LhsVecPacketSize*int(sizeof(typename LhsNestedCleaned::Scalar))),
-    RhsAlignment = EIGEN_PLAIN_ENUM_MIN(RhsEtorType::Alignment,RhsVecPacketSize*int(sizeof(typename RhsNestedCleaned::Scalar))),
-    SameType = is_same<typename LhsNestedCleaned::Scalar,typename RhsNestedCleaned::Scalar>::value,
-    CanVectorizeRhs = bool(RhsRowMajor) && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime!=1),
-    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) && (RowsAtCompileTime!=1),
-    EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
-                    : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
-                    : (bool(RhsRowMajor) && !CanVectorizeLhs),
-    Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
-          | (EvalToRowMajor ? RowMajorBit : 0)
-          // TODO enable vectorization for mixed types
-          | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0)
-          | (XprType::IsVectorAtCompileTime ? LinearAccessBit : 0),
-    LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)),
-    RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)),
-    Alignment = bool(CanVectorizeLhs) ? (LhsOuterStrideBytes<=0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment)
-              : bool(CanVectorizeRhs) ? (RhsOuterStrideBytes<=0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment)
-              : 0,
+    LhsAlignment =
+        plain_enum_min(LhsEtorType::Alignment, LhsVecPacketSize* int(sizeof(typename LhsNestedCleaned::Scalar))),
+    RhsAlignment =
+        plain_enum_min(RhsEtorType::Alignment, RhsVecPacketSize* int(sizeof(typename RhsNestedCleaned::Scalar))),
+    SameType = is_same<typename LhsNestedCleaned::Scalar, typename RhsNestedCleaned::Scalar>::value,
+    CanVectorizeRhs = bool(RhsRowMajor) && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime != 1),
+    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) && (RowsAtCompileTime != 1),
+    EvalToRowMajor = (MaxRowsAtCompileTime == 1 && MaxColsAtCompileTime != 1) ? 1
+                     : (MaxColsAtCompileTime == 1 && MaxRowsAtCompileTime != 1)
+                         ? 0
+                         : (bool(RhsRowMajor) && !CanVectorizeLhs),
+    Flags = ((int(LhsFlags) | int(RhsFlags)) & HereditaryBits & ~RowMajorBit) |
+            (EvalToRowMajor ? RowMajorBit : 0)
+            // TODO enable vectorization for mixed types
+            | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0) |
+            (XprType::IsVectorAtCompileTime ? LinearAccessBit : 0),
+    LhsOuterStrideBytes =
+        int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)),
+    RhsOuterStrideBytes =
+        int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)),
+    Alignment = bool(CanVectorizeLhs)
+                    ? (LhsOuterStrideBytes <= 0 || (int(LhsOuterStrideBytes) % plain_enum_max(1, LhsAlignment)) != 0
+                           ? 0
+                           : LhsAlignment)
+                : bool(CanVectorizeRhs)
+                    ? (RhsOuterStrideBytes <= 0 || (int(RhsOuterStrideBytes) % plain_enum_max(1, RhsAlignment)) != 0
+                           ? 0
+                           : RhsAlignment)
+                    : 0,
     /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
      * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
      * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
      * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
      */
-    CanVectorizeInner =    SameType
-                        && LhsRowMajor
-                        && (!RhsRowMajor)
-                        && (LhsFlags & RhsFlags & ActualPacketAccessBit)
-                        && (InnerSize % packet_traits<Scalar>::size == 0)
+    CanVectorizeInner = SameType && LhsRowMajor && (!RhsRowMajor) &&
+                        (int(LhsFlags) & int(RhsFlags) & ActualPacketAccessBit) &&
+                        (int(InnerSize) % packet_traits<Scalar>::size == 0)
   };
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index row, Index col) const
-  {
-    return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index row, Index col) const {
+    return (m_lhs.row(row).transpose().cwiseProduct(m_rhs.col(col))).sum();
   }
   /* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
    * which is why we don't set the LinearAccessBit.
    * TODO: this seems possible when the result is a vector
    */
-  EIGEN_DEVICE_FUNC const CoeffReturnType coeff(Index index) const
-  {
-    const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index;
-    const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0;
-    return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index index) const {
+    const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? 0 : index;
+    const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? index : 0;
+    return (m_lhs.row(row).transpose().cwiseProduct(m_rhs.col(col))).sum();
   }
-  template<int LoadMode, typename PacketType>
-  const PacketType packet(Index row, Index col) const
-  {
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packet(Index row, Index col) const {
     PacketType res;
-    typedef etor_product_packet_impl<bool(int(Flags)&RowMajorBit) ? RowMajor : ColMajor,
-                                     Unroll ? int(InnerSize) : Dynamic,
-                                     LhsEtorType, RhsEtorType, PacketType, LoadMode> PacketImpl;
+    typedef etor_product_packet_impl<bool(int(Flags) & RowMajorBit) ? RowMajor : ColMajor,
+                                     Unroll ? int(InnerSize) : Dynamic, LhsEtorType, RhsEtorType, PacketType, LoadMode>
+        PacketImpl;
     PacketImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
     return res;
   }
-  template<int LoadMode, typename PacketType>
-  const PacketType packet(Index index) const
-  {
-    const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index;
-    const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0;
-    return packet<LoadMode,PacketType>(row,col);
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packet(Index index) const {
+    const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? 0 : index;
+    const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? index : 0;
+    return packet<LoadMode, PacketType>(row, col);
   }
-protected:
-  typename internal::add_const_on_value_type<LhsNested>::type m_lhs;
-  typename internal::add_const_on_value_type<RhsNested>::type m_rhs;
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetSegment(Index row, Index col, Index begin,
+                                                                       Index count) const {
+    PacketType res;
+    typedef etor_product_packet_impl<bool(int(Flags) & RowMajorBit) ? RowMajor : ColMajor,
+                                     Unroll ? int(InnerSize) : Dynamic, LhsEtorType, RhsEtorType, PacketType, LoadMode>
+        PacketImpl;
+    PacketImpl::run_segment(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res, begin, count);
+    return res;
+  }
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetSegment(Index index, Index begin, Index count) const {
+    const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? 0 : index;
+    const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? index : 0;
+    return packetSegment<LoadMode, PacketType>(row, col, begin, count);
+  }
+ protected:
+  add_const_on_value_type_t<LhsNested> m_lhs;
+  add_const_on_value_type_t<RhsNested> m_rhs;
   LhsEtorType m_lhsImpl;
   RhsEtorType m_rhsImpl;
@@ -619,520 +664,624 @@ protected:
   Index m_innerDim;
 };
-template<typename Lhs, typename Rhs>
+template <typename Lhs, typename Rhs>
 struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, LazyCoeffBasedProductMode, DenseShape, DenseShape>
-  : product_evaluator<Product<Lhs, Rhs, LazyProduct>, CoeffBasedProductMode, DenseShape, DenseShape>
-{
+    : product_evaluator<Product<Lhs, Rhs, LazyProduct>, CoeffBasedProductMode, DenseShape, DenseShape> {
   typedef Product<Lhs, Rhs, DefaultProduct> XprType;
   typedef Product<Lhs, Rhs, LazyProduct> BaseProduct;
   typedef product_evaluator<BaseProduct, CoeffBasedProductMode, DenseShape, DenseShape> Base;
-  enum {
-    Flags = Base::Flags | EvalBeforeNestingBit
-  };
-  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
-    : Base(BaseProduct(xpr.lhs(),xpr.rhs()))
-  {}
+  enum { Flags = Base::Flags | EvalBeforeNestingBit };
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit product_evaluator(const XprType& xpr)
+      : Base(BaseProduct(xpr.lhs(), xpr.rhs())) {}
 };
 /****************************************
 *** Coeff based product, Packet path  ***
 ****************************************/
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
-{
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
-  {
-    etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res =  pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex-1))), rhs.template packet<LoadMode,Packet>(Index(UnrollingIndex-1), col), res);
+template <int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                        Index innerDim, Packet& res) {
+    etor_product_packet_impl<RowMajor, UnrollingIndex - 1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs,
+                                                                                            innerDim, res);
+    res = pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex - 1))),
+                rhs.template packet<LoadMode, Packet>(Index(UnrollingIndex - 1), col), res);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index innerDim, Packet& res, Index begin, Index count) {
+    etor_product_packet_impl<RowMajor, UnrollingIndex - 1, Lhs, Rhs, Packet, LoadMode>::run_segment(
+        row, col, lhs, rhs, innerDim, res, begin, count);
+    res = pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex - 1))),
+                rhs.template packetSegment<LoadMode, Packet>(Index(UnrollingIndex - 1), col, begin, count), res);
   }
 };
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
-{
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
-  {
-    etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res =  pmadd(lhs.template packet<LoadMode,Packet>(row, Index(UnrollingIndex-1)), pset1<Packet>(rhs.coeff(Index(UnrollingIndex-1), col)), res);
+template <int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                        Index innerDim, Packet& res) {
+    etor_product_packet_impl<ColMajor, UnrollingIndex - 1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs,
+                                                                                            innerDim, res);
+    res = pmadd(lhs.template packet<LoadMode, Packet>(row, Index(UnrollingIndex - 1)),
+                pset1<Packet>(rhs.coeff(Index(UnrollingIndex - 1), col)), res);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index innerDim, Packet& res, Index begin, Index count) {
+    etor_product_packet_impl<ColMajor, UnrollingIndex - 1, Lhs, Rhs, Packet, LoadMode>::run_segment(
+        row, col, lhs, rhs, innerDim, res, begin, count);
+    res = pmadd(lhs.template packetSegment<LoadMode, Packet>(row, Index(UnrollingIndex - 1), begin, count),
+                pset1<Packet>(rhs.coeff(Index(UnrollingIndex - 1), col)), res);
   }
 };
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
-{
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
-  {
-    res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),rhs.template packet<LoadMode,Packet>(Index(0), col));
+template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                        Index /*innerDim*/, Packet& res) {
+    res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))), rhs.template packet<LoadMode, Packet>(Index(0), col));
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index /*innerDim*/, Packet& res, Index begin,
+                                                                Index count) {
+    res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),
+               rhs.template packetSegment<LoadMode, Packet>(Index(0), col, begin, count));
   }
 };
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
-{
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
-  {
-    res = pmul(lhs.template packet<LoadMode,Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
+template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                        Index /*innerDim*/, Packet& res) {
+    res = pmul(lhs.template packet<LoadMode, Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index /*innerDim*/, Packet& res, Index begin,
+                                                                Index count) {
+    res = pmul(lhs.template packetSegment<LoadMode, Packet>(row, Index(0), begin, count),
+               pset1<Packet>(rhs.coeff(Index(0), col)));
   }
 };
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
-{
-  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
-  {
+template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/,
+                                                        const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res) {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/,
+                                                                const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res,
+                                                                Index /*begin*/, Index /*count*/) {
     res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
   }
 };
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
-{
-  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
-  {
+template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/,
+                                                        const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res) {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/,
+                                                                const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res,
+                                                                Index /*begin*/, Index /*count*/) {
     res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
   }
 };
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
-{
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
-  {
+template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                        Index innerDim, Packet& res) {
     res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
-    for(Index i = 0; i < innerDim; ++i)
-      res =  pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode,Packet>(i, col), res);
+    for (Index i = 0; i < innerDim; ++i)
+      res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode, Packet>(i, col), res);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index innerDim, Packet& res, Index begin, Index count) {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+    for (Index i = 0; i < innerDim; ++i)
+      res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packetSegment<LoadMode, Packet>(i, col, begin, count),
+                  res);
   }
 };
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
-{
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
-  {
+template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                        Index innerDim, Packet& res) {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+    for (Index i = 0; i < innerDim; ++i)
+      res = pmadd(lhs.template packet<LoadMode, Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index innerDim, Packet& res, Index begin, Index count) {
     res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
-    for(Index i = 0; i < innerDim; ++i)
-      res =  pmadd(lhs.template packet<LoadMode,Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
+    for (Index i = 0; i < innerDim; ++i)
+      res = pmadd(lhs.template packetSegment<LoadMode, Packet>(row, i, begin, count), pset1<Packet>(rhs.coeff(i, col)),
+                  res);
   }
 };
 /***************************************************************************
-* Triangular products
-***************************************************************************/
-template<int Mode, bool LhsIsTriangular,
-         typename Lhs, bool LhsIsVector,
-         typename Rhs, bool RhsIsVector>
+ * Triangular products
+ ***************************************************************************/
+template <int Mode, bool LhsIsTriangular, typename Lhs, bool LhsIsVector, typename Rhs, bool RhsIsVector>
 struct triangular_product_impl;
-template<typename Lhs, typename Rhs, int ProductTag>
-struct generic_product_impl<Lhs,Rhs,TriangularShape,DenseShape,ProductTag>
-  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,TriangularShape,DenseShape,ProductTag> >
-{
-  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  template<typename Dest>
-  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
-  {
-    triangular_product_impl<Lhs::Mode,true,typename Lhs::MatrixType,false,Rhs, Rhs::ColsAtCompileTime==1>
-        ::run(dst, lhs.nestedExpression(), rhs, alpha);
+template <typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs, Rhs, TriangularShape, DenseShape, ProductTag>
+    : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, TriangularShape, DenseShape, ProductTag>> {
+  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+  template <typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) {
+    triangular_product_impl<Lhs::Mode, true, typename Lhs::MatrixType, false, Rhs, Rhs::ColsAtCompileTime == 1>::run(
+        dst, lhs.nestedExpression(), rhs, alpha);
   }
 };
-template<typename Lhs, typename Rhs, int ProductTag>
-struct generic_product_impl<Lhs,Rhs,DenseShape,TriangularShape,ProductTag>
-: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,TriangularShape,ProductTag> >
-{
-  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  template<typename Dest>
-  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
-  {
-    triangular_product_impl<Rhs::Mode,false,Lhs,Lhs::RowsAtCompileTime==1, typename Rhs::MatrixType, false>::run(dst, lhs, rhs.nestedExpression(), alpha);
+template <typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs, Rhs, DenseShape, TriangularShape, ProductTag>
+    : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, DenseShape, TriangularShape, ProductTag>> {
+  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+  template <typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) {
+    triangular_product_impl<Rhs::Mode, false, Lhs, Lhs::RowsAtCompileTime == 1, typename Rhs::MatrixType, false>::run(
+        dst, lhs, rhs.nestedExpression(), alpha);
   }
 };
 /***************************************************************************
-* SelfAdjoint products
-***************************************************************************/
-template <typename Lhs, int LhsMode, bool LhsIsVector,
-          typename Rhs, int RhsMode, bool RhsIsVector>
+ * SelfAdjoint products
+ ***************************************************************************/
+template <typename Lhs, int LhsMode, bool LhsIsVector, typename Rhs, int RhsMode, bool RhsIsVector>
 struct selfadjoint_product_impl;
-template<typename Lhs, typename Rhs, int ProductTag>
-struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
-  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag> >
-{
-  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  template<typename Dest>
-  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
-  {
-    selfadjoint_product_impl<typename Lhs::MatrixType,Lhs::Mode,false,Rhs,0,Rhs::IsVectorAtCompileTime>::run(dst, lhs.nestedExpression(), rhs, alpha);
+template <typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs, Rhs, SelfAdjointShape, DenseShape, ProductTag>
+    : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, SelfAdjointShape, DenseShape, ProductTag>> {
+  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) {
+    selfadjoint_product_impl<typename Lhs::MatrixType, Lhs::Mode, false, Rhs, 0, Rhs::ColsAtCompileTime == 1>::run(
+        dst, lhs.nestedExpression(), rhs, alpha);
   }
 };
-template<typename Lhs, typename Rhs, int ProductTag>
-struct generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag>
-: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag> >
-{
-  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  template<typename Dest>
-  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
-  {
-    selfadjoint_product_impl<Lhs,0,Lhs::IsVectorAtCompileTime,typename Rhs::MatrixType,Rhs::Mode,false>::run(dst, lhs, rhs.nestedExpression(), alpha);
+template <typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs, Rhs, DenseShape, SelfAdjointShape, ProductTag>
+    : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, DenseShape, SelfAdjointShape, ProductTag>> {
+  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+  template <typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) {
+    selfadjoint_product_impl<Lhs, 0, Lhs::RowsAtCompileTime == 1, typename Rhs::MatrixType, Rhs::Mode, false>::run(
+        dst, lhs, rhs.nestedExpression(), alpha);
   }
 };
 /***************************************************************************
-* Diagonal products
-***************************************************************************/
-template<typename MatrixType, typename DiagonalType, typename Derived, int ProductOrder>
-struct diagonal_product_evaluator_base
-  : evaluator_base<Derived>
-{
-   typedef typename ScalarBinaryOpTraits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
-public:
+ * Diagonal products
+ ***************************************************************************/
+template <typename MatrixType, typename DiagonalType, typename Derived, int ProductOrder>
+struct diagonal_product_evaluator_base : evaluator_base<Derived> {
+  typedef typename ScalarBinaryOpTraits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
+ public:
   enum {
-    CoeffReadCost = NumTraits<Scalar>::MulCost + evaluator<MatrixType>::CoeffReadCost + evaluator<DiagonalType>::CoeffReadCost,
+    CoeffReadCost = int(NumTraits<Scalar>::MulCost) + int(evaluator<MatrixType>::CoeffReadCost) +
+                    int(evaluator<DiagonalType>::CoeffReadCost),
     MatrixFlags = evaluator<MatrixType>::Flags,
     DiagFlags = evaluator<DiagonalType>::Flags,
-    _StorageOrder = MatrixFlags & RowMajorBit ? RowMajor : ColMajor,
-    _ScalarAccessOnDiag =  !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft)
-                           ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)),
-    _SameTypes = is_same<typename MatrixType::Scalar, typename DiagonalType::Scalar>::value,
+    StorageOrder_ = (Derived::MaxRowsAtCompileTime == 1 && Derived::MaxColsAtCompileTime != 1)   ? RowMajor
+                    : (Derived::MaxColsAtCompileTime == 1 && Derived::MaxRowsAtCompileTime != 1) ? ColMajor
+                    : MatrixFlags & RowMajorBit                                                  ? RowMajor
+                                                                                                 : ColMajor,
+    SameStorageOrder_ = int(StorageOrder_) == ((MatrixFlags & RowMajorBit) ? RowMajor : ColMajor),
+    ScalarAccessOnDiag_ = !((int(StorageOrder_) == ColMajor && int(ProductOrder) == OnTheLeft) ||
+                            (int(StorageOrder_) == RowMajor && int(ProductOrder) == OnTheRight)),
+    SameTypes_ = is_same<typename MatrixType::Scalar, typename DiagonalType::Scalar>::value,
     // FIXME currently we need same types, but in the future the next rule should be the one
-    //_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))),
-    _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
-    _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
-    Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
+    // Vectorizable_ = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (SameTypes_ &&
+    // bool(int(DiagFlags)&PacketAccessBit))),
+    Vectorizable_ = bool(int(MatrixFlags) & PacketAccessBit) && SameTypes_ &&
+                    (SameStorageOrder_ || (MatrixFlags & LinearAccessBit) == LinearAccessBit) &&
+                    (ScalarAccessOnDiag_ || (bool(int(DiagFlags) & PacketAccessBit))),
+    LinearAccessMask_ =
+        (MatrixType::RowsAtCompileTime == 1 || MatrixType::ColsAtCompileTime == 1) ? LinearAccessBit : 0,
+    Flags =
+        ((HereditaryBits | LinearAccessMask_) & (unsigned int)(MatrixFlags)) | (Vectorizable_ ? PacketAccessBit : 0),
     Alignment = evaluator<MatrixType>::Alignment,
-    AsScalarProduct =     (DiagonalType::SizeAtCompileTime==1)
-                      ||  (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
-                      ||  (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
+    AsScalarProduct =
+        (DiagonalType::SizeAtCompileTime == 1) ||
+        (DiagonalType::SizeAtCompileTime == Dynamic && MatrixType::RowsAtCompileTime == 1 &&
+         ProductOrder == OnTheLeft) ||
+        (DiagonalType::SizeAtCompileTime == Dynamic && MatrixType::ColsAtCompileTime == 1 && ProductOrder == OnTheRight)
   };
-  diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
-    : m_diagImpl(diag), m_matImpl(mat)
-  {
+  EIGEN_DEVICE_FUNC diagonal_product_evaluator_base(const MatrixType& mat, const DiagonalType& diag)
+      : m_diagImpl(diag), m_matImpl(mat) {
     EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
-  {
-    if(AsScalarProduct)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const {
+    if (AsScalarProduct)
       return m_diagImpl.coeff(0) * m_matImpl.coeff(idx);
     else
       return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
   }
-protected:
-  template<int LoadMode,typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::true_type) const
-  {
-    return internal::pmul(m_matImpl.template packet<LoadMode,PacketType>(row, col),
+ protected:
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::true_type) const {
+    return internal::pmul(m_matImpl.template packet<LoadMode, PacketType>(row, col),
                           internal::pset1<PacketType>(m_diagImpl.coeff(id)));
   }
-  template<int LoadMode,typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::false_type) const
-  {
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::false_type) const {
     enum {
       InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
-      DiagonalPacketLoadMode = EIGEN_PLAIN_ENUM_MIN(LoadMode,((InnerSize%16) == 0) ? int(Aligned16) : int(evaluator<DiagonalType>::Alignment)) // FIXME hardcoded 16!!
+      DiagonalPacketLoadMode = plain_enum_min(
+          LoadMode,
+          ((InnerSize % 16) == 0) ? int(Aligned16) : int(evaluator<DiagonalType>::Alignment))  // FIXME hardcoded 16!!
     };
-    return internal::pmul(m_matImpl.template packet<LoadMode,PacketType>(row, col),
-                          m_diagImpl.template packet<DiagonalPacketLoadMode,PacketType>(id));
+    return internal::pmul(m_matImpl.template packet<LoadMode, PacketType>(row, col),
+                          m_diagImpl.template packet<DiagonalPacketLoadMode, PacketType>(id));
   }
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet_segment_impl(Index row, Index col, Index id, Index begin, Index count,
+                                                     internal::true_type) const {
+    return internal::pmul(m_matImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
+                          internal::pset1<PacketType>(m_diagImpl.coeff(id)));
+  }
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet_segment_impl(Index row, Index col, Index id, Index begin, Index count,
+                                                     internal::false_type) const {
+    enum {
+      InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
+      DiagonalPacketLoadMode = plain_enum_min(
+          LoadMode,
+          ((InnerSize % 16) == 0) ? int(Aligned16) : int(evaluator<DiagonalType>::Alignment))  // FIXME hardcoded 16!!
+    };
+    return internal::pmul(m_matImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
+                          m_diagImpl.template packetSegment<DiagonalPacketLoadMode, PacketType>(id, begin, count));
+  }
   evaluator<DiagonalType> m_diagImpl;
-  evaluator<MatrixType>   m_matImpl;
+  evaluator<MatrixType> m_matImpl;
 };
 // diagonal * dense
-template<typename Lhs, typename Rhs, int ProductKind, int ProductTag>
+template <typename Lhs, typename Rhs, int ProductKind, int ProductTag>
 struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalShape, DenseShape>
-  : diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheLeft>
-{
-  typedef diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheLeft> Base;
+    : diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>,
+                                      OnTheLeft> {
+  typedef diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>,
+                                          OnTheLeft>
+      Base;
+  using Base::coeff;
   using Base::m_diagImpl;
   using Base::m_matImpl;
-  using Base::coeff;
   typedef typename Base::Scalar Scalar;
   typedef Product<Lhs, Rhs, ProductKind> XprType;
   typedef typename XprType::PlainObject PlainObject;
-  enum {
-    StorageOrder = int(Rhs::Flags) & RowMajorBit ? RowMajor : ColMajor
-  };
+  typedef typename Lhs::DiagonalVectorType DiagonalType;
-  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
-    : Base(xpr.rhs(), xpr.lhs().diagonal())
-  {
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
-  {
+  static constexpr int StorageOrder = Base::StorageOrder_;
+  using IsRowMajor_t = bool_constant<StorageOrder == RowMajor>;
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const {
     return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
   }
-#ifndef __CUDACC__
-  template<int LoadMode,typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
-  {
+#ifndef EIGEN_GPUCC
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
     // FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case.
     // See also similar calls below.
-    return this->template packet_impl<LoadMode,PacketType>(row,col, row,
-                                 typename internal::conditional<int(StorageOrder)==RowMajor, internal::true_type, internal::false_type>::type());
+    return this->template packet_impl<LoadMode, PacketType>(row, col, row, IsRowMajor_t());
   }
-  template<int LoadMode,typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index idx) const
-  {
-    return packet<LoadMode,PacketType>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index idx) const {
+    return packet<LoadMode, PacketType>(int(StorageOrder) == ColMajor ? idx : 0,
+                                        int(StorageOrder) == ColMajor ? 0 : idx);
+  }
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    // FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case.
+    // See also similar calls below.
+    return this->template packet_segment_impl<LoadMode, PacketType>(row, col, row, begin, count, IsRowMajor_t());
+  }
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packetSegment(Index idx, Index begin, Index count) const {
+    return packetSegment<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx,
+                                               begin, count);
   }
 #endif
 };
 // dense * diagonal
-template<typename Lhs, typename Rhs, int ProductKind, int ProductTag>
+template <typename Lhs, typename Rhs, int ProductKind, int ProductTag>
 struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape, DiagonalShape>
-  : diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheRight>
-{
-  typedef diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheRight> Base;
+    : diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>,
+                                      OnTheRight> {
+  typedef diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>,
+                                          OnTheRight>
+      Base;
+  using Base::coeff;
   using Base::m_diagImpl;
   using Base::m_matImpl;
-  using Base::coeff;
   typedef typename Base::Scalar Scalar;
   typedef Product<Lhs, Rhs, ProductKind> XprType;
   typedef typename XprType::PlainObject PlainObject;
-  enum { StorageOrder = int(Lhs::Flags) & RowMajorBit ? RowMajor : ColMajor };
-  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
-    : Base(xpr.lhs(), xpr.rhs().diagonal())
-  {
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
-  {
+  static constexpr int StorageOrder = Base::StorageOrder_;
+  using IsColMajor_t = bool_constant<StorageOrder == ColMajor>;
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs().diagonal()) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const {
     return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
   }
-#ifndef __CUDACC__
-  template<int LoadMode,typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
-  {
-    return this->template packet_impl<LoadMode,PacketType>(row,col, col,
-                                 typename internal::conditional<int(StorageOrder)==ColMajor, internal::true_type, internal::false_type>::type());
+#ifndef EIGEN_GPUCC
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+    return this->template packet_impl<LoadMode, PacketType>(row, col, col, IsColMajor_t());
   }
-  template<int LoadMode,typename PacketType>
-  EIGEN_STRONG_INLINE PacketType packet(Index idx) const
-  {
-    return packet<LoadMode,PacketType>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index idx) const {
+    return packet<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx);
+  }
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return this->template packet_segment_impl<LoadMode, PacketType>(row, col, col, begin, count, IsColMajor_t());
+  }
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packetSegment(Index idx, Index begin, Index count) const {
+    return packetSegment<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx,
+                                               begin, count);
   }
 #endif
 };
 /***************************************************************************
-* Products with permutation matrices
-***************************************************************************/
+ * Products with permutation matrices
+ ***************************************************************************/
 /** \internal
-  * \class permutation_matrix_product
-  * Internal helper class implementing the product between a permutation matrix and a matrix.
-  * This class is specialized for DenseShape below and for SparseShape in SparseCore/SparsePermutation.h
-  */
-template<typename ExpressionType, int Side, bool Transposed, typename ExpressionShape>
+ * \class permutation_matrix_product
+ * Internal helper class implementing the product between a permutation matrix and a matrix.
+ * This class is specialized for DenseShape below and for SparseShape in SparseCore/SparsePermutation.h
+ */
+template <typename ExpressionType, int Side, bool Transposed, typename ExpressionShape>
 struct permutation_matrix_product;
-template<typename ExpressionType, int Side, bool Transposed>
-struct permutation_matrix_product<ExpressionType, Side, Transposed, DenseShape>
-{
-    typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
-    typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
-    template<typename Dest, typename PermutationType>
-    static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr)
-    {
-      MatrixType mat(xpr);
-      const Index n = Side==OnTheLeft ? mat.rows() : mat.cols();
-      // FIXME we need an is_same for expression that is not sensitive to constness. For instance
-      // is_same_xpr<Block<const Matrix>, Block<Matrix> >::value should be true.
-      //if(is_same<MatrixTypeCleaned,Dest>::value && extract_data(dst) == extract_data(mat))
-      if(is_same_dense(dst, mat))
-      {
-        // apply the permutation inplace
-        Matrix<bool,PermutationType::RowsAtCompileTime,1,0,PermutationType::MaxRowsAtCompileTime> mask(perm.size());
-        mask.fill(false);
-        Index r = 0;
-        while(r < perm.size())
-        {
-          // search for the next seed
-          while(r<perm.size() && mask[r]) r++;
-          if(r>=perm.size())
-            break;
-          // we got one, let's follow it until we are back to the seed
-          Index k0 = r++;
-          Index kPrev = k0;
-          mask.coeffRef(k0) = true;
-          for(Index k=perm.indices().coeff(k0); k!=k0; k=perm.indices().coeff(k))
-          {
-                  Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>(dst, k)
-            .swap(Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
-                       (dst,((Side==OnTheLeft) ^ Transposed) ? k0 : kPrev));
-            mask.coeffRef(k) = true;
-            kPrev = k;
-          }
-        }
-      }
-      else
-      {
-        for(Index i = 0; i < n; ++i)
-        {
-          Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
-               (dst, ((Side==OnTheLeft) ^ Transposed) ? perm.indices().coeff(i) : i)
-          =
+template <typename ExpressionType, int Side, bool Transposed>
+struct permutation_matrix_product<ExpressionType, Side, Transposed, DenseShape> {
+  typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
+  typedef remove_all_t<MatrixType> MatrixTypeCleaned;
-          Block<const MatrixTypeCleaned,Side==OnTheLeft ? 1 : MatrixTypeCleaned::RowsAtCompileTime,Side==OnTheRight ? 1 : MatrixTypeCleaned::ColsAtCompileTime>
-               (mat, ((Side==OnTheRight) ^ Transposed) ? perm.indices().coeff(i) : i);
+  template <typename Dest, typename PermutationType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const PermutationType& perm,
+                                                        const ExpressionType& xpr) {
+    MatrixType mat(xpr);
+    const Index n = Side == OnTheLeft ? mat.rows() : mat.cols();
+    // FIXME we need an is_same for expression that is not sensitive to constness. For instance
+    // is_same_xpr<Block<const Matrix>, Block<Matrix> >::value should be true.
+    // if(is_same<MatrixTypeCleaned,Dest>::value && extract_data(dst) == extract_data(mat))
+    if (is_same_dense(dst, mat)) {
+      // apply the permutation inplace
+      Matrix<bool, PermutationType::RowsAtCompileTime, 1, 0, PermutationType::MaxRowsAtCompileTime> mask(perm.size());
+      mask.fill(false);
+      Index r = 0;
+      while (r < perm.size()) {
+        // search for the next seed
+        while (r < perm.size() && mask[r]) r++;
+        if (r >= perm.size()) break;
+        // we got one, let's follow it until we are back to the seed
+        Index k0 = r++;
+        Index kPrev = k0;
+        mask.coeffRef(k0) = true;
+        for (Index k = perm.indices().coeff(k0); k != k0; k = perm.indices().coeff(k)) {
+          Block<Dest, Side == OnTheLeft ? 1 : Dest::RowsAtCompileTime,
+                Side == OnTheRight ? 1 : Dest::ColsAtCompileTime>(dst, k)
+              .swap(Block < Dest, Side == OnTheLeft ? 1 : Dest::RowsAtCompileTime,
+                    Side == OnTheRight
+                        ? 1
+                        : Dest::ColsAtCompileTime > (dst, ((Side == OnTheLeft) ^ Transposed) ? k0 : kPrev));
+          mask.coeffRef(k) = true;
+          kPrev = k;
         }
       }
+    } else {
+      for (Index i = 0; i < n; ++i) {
+        Block<Dest, Side == OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side == OnTheRight ? 1 : Dest::ColsAtCompileTime>(
+            dst, ((Side == OnTheLeft) ^ Transposed) ? perm.indices().coeff(i) : i)
+            =
+                Block < const MatrixTypeCleaned,
+            Side == OnTheLeft ? 1 : MatrixTypeCleaned::RowsAtCompileTime,
+            Side == OnTheRight ? 1
+                               : MatrixTypeCleaned::ColsAtCompileTime >
+                                     (mat, ((Side == OnTheRight) ^ Transposed) ? perm.indices().coeff(i) : i);
+      }
     }
+  }
 };
-template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
-struct generic_product_impl<Lhs, Rhs, PermutationShape, MatrixShape, ProductTag>
-{
-  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
-  {
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, PermutationShape, MatrixShape, ProductTag> {
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) {
     permutation_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
   }
 };
-template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
-struct generic_product_impl<Lhs, Rhs, MatrixShape, PermutationShape, ProductTag>
-{
-  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
-  {
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, MatrixShape, PermutationShape, ProductTag> {
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) {
     permutation_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
   }
 };
-template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
-struct generic_product_impl<Inverse<Lhs>, Rhs, PermutationShape, MatrixShape, ProductTag>
-{
-  template<typename Dest>
-  static void evalTo(Dest& dst, const Inverse<Lhs>& lhs, const Rhs& rhs)
-  {
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Inverse<Lhs>, Rhs, PermutationShape, MatrixShape, ProductTag> {
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Inverse<Lhs>& lhs, const Rhs& rhs) {
     permutation_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
   }
 };
-template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
-struct generic_product_impl<Lhs, Inverse<Rhs>, MatrixShape, PermutationShape, ProductTag>
-{
-  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Inverse<Rhs>& rhs)
-  {
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Inverse<Rhs>, MatrixShape, PermutationShape, ProductTag> {
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Inverse<Rhs>& rhs) {
     permutation_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
   }
 };
 /***************************************************************************
-* Products with transpositions matrices
-***************************************************************************/
+ * Products with transpositions matrices
+ ***************************************************************************/
 // FIXME could we unify Transpositions and Permutation into a single "shape"??
 /** \internal
-  * \class transposition_matrix_product
-  * Internal helper class implementing the product between a permutation matrix and a matrix.
-  */
-template<typename ExpressionType, int Side, bool Transposed, typename ExpressionShape>
-struct transposition_matrix_product
-{
+ * \class transposition_matrix_product
+ * Internal helper class implementing the product between a permutation matrix and a matrix.
+ */
+template <typename ExpressionType, int Side, bool Transposed, typename ExpressionShape>
+struct transposition_matrix_product {
   typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
-  typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
-  template<typename Dest, typename TranspositionType>
-  static inline void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr)
-  {
+  typedef remove_all_t<MatrixType> MatrixTypeCleaned;
+  template <typename Dest, typename TranspositionType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const TranspositionType& tr,
+                                                        const ExpressionType& xpr) {
     MatrixType mat(xpr);
     typedef typename TranspositionType::StorageIndex StorageIndex;
     const Index size = tr.size();
     StorageIndex j = 0;
-    if(!is_same_dense(dst,mat))
-      dst = mat;
+    if (!is_same_dense(dst, mat)) dst = mat;
-    for(Index k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k)
-      if(Index(j=tr.coeff(k))!=k)
-      {
-        if(Side==OnTheLeft)        dst.row(k).swap(dst.row(j));
-        else if(Side==OnTheRight)  dst.col(k).swap(dst.col(j));
+    for (Index k = (Transposed ? size - 1 : 0); Transposed ? k >= 0 : k < size; Transposed ? --k : ++k)
+      if (Index(j = tr.coeff(k)) != k) {
+        if (Side == OnTheLeft)
+          dst.row(k).swap(dst.row(j));
+        else if (Side == OnTheRight)
+          dst.col(k).swap(dst.col(j));
       }
   }
 };
-template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
-struct generic_product_impl<Lhs, Rhs, TranspositionsShape, MatrixShape, ProductTag>
-{
-  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
-  {
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, TranspositionsShape, MatrixShape, ProductTag> {
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) {
     transposition_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
   }
 };
-template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
-struct generic_product_impl<Lhs, Rhs, MatrixShape, TranspositionsShape, ProductTag>
-{
-  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
-  {
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, MatrixShape, TranspositionsShape, ProductTag> {
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) {
     transposition_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
   }
 };
-template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
-struct generic_product_impl<Transpose<Lhs>, Rhs, TranspositionsShape, MatrixShape, ProductTag>
-{
-  template<typename Dest>
-  static void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs)
-  {
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Transpose<Lhs>, Rhs, TranspositionsShape, MatrixShape, ProductTag> {
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs) {
     transposition_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
   }
 };
-template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
-struct generic_product_impl<Lhs, Transpose<Rhs>, MatrixShape, TranspositionsShape, ProductTag>
-{
-  template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs)
-  {
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Transpose<Rhs>, MatrixShape, TranspositionsShape, ProductTag> {
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs) {
     transposition_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
   }
 };
-} // end namespace internal
+/***************************************************************************
+ * skew symmetric products
+ * for now we just call the generic implementation
+ ***************************************************************************/
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, SkewSymmetricShape, MatrixShape, ProductTag> {
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) {
+    generic_product_impl<typename Lhs::DenseMatrixType, Rhs, DenseShape, MatrixShape, ProductTag>::evalTo(dst, lhs,
+                                                                                                          rhs);
+  }
+};
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, MatrixShape, SkewSymmetricShape, ProductTag> {
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) {
+    generic_product_impl<Lhs, typename Rhs::DenseMatrixType, MatrixShape, DenseShape, ProductTag>::evalTo(dst, lhs,
+                                                                                                          rhs);
+  }
+};
+template <typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs, Rhs, SkewSymmetricShape, SkewSymmetricShape, ProductTag> {
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) {
+    generic_product_impl<typename Lhs::DenseMatrixType, typename Rhs::DenseMatrixType, DenseShape, DenseShape,
+                         ProductTag>::evalTo(dst, lhs, rhs);
+  }
+};
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, MatrixShape, HomogeneousShape, ProductTag>
+    : generic_product_impl<Lhs, typename Rhs::PlainObject, MatrixShape, DenseShape, ProductTag> {};
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, HomogeneousShape, MatrixShape, ProductTag>
+    : generic_product_impl<typename Lhs::PlainObject, Rhs, DenseShape, MatrixShape, ProductTag> {};
+template <typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs, Rhs, PermutationShape, HomogeneousShape, ProductTag>
+    : generic_product_impl<Lhs, Rhs, PermutationShape, DenseShape, ProductTag> {};
+template <typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs, Rhs, HomogeneousShape, PermutationShape, ProductTag>
+    : generic_product_impl<Lhs, Rhs, DenseShape, PermutationShape, ProductTag> {};
+}  // end namespace internal
-} // end namespace Eigen
+}  // end namespace Eigen
-#endif // EIGEN_PRODUCT_EVALUATORS_H
+#endif  // EIGEN_PRODUCT_EVALUATORS_H