@smake/eigen 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/eigen/Eigen/AccelerateSupport +52 -0
- package/eigen/Eigen/Cholesky +18 -20
- package/eigen/Eigen/CholmodSupport +28 -28
- package/eigen/Eigen/Core +187 -120
- package/eigen/Eigen/Eigenvalues +16 -13
- package/eigen/Eigen/Geometry +18 -18
- package/eigen/Eigen/Householder +9 -7
- package/eigen/Eigen/IterativeLinearSolvers +8 -4
- package/eigen/Eigen/Jacobi +14 -13
- package/eigen/Eigen/KLUSupport +23 -21
- package/eigen/Eigen/LU +15 -16
- package/eigen/Eigen/MetisSupport +12 -12
- package/eigen/Eigen/OrderingMethods +54 -51
- package/eigen/Eigen/PaStiXSupport +23 -21
- package/eigen/Eigen/PardisoSupport +17 -14
- package/eigen/Eigen/QR +18 -20
- package/eigen/Eigen/QtAlignedMalloc +5 -12
- package/eigen/Eigen/SPQRSupport +21 -14
- package/eigen/Eigen/SVD +23 -17
- package/eigen/Eigen/Sparse +1 -2
- package/eigen/Eigen/SparseCholesky +18 -15
- package/eigen/Eigen/SparseCore +18 -17
- package/eigen/Eigen/SparseLU +9 -9
- package/eigen/Eigen/SparseQR +16 -14
- package/eigen/Eigen/StdDeque +5 -2
- package/eigen/Eigen/StdList +5 -2
- package/eigen/Eigen/StdVector +5 -2
- package/eigen/Eigen/SuperLUSupport +30 -24
- package/eigen/Eigen/ThreadPool +80 -0
- package/eigen/Eigen/UmfPackSupport +19 -17
- package/eigen/Eigen/Version +14 -0
- package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
- package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/LDLT.h +366 -405
- package/eigen/Eigen/src/Cholesky/LLT.h +323 -367
- package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
- package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +585 -529
- package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/ArithmeticSequence.h +143 -317
- package/eigen/Eigen/src/Core/Array.h +329 -370
- package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
- package/eigen/Eigen/src/Core/ArrayWrapper.h +126 -170
- package/eigen/Eigen/src/Core/Assign.h +30 -40
- package/eigen/Eigen/src/Core/AssignEvaluator.h +651 -604
- package/eigen/Eigen/src/Core/Assign_MKL.h +125 -120
- package/eigen/Eigen/src/Core/BandMatrix.h +267 -282
- package/eigen/Eigen/src/Core/Block.h +371 -390
- package/eigen/Eigen/src/Core/CommaInitializer.h +85 -100
- package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
- package/eigen/Eigen/src/Core/CoreEvaluators.h +1214 -937
- package/eigen/Eigen/src/Core/CoreIterators.h +72 -63
- package/eigen/Eigen/src/Core/CwiseBinaryOp.h +112 -129
- package/eigen/Eigen/src/Core/CwiseNullaryOp.h +676 -702
- package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
- package/eigen/Eigen/src/Core/CwiseUnaryOp.h +55 -67
- package/eigen/Eigen/src/Core/CwiseUnaryView.h +127 -92
- package/eigen/Eigen/src/Core/DenseBase.h +630 -658
- package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -628
- package/eigen/Eigen/src/Core/DenseStorage.h +511 -590
- package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
- package/eigen/Eigen/src/Core/Diagonal.h +168 -207
- package/eigen/Eigen/src/Core/DiagonalMatrix.h +346 -317
- package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
- package/eigen/Eigen/src/Core/Dot.h +167 -217
- package/eigen/Eigen/src/Core/EigenBase.h +74 -85
- package/eigen/Eigen/src/Core/Fill.h +138 -0
- package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
- package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -113
- package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
- package/eigen/Eigen/src/Core/GeneralProduct.h +315 -261
- package/eigen/Eigen/src/Core/GenericPacketMath.h +1182 -520
- package/eigen/Eigen/src/Core/GlobalFunctions.h +193 -157
- package/eigen/Eigen/src/Core/IO.h +131 -156
- package/eigen/Eigen/src/Core/IndexedView.h +209 -125
- package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
- package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/Inverse.h +50 -59
- package/eigen/Eigen/src/Core/Map.h +123 -141
- package/eigen/Eigen/src/Core/MapBase.h +255 -282
- package/eigen/Eigen/src/Core/MathFunctions.h +1247 -1201
- package/eigen/Eigen/src/Core/MathFunctionsImpl.h +162 -99
- package/eigen/Eigen/src/Core/Matrix.h +463 -494
- package/eigen/Eigen/src/Core/MatrixBase.h +468 -470
- package/eigen/Eigen/src/Core/NestByValue.h +58 -52
- package/eigen/Eigen/src/Core/NoAlias.h +79 -86
- package/eigen/Eigen/src/Core/NumTraits.h +206 -206
- package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +163 -142
- package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
- package/eigen/Eigen/src/Core/PlainObjectBase.h +858 -972
- package/eigen/Eigen/src/Core/Product.h +246 -130
- package/eigen/Eigen/src/Core/ProductEvaluators.h +779 -671
- package/eigen/Eigen/src/Core/Random.h +153 -164
- package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
- package/eigen/Eigen/src/Core/RealView.h +250 -0
- package/eigen/Eigen/src/Core/Redux.h +334 -314
- package/eigen/Eigen/src/Core/Ref.h +259 -257
- package/eigen/Eigen/src/Core/Replicate.h +92 -104
- package/eigen/Eigen/src/Core/Reshaped.h +215 -271
- package/eigen/Eigen/src/Core/ReturnByValue.h +47 -55
- package/eigen/Eigen/src/Core/Reverse.h +133 -148
- package/eigen/Eigen/src/Core/Select.h +68 -140
- package/eigen/Eigen/src/Core/SelfAdjointView.h +254 -290
- package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
- package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
- package/eigen/Eigen/src/Core/Solve.h +88 -102
- package/eigen/Eigen/src/Core/SolveTriangular.h +126 -124
- package/eigen/Eigen/src/Core/SolverBase.h +132 -133
- package/eigen/Eigen/src/Core/StableNorm.h +113 -147
- package/eigen/Eigen/src/Core/StlIterators.h +404 -248
- package/eigen/Eigen/src/Core/Stride.h +90 -92
- package/eigen/Eigen/src/Core/Swap.h +70 -39
- package/eigen/Eigen/src/Core/Transpose.h +258 -295
- package/eigen/Eigen/src/Core/Transpositions.h +270 -333
- package/eigen/Eigen/src/Core/TriangularMatrix.h +642 -743
- package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
- package/eigen/Eigen/src/Core/VectorwiseOp.h +653 -704
- package/eigen/Eigen/src/Core/Visitor.h +464 -308
- package/eigen/Eigen/src/Core/arch/AVX/Complex.h +380 -187
- package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +65 -163
- package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2145 -638
- package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
- package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +253 -60
- package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +278 -228
- package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +48 -269
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1597 -754
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
- package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +229 -41
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +420 -184
- package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +40 -49
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2962 -2213
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +196 -212
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +713 -441
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2380 -1362
- package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
- package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +390 -224
- package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +78 -67
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1784 -799
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +167 -50
- package/eigen/Eigen/src/Core/arch/Default/Half.h +528 -379
- package/eigen/Eigen/src/Core/arch/Default/Settings.h +10 -12
- package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
- package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +41 -40
- package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +550 -523
- package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
- package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +27 -30
- package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +8 -8
- package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
- package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
- package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
- package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
- package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
- package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
- package/eigen/Eigen/src/Core/arch/MSA/Complex.h +54 -82
- package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +84 -92
- package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +51 -47
- package/eigen/Eigen/src/Core/arch/NEON/Complex.h +454 -306
- package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +175 -115
- package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +23 -30
- package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4366 -2857
- package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +616 -393
- package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
- package/eigen/Eigen/src/Core/arch/SSE/Complex.h +350 -198
- package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +38 -149
- package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +1791 -912
- package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
- package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +128 -40
- package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +10 -6
- package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +156 -234
- package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +6 -3
- package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +27 -32
- package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +119 -117
- package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +325 -419
- package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +15 -17
- package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +325 -181
- package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +94 -83
- package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +811 -458
- package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +121 -124
- package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +576 -370
- package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +194 -109
- package/eigen/Eigen/src/Core/functors/StlFunctors.h +95 -112
- package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
- package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1038 -749
- package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1883 -1375
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +312 -370
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +189 -176
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +84 -81
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +292 -337
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
- package/eigen/Eigen/src/Core/products/Parallelizer.h +207 -105
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +327 -388
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +138 -147
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
- package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
- package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -47
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -277
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
- package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +68 -94
- package/eigen/Eigen/src/Core/util/Assert.h +158 -0
- package/eigen/Eigen/src/Core/util/BlasUtil.h +342 -303
- package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +348 -317
- package/eigen/Eigen/src/Core/util/Constants.h +297 -262
- package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -90
- package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
- package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +449 -247
- package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
- package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
- package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +417 -116
- package/eigen/Eigen/src/Core/util/IntegralConstant.h +211 -204
- package/eigen/Eigen/src/Core/util/MKL_support.h +39 -37
- package/eigen/Eigen/src/Core/util/Macros.h +655 -773
- package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
- package/eigen/Eigen/src/Core/util/Memory.h +970 -748
- package/eigen/Eigen/src/Core/util/Meta.h +581 -633
- package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
- package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
- package/eigen/Eigen/src/Core/util/ReshapedHelper.h +17 -17
- package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
- package/eigen/Eigen/src/Core/util/StaticAssert.h +50 -166
- package/eigen/Eigen/src/Core/util/SymbolicIndex.h +377 -225
- package/eigen/Eigen/src/Core/util/XprHelper.h +784 -547
- package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
- package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
- package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
- package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
- package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
- package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +89 -105
- package/eigen/Eigen/src/Eigenvalues/RealQZ.h +537 -607
- package/eigen/Eigen/src/Eigenvalues/RealSchur.h +342 -381
- package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +541 -595
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
- package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +430 -462
- package/eigen/Eigen/src/Geometry/AlignedBox.h +226 -227
- package/eigen/Eigen/src/Geometry/AngleAxis.h +131 -133
- package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
- package/eigen/Eigen/src/Geometry/Homogeneous.h +285 -333
- package/eigen/Eigen/src/Geometry/Hyperplane.h +151 -160
- package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -146
- package/eigen/Eigen/src/Geometry/ParametrizedLine.h +127 -127
- package/eigen/Eigen/src/Geometry/Quaternion.h +566 -506
- package/eigen/Eigen/src/Geometry/Rotation2D.h +107 -105
- package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
- package/eigen/Eigen/src/Geometry/Scaling.h +113 -106
- package/eigen/Eigen/src/Geometry/Transform.h +858 -936
- package/eigen/Eigen/src/Geometry/Translation.h +94 -92
- package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
- package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +90 -104
- package/eigen/Eigen/src/Householder/BlockHouseholder.h +51 -46
- package/eigen/Eigen/src/Householder/Householder.h +102 -124
- package/eigen/Eigen/src/Householder/HouseholderSequence.h +412 -453
- package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +149 -162
- package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +124 -119
- package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +92 -104
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +251 -243
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +224 -228
- package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +178 -227
- package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +79 -84
- package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +54 -60
- package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Jacobi/Jacobi.h +252 -308
- package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/KLUSupport/KLUSupport.h +208 -227
- package/eigen/Eigen/src/LU/Determinant.h +50 -69
- package/eigen/Eigen/src/LU/FullPivLU.h +545 -596
- package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/LU/InverseImpl.h +206 -285
- package/eigen/Eigen/src/LU/PartialPivLU.h +390 -428
- package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
- package/eigen/Eigen/src/LU/arch/InverseSize4.h +72 -70
- package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
- package/eigen/Eigen/src/OrderingMethods/Amd.h +243 -265
- package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +831 -1004
- package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/OrderingMethods/Ordering.h +112 -119
- package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
- package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -430
- package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +479 -479
- package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
- package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +166 -153
- package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +495 -475
- package/eigen/Eigen/src/QR/HouseholderQR.h +394 -285
- package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
- package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +244 -264
- package/eigen/Eigen/src/SVD/BDCSVD.h +817 -713
- package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
- package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SVD/JacobiSVD.h +577 -543
- package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
- package/eigen/Eigen/src/SVD/SVDBase.h +242 -182
- package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +200 -235
- package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +765 -594
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +308 -94
- package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
- package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -252
- package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +134 -178
- package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCore/SparseAssign.h +149 -140
- package/eigen/Eigen/src/SparseCore/SparseBlock.h +403 -440
- package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
- package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +525 -303
- package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +555 -339
- package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
- package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +169 -197
- package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
- package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
- package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
- package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
- package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1603 -1245
- package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -350
- package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
- package/eigen/Eigen/src/SparseCore/SparseProduct.h +94 -97
- package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
- package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
- package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +370 -416
- package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
- package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
- package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
- package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
- package/eigen/Eigen/src/SparseCore/SparseUtil.h +138 -115
- package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
- package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
- package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
- package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseLU/SparseLU.h +756 -710
- package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
- package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
- package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
- package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +245 -301
- package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
- package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
- package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +89 -100
- package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
- package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
- package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +124 -132
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
- package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
- package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
- package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
- package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseQR/SparseQR.h +450 -502
- package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -93
- package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
- package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
- package/eigen/Eigen/src/StlSupport/details.h +48 -50
- package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -730
- package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
- package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
- package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
- package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
- package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
- package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
- package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
- package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
- package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
- package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
- package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
- package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
- package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +428 -464
- package/eigen/Eigen/src/misc/Image.h +41 -43
- package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/misc/Kernel.h +39 -41
- package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
- package/eigen/Eigen/src/misc/blas.h +83 -426
- package/eigen/Eigen/src/misc/lapacke.h +9972 -16179
- package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
- package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
- package/eigen/Eigen/src/plugins/{BlockMethods.h → BlockMethods.inc} +434 -506
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
- package/eigen/Eigen/src/plugins/{CommonCwiseUnaryOps.h → CommonCwiseUnaryOps.inc} +58 -68
- package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
- package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
- package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
- package/package.json +1 -1
- package/eigen/COPYING.APACHE +0 -203
- package/eigen/COPYING.BSD +0 -26
- package/eigen/COPYING.GPL +0 -674
- package/eigen/COPYING.LGPL +0 -502
- package/eigen/COPYING.MINPACK +0 -51
- package/eigen/COPYING.MPL2 +0 -373
- package/eigen/COPYING.README +0 -18
- package/eigen/Eigen/src/Core/BooleanRedux.h +0 -162
- package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -258
- package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +0 -120
- package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +0 -694
- package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
- package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
- package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
- package/eigen/Eigen/src/misc/lapack.h +0 -152
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -358
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -696
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
- package/eigen/Eigen/src/plugins/IndexedViewMethods.h +0 -262
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -95
- package/eigen/Eigen/src/plugins/ReshapedMethods.h +0 -149
- package/eigen/README.md +0 -5
|
@@ -0,0 +1,1245 @@
|
|
|
1
|
+
// This file is part of Eigen, a lightweight C++ template library
|
|
2
|
+
// for linear algebra.
|
|
3
|
+
//
|
|
4
|
+
// Copyright (C) 2022 Intel Corporation
|
|
5
|
+
//
|
|
6
|
+
// This Source Code Form is subject to the terms of the Mozilla
|
|
7
|
+
// Public License v. 2.0. If a copy of the MPL was not distributed
|
|
8
|
+
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
9
|
+
|
|
10
|
+
#ifndef EIGEN_CORE_ARCH_AVX512_GEMM_KERNEL_H
|
|
11
|
+
#define EIGEN_CORE_ARCH_AVX512_GEMM_KERNEL_H
|
|
12
|
+
|
|
13
|
+
#if EIGEN_COMP_MSVC
|
|
14
|
+
#include <intrin.h>
|
|
15
|
+
#else
|
|
16
|
+
#include <x86intrin.h>
|
|
17
|
+
#endif
|
|
18
|
+
#include <immintrin.h>
|
|
19
|
+
#include <type_traits>
|
|
20
|
+
|
|
21
|
+
// IWYU pragma: private
|
|
22
|
+
#include "../../InternalHeaderCheck.h"
|
|
23
|
+
|
|
24
|
+
#if !defined(EIGEN_USE_AVX512_GEMM_KERNELS)
|
|
25
|
+
#define EIGEN_USE_AVX512_GEMM_KERNELS 1
|
|
26
|
+
#endif
|
|
27
|
+
|
|
28
|
+
#define SECOND_FETCH (32)
|
|
29
|
+
#if (EIGEN_COMP_GNUC_STRICT != 0) && !defined(EIGEN_ARCH_AVX512_GEMM_KERNEL_USE_LESS_A_REGS)
|
|
30
|
+
// Use less registers to load A elements to workaround compiler spills. Loose a
|
|
31
|
+
// bit of performance (less than ~2%).
|
|
32
|
+
#define EIGEN_ARCH_AVX512_GEMM_KERNEL_USE_LESS_A_REGS
|
|
33
|
+
#endif
|
|
34
|
+
|
|
35
|
+
namespace Eigen {
|
|
36
|
+
namespace internal {
|
|
37
|
+
|
|
38
|
+
template <typename Scalar, bool is_unit_inc>
|
|
39
|
+
class gemm_class {
|
|
40
|
+
using vec = typename packet_traits<Scalar>::type;
|
|
41
|
+
using vec_ymm = typename unpacket_traits<vec>::half;
|
|
42
|
+
using vec_xmm = typename unpacket_traits<vec_ymm>::half;
|
|
43
|
+
using umask_t = typename unpacket_traits<vec>::mask_t;
|
|
44
|
+
|
|
45
|
+
static constexpr bool is_f32 = sizeof(Scalar) == sizeof(float);
|
|
46
|
+
static constexpr bool is_f64 = sizeof(Scalar) == sizeof(double);
|
|
47
|
+
|
|
48
|
+
#ifndef EIGEN_ARCH_AVX512_GEMM_KERNEL_USE_LESS_A_REGS
|
|
49
|
+
static constexpr bool use_less_a_regs = !is_unit_inc;
|
|
50
|
+
#else
|
|
51
|
+
static constexpr bool use_less_a_regs = true;
|
|
52
|
+
#endif
|
|
53
|
+
#ifndef EIGEN_ARCH_AVX512_GEMM_KERNEL_USE_LESS_B_REGS
|
|
54
|
+
static constexpr bool use_less_b_regs = !is_unit_inc;
|
|
55
|
+
#else
|
|
56
|
+
static constexpr bool use_less_b_regs = true;
|
|
57
|
+
#endif
|
|
58
|
+
|
|
59
|
+
static constexpr int a_regs[] = {0, 1, 2, use_less_a_regs ? 0 : 3, use_less_a_regs ? 1 : 4, use_less_a_regs ? 2 : 5};
|
|
60
|
+
static constexpr int b_regs[] = {6, use_less_b_regs ? 6 : 7};
|
|
61
|
+
static constexpr int c_regs[] = {
|
|
62
|
+
8, 16, 24, 9, 17, 25, 10, 18, 26, 11, 19, 27, 12, 20, 28, 13, 21, 29, 14, 22, 30, 15, 23, 31,
|
|
63
|
+
};
|
|
64
|
+
|
|
65
|
+
static constexpr int alpha_load_reg = 0;
|
|
66
|
+
static constexpr int c_load_regs[] = {1, 2, 6};
|
|
67
|
+
|
|
68
|
+
static constexpr int a_shift = 128;
|
|
69
|
+
static constexpr int b_shift = 128;
|
|
70
|
+
|
|
71
|
+
static constexpr int nelems_in_cache_line = is_f32 ? 16 : 8;
|
|
72
|
+
static constexpr int a_prefetch_size = nelems_in_cache_line * 2;
|
|
73
|
+
static constexpr int b_prefetch_size = nelems_in_cache_line * 8;
|
|
74
|
+
|
|
75
|
+
vec zmm[32];
|
|
76
|
+
umask_t mask;
|
|
77
|
+
|
|
78
|
+
// gemm arguments.
|
|
79
|
+
Index m;
|
|
80
|
+
const Index n, k, ldc;
|
|
81
|
+
const Index inc;
|
|
82
|
+
const Scalar *alpha;
|
|
83
|
+
|
|
84
|
+
const Scalar *a, *b;
|
|
85
|
+
Scalar *c;
|
|
86
|
+
|
|
87
|
+
const bool is_alpha1;
|
|
88
|
+
const bool is_beta0;
|
|
89
|
+
|
|
90
|
+
const Index a_stride, b_stride;
|
|
91
|
+
const Index a_off, b_off;
|
|
92
|
+
|
|
93
|
+
// Prefetch (into all cache levels, T0 hint) the next chunk of A data.
// Subtracting a_shift undoes the fixed bias the kernel applies to A
// pointers; a_prefetch_size looks ahead past the current cache line.
EIGEN_ALWAYS_INLINE void prefetch_a(const Scalar *a_addr) {
  const Scalar *target = a_addr - a_shift + a_prefetch_size;
  _mm_prefetch(reinterpret_cast<const char *>(target), _MM_HINT_T0);
}
|
|
96
|
+
|
|
97
|
+
// Prefetch (into all cache levels, T0 hint) the next chunk of B data.
// Subtracting b_shift undoes the fixed bias the kernel applies to B
// pointers; b_prefetch_size looks ahead past the current cache line.
EIGEN_ALWAYS_INLINE void prefetch_b(const Scalar *b_addr) {
  const Scalar *target = b_addr - b_shift + b_prefetch_size;
  _mm_prefetch(reinterpret_cast<const char *>(target), _MM_HINT_T0);
}
|
|
100
|
+
|
|
101
|
+
// Prefetch with the T2 hint (kept only in the outer cache levels) for data
// needed further in the future; a_shift undoes the A-pointer bias.
EIGEN_ALWAYS_INLINE void prefetch_x(const Scalar *x_addr) {
  _mm_prefetch(reinterpret_cast<const char *>(x_addr - a_shift), _MM_HINT_T2);
}
|
|
102
|
+
|
|
103
|
+
// Prefetch an address of the C (output) matrix. When the PREFETCHW
// instruction is available, use it so the line is fetched with write
// intent; otherwise fall back to an ordinary T0 read prefetch.
EIGEN_ALWAYS_INLINE void prefetch_c(const Scalar *c_addr) {
#if defined(__PRFCHW__) && __PRFCHW__ == 1
  _m_prefetchw(const_cast<void *>(static_cast<const void *>(c_addr)));
#else
  _mm_prefetch(reinterpret_cast<const char *>(c_addr), _MM_HINT_T0);
#endif
}
|
|
110
|
+
|
|
111
|
+
template <int nelems>
|
|
112
|
+
EIGEN_ALWAYS_INLINE void a_load(vec &a_reg, const Scalar *a_addr) {
|
|
113
|
+
switch (nelems * sizeof(*a_addr) * 8) {
|
|
114
|
+
default:
|
|
115
|
+
case 512 * 3:
|
|
116
|
+
a_reg = ploadu<vec>(a_addr);
|
|
117
|
+
break;
|
|
118
|
+
case 512 * 2:
|
|
119
|
+
a_reg = ploadu<vec>(a_addr);
|
|
120
|
+
break;
|
|
121
|
+
case 512 * 1:
|
|
122
|
+
a_reg = ploadu<vec>(a_addr);
|
|
123
|
+
break;
|
|
124
|
+
case 256 * 1:
|
|
125
|
+
a_reg = preinterpret<vec>(_mm512_broadcast_f64x4(ploadu<Packet4d>(reinterpret_cast<const double *>(a_addr))));
|
|
126
|
+
break;
|
|
127
|
+
case 128 * 1:
|
|
128
|
+
a_reg = preinterpret<vec>(_mm512_broadcast_f32x4(ploadu<Packet4f>(reinterpret_cast<const float *>(a_addr))));
|
|
129
|
+
break;
|
|
130
|
+
case 64 * 1:
|
|
131
|
+
a_reg = preinterpret<vec>(pload1<Packet8d>(reinterpret_cast<const double *>(a_addr)));
|
|
132
|
+
break;
|
|
133
|
+
case 32 * 1:
|
|
134
|
+
a_reg = pload1<vec>(a_addr);
|
|
135
|
+
break;
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
// Broadcast one element of the B matrix into every lane of the register.
EIGEN_ALWAYS_INLINE void b_load(vec &b_reg, const Scalar *b_addr) {
  b_reg = pload1<vec>(b_addr);
}
|
|
140
|
+
|
|
141
|
+
// Store `nelems` elements of an accumulator back to the C matrix.
// Unit-stride C uses unaligned stores sized by the tail (full vector /
// 256-bit / 128-bit / 64-bit / scalar); strided C uses scatters, with the
// masked overloads for partial vectors (mask is set up in c_update).
template <int nelems>
EIGEN_ALWAYS_INLINE void c_store(Scalar *mem, vec &src) {
  if (is_unit_inc) {
    switch (nelems * sizeof(*mem) * 8) {
      default:
      case 512 * 3:
        pstoreu(mem, src);
        break;
      case 512 * 2:
        pstoreu(mem, src);
        break;
      case 512 * 1:
        pstoreu(mem, src);
        break;
      case 256 * 1:
        pstoreu(mem, preinterpret<vec_ymm>(src));
        break;
      case 128 * 1:
        pstoreu(mem, preinterpret<vec_xmm>(src));
        break;
      case 64 * 1:
        pstorel(mem, preinterpret<vec_xmm>(src));
        break;
      case 32 * 1:
        pstores(mem, preinterpret<vec_xmm>(src));
        break;
    }
  } else {
    switch (nelems * sizeof(*mem) * 8) {
      default:
      case 512 * 3:
        pscatter(mem, src, inc);
        break;
      case 512 * 2:
        pscatter(mem, src, inc);
        break;
      case 512 * 1:
        pscatter(mem, src, inc);
        break;
      case 256 * 1:
        pscatter(mem, src, inc, mask);
        break;
      case 128 * 1:
        pscatter(mem, src, inc, mask);
        break;
      case 64 * 1:
        pscatter(mem, src, inc, mask);
        break;
      case 32 * 1:
        pscatter(mem, src, inc, mask);
        break;
    }
  }
}
|
|
195
|
+
|
|
196
|
+
// dst = src + C(mem) for `nelems` elements — the beta==1, alpha==1 C-update
// path. Unit-stride C loads directly; strided C gathers through the scratch
// register `reg`. Sub-vector tails use narrowed (ymm/xmm) adds so lanes past
// the tail are never read.
template <int nelems>
EIGEN_ALWAYS_INLINE void vaddm(vec &dst, const Scalar *mem, vec &src, vec &reg) {
  if (is_unit_inc) {
    switch (nelems * sizeof(*mem) * 8) {
      default:
      case 512 * 3:
        dst = padd(src, ploadu<vec>(mem));
        break;
      case 512 * 2:
        dst = padd(src, ploadu<vec>(mem));
        break;
      case 512 * 1:
        dst = padd(src, ploadu<vec>(mem));
        break;
      case 256 * 1:
        dst = preinterpret<vec>(padd(preinterpret<vec_ymm>(src), ploadu<vec_ymm>(mem)));
        break;
      case 128 * 1:
        dst = preinterpret<vec>(padd(preinterpret<vec_xmm>(src), ploadu<vec_xmm>(mem)));
        break;
      case 64 * 1:
        dst = preinterpret<vec>(padd(preinterpret<vec_xmm>(src), ploadl<vec_xmm>(mem)));
        break;
      case 32 * 1:
        dst = preinterpret<vec>(padds(preinterpret<vec_xmm>(src), ploads<vec_xmm>(mem)));
        break;
    }
  } else {
    // Zero out scratch register so unused lanes of masked gathers are defined.
    reg = pzero(reg);

    switch (nelems * sizeof(*mem) * 8) {
      default:
      case 512 * 3:
        reg = pgather<Scalar, vec>(mem, inc);
        dst = padd(src, reg);
        break;
      case 512 * 2:
        reg = pgather<Scalar, vec>(mem, inc);
        dst = padd(src, reg);
        break;
      case 512 * 1:
        reg = pgather<Scalar, vec>(mem, inc);
        dst = padd(src, reg);
        break;
      case 256 * 1:
        reg = preinterpret<vec>(pgather<Scalar, vec_ymm>(mem, inc));
        dst = preinterpret<vec>(padd(preinterpret<vec_ymm>(src), preinterpret<vec_ymm>(reg)));
        break;
      case 128 * 1:
        reg = preinterpret<vec>(pgather<Scalar, vec_xmm>(mem, inc));
        dst = preinterpret<vec>(padd(preinterpret<vec_xmm>(src), preinterpret<vec_xmm>(reg)));
        break;
      case 64 * 1:
        if (is_f32) {
          // 2 floats: masked gather needed; for doubles a 64-bit load suffices.
          reg = pgather(reg, mem, inc, mask);
          dst = preinterpret<vec>(padd(preinterpret<vec_xmm>(src), preinterpret<vec_xmm>(reg)));
        } else {
          dst = preinterpret<vec>(padd(preinterpret<vec_xmm>(src), ploadl<vec_xmm>(mem)));
        }
        break;
      case 32 * 1:
        // Single element: stride is irrelevant, plain scalar load/add.
        dst = preinterpret<vec>(padds(preinterpret<vec_xmm>(src), ploads<vec_xmm>(mem)));
        break;
    }
  }
}
|
|
263
|
+
|
|
264
|
+
// dst += src1 * src2 (fused multiply-add) — the core accumulation step.
EIGEN_STRONG_INLINE void vfmadd(vec &dst, const vec &src1, const vec &src2) {
  dst = pmadd(src1, src2, dst);

#if (EIGEN_COMP_GNUC != 0) || (EIGEN_COMP_CLANG != 0)
  // Workaround register spills for gcc and clang: the empty asm ties the
  // operands to registers ("v" = vector register; "%" marks src1/src2 as
  // commutable), preventing the optimizer from rematerializing/spilling them.
  __asm__("#" : [dst] "+v"(dst) : [src1] "%v"(src1), [src2] "v"(src2));
#endif
}
|
|
272
|
+
|
|
273
|
+
// dst = scale * src + C(mem) for `nelems` elements — the beta==1,
// alpha!=1 C-update path. Same size/stride dispatch structure as vaddm, with
// the add replaced by an fma against the broadcast alpha in `scale`.
template <int nelems>
EIGEN_ALWAYS_INLINE void vfmaddm(vec &dst, const Scalar *mem, vec &src, vec &scale, vec &reg) {
  if (is_unit_inc) {
    switch (nelems * sizeof(*mem) * 8) {
      default:
      case 512 * 3:
        dst = pmadd(scale, src, ploadu<vec>(mem));
        break;
      case 512 * 2:
        dst = pmadd(scale, src, ploadu<vec>(mem));
        break;
      case 512 * 1:
        dst = pmadd(scale, src, ploadu<vec>(mem));
        break;
      case 256 * 1:
        dst =
            preinterpret<vec>(pmadd(preinterpret<vec_ymm>(scale), preinterpret<vec_ymm>(src), ploadu<vec_ymm>(mem)));
        break;
      case 128 * 1:
        dst =
            preinterpret<vec>(pmadd(preinterpret<vec_xmm>(scale), preinterpret<vec_xmm>(src), ploadu<vec_xmm>(mem)));
        break;
      case 64 * 1:
        dst =
            preinterpret<vec>(pmadd(preinterpret<vec_xmm>(scale), preinterpret<vec_xmm>(src), ploadl<vec_xmm>(mem)));
        break;
      case 32 * 1:
        dst =
            preinterpret<vec>(pmadds(preinterpret<vec_xmm>(scale), preinterpret<vec_xmm>(src), ploads<vec_xmm>(mem)));
        break;
    }
  } else {
    // Zero out scratch register so unused lanes of masked gathers are defined.
    reg = pzero(reg);

    switch (nelems * sizeof(*mem) * 8) {
      default:
      case 512 * 3:
        reg = pgather<Scalar, vec>(mem, inc);
        dst = pmadd(scale, src, reg);
        break;
      case 512 * 2:
        reg = pgather<Scalar, vec>(mem, inc);
        dst = pmadd(scale, src, reg);
        break;
      case 512 * 1:
        reg = pgather<Scalar, vec>(mem, inc);
        dst = pmadd(scale, src, reg);
        break;
      case 256 * 1:
        reg = preinterpret<vec>(pgather<Scalar, vec_ymm>(mem, inc));
        dst = preinterpret<vec>(
            pmadd(preinterpret<vec_ymm>(scale), preinterpret<vec_ymm>(src), preinterpret<vec_ymm>(reg)));
        break;
      case 128 * 1:
        reg = preinterpret<vec>(pgather<Scalar, vec_xmm>(mem, inc));
        dst = preinterpret<vec>(
            pmadd(preinterpret<vec_xmm>(scale), preinterpret<vec_xmm>(src), preinterpret<vec_xmm>(reg)));
        break;
      case 64 * 1:
        if (is_f32) {
          // 2 floats: masked gather; for doubles a 64-bit load suffices.
          reg = pgather(reg, mem, inc, mask);
          dst = preinterpret<vec>(
              pmadd(preinterpret<vec_xmm>(scale), preinterpret<vec_xmm>(src), preinterpret<vec_xmm>(reg)));
        } else {
          dst = preinterpret<vec>(
              pmadd(preinterpret<vec_xmm>(scale), preinterpret<vec_xmm>(src), ploadl<vec_xmm>(mem)));
        }
        break;
      case 32 * 1:
        // Single element: stride is irrelevant, plain scalar load/fma.
        dst =
            preinterpret<vec>(pmadds(preinterpret<vec_xmm>(scale), preinterpret<vec_xmm>(src), ploads<vec_xmm>(mem)));
        break;
    }
  }
}
|
|
349
|
+
|
|
350
|
+
// Recursion terminator for a_loads: indices ran past the (j, i) ranges.
template <int j, int endX, int i, int endY, int nelems>
EIGEN_ALWAYS_INLINE std::enable_if_t<(j > endX) || (i > endY)> a_loads(const Scalar *ao) {
  EIGEN_UNUSED_VARIABLE(ao);
}
|
|
354
|
+
|
|
355
|
+
// Compile-time unrolled double loop that preloads the first endX k-iterations
// of the A panel (endY vectors each) before entering the k-loop. Register
// selection (j % 2) matches the double-buffering scheme used by load_a/compute.
template <int j, int endX, int i, int endY, int nelems>
EIGEN_ALWAYS_INLINE std::enable_if_t<(j <= endX) && (i <= endY)> a_loads(const Scalar *ao) {
  if (j < endX) {
    if (i < endY) {
      auto &a_reg = zmm[a_regs[i + (j % 2) * 3]];
      const Scalar *a_addr = ao + nelems * j + nelems_in_cache_line * i - a_shift;
      a_load<nelems>(a_reg, a_addr);

      a_loads<j, endX, i + 1, endY, nelems>(ao);
    } else {
      // Inner (i) loop done: advance to the next k-iteration (j).
      a_loads<j + 1, endX, 0, endY, nelems>(ao);
    }
  }
}
|
|
369
|
+
|
|
370
|
+
// Recursion terminator for prefetch_cs: indices ran past the (un, i) ranges.
template <int un, int max_b_unroll, int i, int um_vecs, int a_unroll, int b_unroll>
EIGEN_ALWAYS_INLINE std::enable_if_t<(un > max_b_unroll) || (i > um_vecs)> prefetch_cs(const Scalar *co1,
                                                                                      const Scalar *co2) {
  EIGEN_UNUSED_VARIABLE(co1);
  EIGEN_UNUSED_VARIABLE(co2);
}
|
|
376
|
+
|
|
377
|
+
/* C prefetch loop structure.
 * for (int un = 0; un < 8; un++) {
 *   if (b_unroll >= un + 1) {
 *     if (un == 4) co2 = co1 + 4 * ldc;
 *
 *     for (int i = 0; i < um_vecs; i++) {
 *       Scalar *co = (un + 1 <= 4) ? co1 : co2;
 *       auto co_off = (un % 4) * ldc + a_unroll - 1 + i * nelems_in_cache_line * sizeof *co;
 *       prefetch_c(co + co_off);
 *     }
 *   }
 * }
 */

// Compile-time unrolled version of the loop nest above: prefetches the whole
// a_unroll x b_unroll C tile before the k-loop. Columns 0-3 are addressed
// from co1, columns 4-7 from co2 (set once when un reaches 4).
template <int un, int max_b_unroll, int i, int um_vecs, int a_unroll, int b_unroll>
EIGEN_ALWAYS_INLINE std::enable_if_t<(un <= max_b_unroll) && (i <= um_vecs)> prefetch_cs(Scalar *&co1, Scalar *&co2) {
  if (un < max_b_unroll) {
    if (b_unroll >= un + 1) {
      if (un == 4 && i == 0) co2 = co1 + 4 * ldc;

      if (i < um_vecs) {
        Scalar *co = (un + 1 <= 4) ? co1 : co2;
        auto co_off = (un % 4) * ldc + a_unroll - 1 + i * nelems_in_cache_line * sizeof *co;
        prefetch_c(co + co_off);

        prefetch_cs<un, max_b_unroll, i + 1, um_vecs, a_unroll, b_unroll>(co1, co2);
      } else {
        // Inner (i) loop done: advance to the next column (un).
        prefetch_cs<un + 1, max_b_unroll, 0, um_vecs, a_unroll, b_unroll>(co1, co2);
      }

    } else {
      // Column not active for this b_unroll: skip to the next one.
      prefetch_cs<un + 1, max_b_unroll, 0, um_vecs, a_unroll, b_unroll>(co1, co2);
    }
  }
}
|
|
412
|
+
|
|
413
|
+
// load_c
// Recursion terminator for scale_load_c: vector index i ran past um_vecs.
template <int i, int um_vecs, int idx, int nelems>
EIGEN_ALWAYS_INLINE std::enable_if_t<(i > um_vecs)> scale_load_c(const Scalar *cox, vec &alpha_reg) {
  EIGEN_UNUSED_VARIABLE(cox);
  EIGEN_UNUSED_VARIABLE(alpha_reg);
}
|
|
419
|
+
|
|
420
|
+
// Combine accumulators of C column `idx` with the existing C data, one vector
// at a time:
//   beta==1, alpha==1 : c_reg += C          (vaddm)
//   beta==1, alpha!=1 : c_reg = alpha*c_reg + C  (vfmaddm)
//   beta==0, alpha!=1 : c_reg *= alpha
//   beta==0, alpha==1 : nothing — accumulator already holds the result.
template <int i, int um_vecs, int idx, int nelems>
EIGEN_ALWAYS_INLINE std::enable_if_t<(i <= um_vecs)> scale_load_c(const Scalar *cox, vec &alpha_reg) {
  if (i < um_vecs) {
    auto &c_reg = zmm[c_regs[i + idx * 3]];
    auto &c_load_reg = zmm[c_load_regs[i % 3]];
    auto c_mem = cox;
    // Step to the i-th vector of the column, accounting for C's stride.
    if (is_unit_inc)
      c_mem += i * nelems_in_cache_line;
    else
      c_mem += i * nelems_in_cache_line * inc;

    if (!is_beta0 && is_alpha1)
      vaddm<nelems>(c_reg, c_mem, c_reg, c_load_reg);
    else if (!is_beta0 && !is_alpha1)
      vfmaddm<nelems>(c_reg, c_mem, c_reg, alpha_reg, c_load_reg);
    else if (is_beta0 && !is_alpha1)
      c_reg = pmul(alpha_reg, c_reg);

    scale_load_c<i + 1, um_vecs, idx, nelems>(cox, alpha_reg);
  }
}
|
|
441
|
+
|
|
442
|
+
// store_c
// Recursion terminator for write_c: vector index i ran past um_vecs.
template <int i, int um_vecs, int idx, int nelems>
EIGEN_ALWAYS_INLINE std::enable_if_t<(i > um_vecs)> write_c(Scalar *cox) {
  EIGEN_UNUSED_VARIABLE(cox);
}
|
|
447
|
+
|
|
448
|
+
// Store the accumulators of C column `idx` back to memory, one vector at a
// time, and zero each accumulator afterwards so it is ready for the next tile.
template <int i, int um_vecs, int idx, int nelems>
EIGEN_ALWAYS_INLINE std::enable_if_t<(i <= um_vecs)> write_c(Scalar *cox) {
  if (i < um_vecs) {
    auto &c_reg = zmm[c_regs[i + idx * 3]];
    auto c_mem = cox;
    // Step to the i-th vector of the column, accounting for C's stride.
    if (is_unit_inc)
      c_mem += i * nelems_in_cache_line;
    else
      c_mem += i * nelems_in_cache_line * inc;

    c_store<nelems>(c_mem, c_reg);
    c_reg = pzero(c_reg);  // reset accumulator for the next tile

    write_c<i + 1, um_vecs, idx, nelems>(cox);
  }
}
|
|
464
|
+
|
|
465
|
+
/* C update loop structure.
 * co2 = co1 + ldc;
 *
 * auto &alpha_reg = zmm[alpha_load_reg];
 * if (!is_alpha1) alpha_reg = pload1<vec>(alpha);
 *
 * int idx = 0;
 * for (pow = 1; pow <= 8; pow <<= 1) {
 *
 *   if (b_unroll >= pow) {
 *     for (count = 1; count < (pow + 1) / 2 + 1; count++) {
 *       if (pow >= 4) co2 += ldc;
 *
 *       const Scalar *cox = (idx == 0) ? co1 : co2;
 *
 *       const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
 *       scale_load_c<0, um_vecs, idx, a_unroll>(cox, alpha_reg);
 *       write_c<0, um_vecs, idx, a_unroll>(cox);
 *
 *       idx++;
 *     }
 *   }
 * }
 *
 * if (b_unroll == 1)
 *   co1 += ldc;
 * else
 *   co1 = co2 + ldc;
 */

// One `count` iteration of the loop above: scale/merge and store C column
// `idx`, advancing cox by a row pitch first for the later power groups.
template <int pow, int a_unroll, int idx>
EIGEN_ALWAYS_INLINE void c_update_1count(Scalar *&cox) {
  if (pow >= 4) cox += ldc;

  const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
  auto &alpha_reg = zmm[alpha_load_reg];

  scale_load_c<0, um_vecs, idx, a_unroll>(cox, alpha_reg);
  write_c<0, um_vecs, idx, a_unroll>(cox);
}
|
|
505
|
+
|
|
506
|
+
// One power group of the C update: processes (pow + 1) / 2 columns starting
// at column index pow / 2. Column group 0 writes through co1, all later
// groups through co2.
template <int pow, int a_unroll>
EIGEN_ALWAYS_INLINE void c_update_1pow(Scalar *&co1, Scalar *&co2) {
  constexpr int idx = pow / 2;
  Scalar *&cox = idx == 0 ? co1 : co2;

  constexpr int max_count = (pow + 1) / 2;
  static_assert(max_count <= 4, "Unsupported max_count.");

  // Compile-time unrolled count-loop (max_count is 1, 1, 2 or 4).
  if (1 <= max_count) c_update_1count<pow, a_unroll, idx + 0>(cox);
  if (2 <= max_count) c_update_1count<pow, a_unroll, idx + 1>(cox);
  if (3 <= max_count) c_update_1count<pow, a_unroll, idx + 2>(cox);
  if (4 <= max_count) c_update_1count<pow, a_unroll, idx + 3>(cox);
}
|
|
519
|
+
|
|
520
|
+
// Write the full a_unroll x b_unroll accumulator tile back into C, applying
// alpha/beta, then advance co1 to the start of the next column block.
template <int max_b_unroll, int a_unroll, int b_unroll>
EIGEN_ALWAYS_INLINE void c_update(Scalar *&co1, Scalar *&co2) {
  auto &alpha_reg = zmm[alpha_load_reg];

  co2 = co1 + ldc;
  // Broadcast alpha once for the whole tile (only needed when alpha != 1).
  if (!is_alpha1) alpha_reg = pload1<vec>(alpha);
  // For strided C with a partial row tile, build the lane mask used by the
  // masked gathers/scatters in vaddm/vfmaddm/c_store.
  if (!is_unit_inc && a_unroll < nelems_in_cache_line) mask = static_cast<umask_t>((1ull << a_unroll) - 1);

  static_assert(max_b_unroll <= 8, "Unsupported max_b_unroll");

  // Process columns in power-of-two groups: 1, 1, 2, 4 columns.
  if (1 <= max_b_unroll && 1 <= b_unroll) c_update_1pow<1, a_unroll>(co1, co2);
  if (2 <= max_b_unroll && 2 <= b_unroll) c_update_1pow<2, a_unroll>(co1, co2);
  if (4 <= max_b_unroll && 4 <= b_unroll) c_update_1pow<4, a_unroll>(co1, co2);
  if (8 <= max_b_unroll && 8 <= b_unroll) c_update_1pow<8, a_unroll>(co1, co2);

  // Advance co1 past the columns just written.
  if (b_unroll == 1)
    co1 += ldc;
  else
    co1 = co2 + ldc;
}
|
|
540
|
+
|
|
541
|
+
// compute
// Recursion terminator for compute: vector index um ran past um_vecs.
template <int um, int um_vecs, int idx, int uk, bool fetch_x, bool ktail>
EIGEN_ALWAYS_INLINE std::enable_if_t<(um > um_vecs)> compute(const Scalar *ao, const Scalar *bo, int &fetchA_idx,
                                                             int &fetchB_idx) {
  EIGEN_UNUSED_VARIABLE(ao);
  EIGEN_UNUSED_VARIABLE(bo);
  EIGEN_UNUSED_VARIABLE(fetchA_idx);
  EIGEN_UNUSED_VARIABLE(fetchB_idx);
}
|
|
551
|
+
|
|
552
|
+
// Core FMA step: for each A vector `um` of column `idx`, accumulate
// c += a * b, interleaving A/B prefetches at fixed (um, idx, uk) slots so
// they are spread evenly across the inner kernel. The idx 0/3/6 and idx 1
// trigger points appear tuned for the unroll schedule — confirm against the
// kernel's scheduling notes before changing them.
template <int um, int um_vecs, int idx, int uk, bool fetch_x, bool ktail>
EIGEN_ALWAYS_INLINE std::enable_if_t<(um <= um_vecs)> compute(const Scalar *ao, const Scalar *bo, int &fetchA_idx,
                                                              int &fetchB_idx) {
  if (um < um_vecs) {
    auto &c_reg = zmm[c_regs[um + idx * 3]];
    auto &a_reg = zmm[a_regs[um + (uk % 2) * 3]];

    vfmadd(c_reg, a_reg, b_reg);

    // Prefetch the next A cache line at selected slots (first vector only).
    if (!fetch_x && um == 0 &&
        (((idx == 0 || idx == 6) && (uk % 2 == 0 || is_f64 || ktail)) ||
         (idx == 3 && (uk % 2 == 1 || is_f64 || ktail)))) {
      prefetch_a(ao + nelems_in_cache_line * fetchA_idx);
      fetchA_idx++;
    }

    // Prefetch the next B cache line at its own slot.
    if (um == 0 && idx == 1 && (uk % 2 == 0 || is_f64 || ktail)) {
      prefetch_b(bo + nelems_in_cache_line * fetchB_idx);
      fetchB_idx++;
    }

    compute<um + 1, um_vecs, idx, uk, fetch_x, ktail>(ao, bo, fetchA_idx, fetchB_idx, b_reg);
  }
}
|
|
576
|
+
|
|
577
|
+
// load_a
// Recursion terminator for load_a: vector index um ran past um_vecs.
template <int um, int um_vecs, int uk, int nelems, bool ktail>
EIGEN_ALWAYS_INLINE std::enable_if_t<(um > um_vecs)> load_a(const Scalar *ao) {
  EIGEN_UNUSED_VARIABLE(ao);
}
|
|
582
|
+
|
|
583
|
+
// Preload the A vectors for a future k-iteration into the double-buffered
// register set ((uk % 2) selects the buffer). The look-ahead distance shrinks
// for the k-tail (ktail) and when running with fewer A registers, so loads
// never run past the packed buffer.
template <int um, int um_vecs, int uk, int nelems, bool ktail>
EIGEN_ALWAYS_INLINE std::enable_if_t<(um <= um_vecs)> load_a(const Scalar *ao) {
  if (um < um_vecs) {
    auto &a_reg = zmm[a_regs[um + (uk % 2) * 3]];
    const Scalar *a_addr = ao + nelems * (1 + !ktail * !use_less_a_regs + uk) + nelems_in_cache_line * um - a_shift;
    a_load<nelems>(a_reg, a_addr);

    load_a<um + 1, um_vecs, uk, nelems, ktail>(ao);
  }
}
|
|
593
|
+
// Recursion terminator for innerkernel_1pow: count ran past (pow + 1) / 2.
template <int uk, int pow, int count, int um_vecs, int b_unroll, bool ktail, bool fetch_x, bool c_fetch>
EIGEN_ALWAYS_INLINE std::enable_if_t<(count > (pow + 1) / 2)> innerkernel_1pow(const Scalar *&aa,
                                                                               const Scalar *const &ao,
                                                                               const Scalar *const &bo, Scalar *&co2,
                                                                               int &fetchA_idx, int &fetchB_idx) {
  EIGEN_UNUSED_VARIABLE(aa);
  EIGEN_UNUSED_VARIABLE(ao);
  EIGEN_UNUSED_VARIABLE(bo);
  EIGEN_UNUSED_VARIABLE(co2);
  EIGEN_UNUSED_VARIABLE(fetchA_idx);
  EIGEN_UNUSED_VARIABLE(fetchB_idx);
}
|
|
605
|
+
|
|
606
|
+
// One power group of the inner kernel for k-iteration `uk`: handles columns
// idx = pow/2 .. pow/2 + (pow+1)/2 - 1. For each column it runs the FMA sweep
// (compute) and then reloads the B register with the element needed two
// columns later (double-buffered via b_regs[idx % 2]). After the count-loop,
// the pow == 2 instantiation optionally prefetches C.
template <int uk, int pow, int count, int um_vecs, int b_unroll, bool ktail, bool fetch_x, bool c_fetch>
EIGEN_ALWAYS_INLINE std::enable_if_t<(count <= (pow + 1) / 2)> innerkernel_1pow(const Scalar *&aa,
                                                                                const Scalar *const &ao,
                                                                                const Scalar *const &bo, Scalar *&co2,
                                                                                int &fetchA_idx, int &fetchB_idx) {
  const int idx = (pow / 2) + count;

  if (count < (pow + 1) / 2) {
    auto &b_reg = zmm[b_regs[idx % 2]];

    // Look-ahead prefetch of the next A panel, issued once per uk == 3 pass.
    if (fetch_x && uk == 3 && idx == 0) prefetch_x(aa);
    if (fetch_x && uk == 3 && idx == 4) aa += 8;

    if (b_unroll >= pow) {
      compute<0, um_vecs, idx, uk, fetch_x, ktail>(ao, bo, fetchA_idx, fetchB_idx, b_reg);

      const Scalar *b_addr = bo + b_unroll * uk + idx + 1 + (b_unroll > 1) * !use_less_b_regs - b_shift;
      b_load(b_reg, b_addr);
    }

    // Go to the next count.
    innerkernel_1pow<uk, pow, count + 1, um_vecs, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx,
                                                                                    fetchB_idx);

  } else {
    // Maybe prefetch C data after count-loop.
    if (pow == 2 && c_fetch) {
      if (uk % 3 == 0 && uk > 0) {
        co2 += ldc;
      } else {
        prefetch_c(co2 + (uk % 3) * nelems_in_cache_line);
      }
    }
  }
}
|
|
641
|
+
|
|
642
|
+
// One k-iteration of the inner kernel: runs the power groups (1, 2, 4, 8
// columns) for this uk, then preloads A for a later iteration unless
// explicitly skipped on the very last one.
template <int uk, int max_b_unroll, int a_unroll, int b_unroll, bool ktail, bool fetch_x, bool c_fetch,
          bool no_a_preload = false>
EIGEN_ALWAYS_INLINE void innerkernel_1uk(const Scalar *&aa, const Scalar *const &ao, const Scalar *const &bo,
                                         Scalar *&co2, int &fetchA_idx, int &fetchB_idx) {
  const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);

  if (max_b_unroll >= 1)
    innerkernel_1pow<uk, 1, 0, um_vecs, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx, fetchB_idx);
  if (max_b_unroll >= 2)
    innerkernel_1pow<uk, 2, 0, um_vecs, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx, fetchB_idx);
  if (max_b_unroll >= 4)
    innerkernel_1pow<uk, 4, 0, um_vecs, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx, fetchB_idx);
  if (max_b_unroll >= 8)
    innerkernel_1pow<uk, 8, 0, um_vecs, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx, fetchB_idx);

  // Load A after pow-loop. Skip this at the end to prevent running over the buffer
  if (!no_a_preload) load_a<0, um_vecs, uk, a_unroll, ktail>(ao);
}
|
|
660
|
+
|
|
661
|
+
/* Inner kernel loop structure.
 * for (int uk = 0; uk < kfactor; uk++) {
 *   int idx = 0;
 *
 *   for (pow = 1; pow < max_b_unroll << 1; pow <<= 1) {
 *     for (int count = 0; count < (pow + 1) / 2; count++) {
 *       auto &b_reg = zmm[b_regs[idx % 2]];
 *
 *       if (fetch_x && uk == 3 && idx == 0) prefetch_x(aa);
 *       if (fetch_x && uk == 3 && idx == 4) aa += 8;
 *
 *       if (b_unroll >= pow) {
 *         compute<0, um_vecs, idx, uk, fetchx, ktail>(ao, bo, fetchA_idx, fetchB_idx, b_reg);
 *
 *         const Scalar *b_addr = bo + b_unroll * uk + idx + 1 + (b_unroll > 1) - b_shift ;
 *         b_load(b_reg, b_addr);
 *       }
 *       idx++;
 *     }
 *
 *     Maybe prefetch C data.
 *     if (pow == 2 && c_fetch) {
 *       if (uk % 3 == 0 && uk > 0) {
 *         co2 += ldc;
 *       } else {
 *         prefetch_c(co2 + (uk % 3) * nelems_in_cache_line);
 *       }
 *     }
 *   }
 *
 *   Load A.
 *   load_a<0, um_vecs, uk, ktail, a_unroll>(ao);
 * }
 *
 * Advance A/B pointers after uk-loop.
 * ao += a_unroll * kfactor;
 * bo += b_unroll * kfactor;
 */

// Run k_factor k-iterations of the inner kernel (compile-time unrolled uk
// loop, max 4), then advance the packed A/B pointers past the consumed data.
template <int a_unroll, int b_unroll, int k_factor, int max_b_unroll, int max_k_factor, bool c_fetch,
          bool no_a_preload = false>
EIGEN_ALWAYS_INLINE void innerkernel(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co2) {
  int fetchA_idx = 0;
  int fetchB_idx = 0;

  // Only the fully unrolled body issues the look-ahead A prefetches;
  // the k-tail (k_factor == 1) may also skip the final A preload.
  const bool fetch_x = k_factor == max_k_factor;
  const bool ktail = k_factor == 1;

  static_assert(k_factor <= 4 && k_factor > 0, "innerkernel maximum k_factor supported is 4");
  static_assert(no_a_preload == false || (no_a_preload == true && k_factor == 1),
                "skipping a preload only allowed when k unroll is 1");

  if (k_factor > 0)
    innerkernel_1uk<0, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(
        aa, ao, bo, co2, fetchA_idx, fetchB_idx);
  if (k_factor > 1)
    innerkernel_1uk<1, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(
        aa, ao, bo, co2, fetchA_idx, fetchB_idx);
  if (k_factor > 2)
    innerkernel_1uk<2, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(
        aa, ao, bo, co2, fetchA_idx, fetchB_idx);
  if (k_factor > 3)
    innerkernel_1uk<3, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(
        aa, ao, bo, co2, fetchA_idx, fetchB_idx);

  // Advance A/B pointers after uk-loop.
  ao += a_unroll * k_factor;
  bo += b_unroll * k_factor;
}
|
|
730
|
+
|
|
731
|
+
// Full k-loop for one a_unroll x b_unroll tile of C: preload A/B registers,
// prefetch the C tile, run the 4x-unrolled main loop plus remainder, then
// merge the accumulators into C.
template <int a_unroll, int b_unroll, int max_b_unroll>
EIGEN_ALWAYS_INLINE void kloop(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) {
  const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
  // Preload A for the first one or two k-iterations (two when double-buffering).
  if (!use_less_a_regs && k > 1)
    a_loads<0, 2, 0, um_vecs, a_unroll>(ao);
  else
    a_loads<0, 1, 0, um_vecs, a_unroll>(ao);

  b_load(zmm[b_regs[0]], bo - b_shift + 0);
  if (!use_less_b_regs) b_load(zmm[b_regs[1]], bo - b_shift + 1);

#ifndef SECOND_FETCH
  // Prefetch the whole C tile up front when the deferred scheme is disabled.
  prefetch_cs<0, max_b_unroll, 0, um_vecs, a_unroll, b_unroll>(co1, co2);
#endif  // SECOND_FETCH

  // Unrolling k-loop by a factor of 4.
  const int max_k_factor = 4;
  Index kRem = k % max_k_factor;
  Index k_ = k - kRem;
  // Keep one full unrolled group in the remainder so the final iterations can
  // run with the reduced-lookahead (ktail) kernel.
  if (k_ >= max_k_factor) {
    k_ -= max_k_factor;
    kRem += max_k_factor;
  }
  Index loop_count = k_ / max_k_factor;

  if (loop_count > 0) {
#ifdef SECOND_FETCH
    loop_count -= SECOND_FETCH;
#endif
    while (loop_count > 0) {
      innerkernel<a_unroll, b_unroll, max_k_factor, max_b_unroll, max_k_factor, 0>(aa, ao, bo, co2);
      loop_count--;
    }
#ifdef SECOND_FETCH
    // Deferred C prefetch: interleave C prefetches (c_fetch = 1) into the
    // last SECOND_FETCH unrolled groups instead of prefetching up front.
    co2 = co1 + nelems_in_cache_line - 1;

    loop_count += b_unroll;
    while (loop_count > 0) {
      innerkernel<a_unroll, b_unroll, max_k_factor, max_b_unroll, max_k_factor, 1>(aa, ao, bo, co2);
      loop_count--;
    }

    loop_count += SECOND_FETCH - b_unroll;
    while (loop_count > 0) {
      innerkernel<a_unroll, b_unroll, max_k_factor, max_b_unroll, max_k_factor, 0>(aa, ao, bo, co2);
      loop_count--;
    }
#endif
  }

  // k-loop remainder handling.
  loop_count = kRem;
  while (loop_count > 1) {
    innerkernel<a_unroll, b_unroll, 1, max_b_unroll, max_k_factor, 0>(aa, ao, bo, co2);
    loop_count--;
  }
  // Last iteration skips the A preload so we never read past the packed buffer.
  if (loop_count > 0) {
    innerkernel<a_unroll, b_unroll, 1, max_b_unroll, max_k_factor, 0, true>(aa, ao, bo, co2);
  }

  // Update C matrix.
  c_update<max_b_unroll, a_unroll, b_unroll>(co1, co2);
}
|
|
794
|
+
|
|
795
|
+
// One step of the n-loop: position the packed A/B pointers for this column
// block, run the k-loop, then advance B to the next panel and the A-prefetch
// pointer past the lines already fetched.
template <int a_unroll, int b_unroll, int max_b_unroll>
EIGEN_ALWAYS_INLINE void nloop(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) {
  // Set A matrix pointer.
  ao = a + a_off * a_unroll;

  // Set B matrix pointer if needed.
  bo += b_unroll * b_off;

  kloop<a_unroll, b_unroll, max_b_unroll>(aa, ao, bo, co1, co2);

  // Advance B matrix pointer if needed.
  bo += b_unroll * (b_stride - k - b_off);

  // Advance prefetch A pointer.
  aa += 16;
}
|
|
811
|
+
|
|
812
|
+
// One step of the m-loop: handle a_unroll rows of C across all n columns —
// main n-loop in blocks of max_b_unroll plus n remainders — then advance the
// A panel pointer for the next row block.
template <int a_unroll, int max_a_unroll, int max_b_unroll>
EIGEN_ALWAYS_INLINE void mloop(const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) {
  // Set prefetch A pointers.
  const Scalar *aa = a + a_unroll * a_stride;

  // Set C matrix pointers.
  co1 = c;
  if (a_unroll >= max_a_unroll) co2 = c + 2 * ldc;
  // Advance the C base pointer by this row block (stride-aware).
  if (is_unit_inc)
    c += a_unroll;
  else
    c += a_unroll * inc;

  // Set B matrix pointer.
  bo = b;

  // Main n-loop.
  for (Index i = n / max_b_unroll; i > 0; i--) nloop<a_unroll, max_b_unroll, max_b_unroll>(aa, ao, bo, co1, co2);

  // n-remainders.
  if (n & 4 && max_b_unroll > 4) nloop<a_unroll, 4, max_b_unroll>(aa, ao, bo, co1, co2);
#if 0
  if (n & 2 && max_b_unroll > 2) nloop<a_unroll, 2, max_b_unroll>(aa, ao, bo, co1, co2);
  if (n & 1 && max_b_unroll > 1) nloop<a_unroll, 1, max_b_unroll>(aa, ao, bo, co1, co2);
#else
  // Copy kernels don't support tails of n = 2 for single/double precision.
  // Loop over ones.
  int n_rem = 2 * ((n & 2) != 0) + 1 * ((n & 1) != 0);
  while (n_rem > 0) {
    nloop<a_unroll, 1, max_b_unroll>(aa, ao, bo, co1, co2);
    n_rem--;
  }
#endif

  // Advance A matrix pointer.
  a = ao + a_unroll * (a_stride - k - a_off);
}
|
|
849
|
+
|
|
850
|
+
 public:
  // Compute kernel unrolling C matrix by max_a_unroll x max_b_unroll.
  // Drives the whole GEMM: main m-loop in blocks of max_a_unroll, then
  // power-of-two m remainders down to single rows.
  template <int max_a_unroll, int max_b_unroll>
  EIGEN_ALWAYS_INLINE void compute_kern() {
    // Pre-bias the packed pointers; the per-access arithmetic subtracts
    // a_shift/b_shift back out (see prefetch_a/b, a_load, b_load).
    a -= -a_shift;
    b -= -b_shift;

    const Scalar *ao = nullptr;
    const Scalar *bo = nullptr;
    Scalar *co1 = nullptr;
    Scalar *co2 = nullptr;

    // Main m-loop.
    for (; m >= max_a_unroll; m -= max_a_unroll) mloop<max_a_unroll, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);

    // m-remainders.
    if (m & 32 && max_a_unroll > 32) mloop<32, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
    if (m & 16 && max_a_unroll > 16) mloop<16, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
    if (m & 8 && max_a_unroll > 8) mloop<8, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
    if (m & 4 && max_a_unroll > 4) mloop<4, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
    if (m & 2 && max_a_unroll > 2 && is_f64) mloop<2, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
    if (m & 1 && max_a_unroll > 1 && is_f64) mloop<1, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);

    // Copy kernels don't support tails of m = 2 for single precision.
    // Loop over ones.
    if (is_f32) {
      int m_rem = 2 * ((m & 2) != 0) + 1 * ((m & 1) != 0);
      while (m_rem > 0) {
        mloop<1, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
        m_rem--;
      }
    }
  }
|
|
883
|
+
|
|
884
|
+
// Construct the kernel state: problem sizes (m, n, k), C leading dimension
// and element stride (ldc, inc), alpha pointer, packed A/B panels, C pointer,
// alpha/beta flags, packed-panel strides, and panel start offsets.
gemm_class(Index m_, Index n_, Index k_, Index ldc_, Index inc_, const Scalar *alpha_, const Scalar *a_,
           const Scalar *b_, Scalar *c_, bool is_alpha1_, bool is_beta0_, Index a_stride_, Index b_stride_,
           Index a_off_, Index b_off_)
    : m(m_),
      n(n_),
      k(k_),
      ldc(ldc_),
      inc(inc_),
      alpha(alpha_),
      a(a_),
      b(b_),
      c(c_),
      is_alpha1(is_alpha1_),
      is_beta0(is_beta0_),
      a_stride(a_stride_),
      b_stride(b_stride_),
      a_off(a_off_),
      b_off(b_off_) {
  // Zero out all accumulation registers (zmm[8..31]; the lower entries are
  // used for A/B/scratch and loaded before first use).
  zmm[8] = pzero(zmm[8]);
  zmm[9] = pzero(zmm[9]);
  zmm[10] = pzero(zmm[10]);
  zmm[11] = pzero(zmm[11]);
  zmm[12] = pzero(zmm[12]);
  zmm[13] = pzero(zmm[13]);
  zmm[14] = pzero(zmm[14]);
  zmm[15] = pzero(zmm[15]);
  zmm[16] = pzero(zmm[16]);
  zmm[17] = pzero(zmm[17]);
  zmm[18] = pzero(zmm[18]);
  zmm[19] = pzero(zmm[19]);
  zmm[20] = pzero(zmm[20]);
  zmm[21] = pzero(zmm[21]);
  zmm[22] = pzero(zmm[22]);
  zmm[23] = pzero(zmm[23]);
  zmm[24] = pzero(zmm[24]);
  zmm[25] = pzero(zmm[25]);
  zmm[26] = pzero(zmm[26]);
  zmm[27] = pzero(zmm[27]);
  zmm[28] = pzero(zmm[28]);
  zmm[29] = pzero(zmm[29]);
  zmm[30] = pzero(zmm[30]);
  zmm[31] = pzero(zmm[31]);
}
|
|
928
|
+
};
|
|
929
|
+
|
|
930
|
+
// Compute kernel with max unroll support of:
|
|
931
|
+
// Single precision:
|
|
932
|
+
// max_a_unroll: 48, 32, 16, 8, 4, 2, 1
|
|
933
|
+
// max_b_unroll: 8, 4, 2, 1
|
|
934
|
+
// Double precision:
|
|
935
|
+
// max_a_unroll: 24, 16, 8, 4, 2, 1
|
|
936
|
+
// max_b_unroll: 8, 4, 2, 1
|
|
937
|
+
template <typename Scalar, int max_a_unroll, int max_b_unroll, bool is_alpha1, bool is_beta0, bool is_unit_inc>
|
|
938
|
+
EIGEN_DONT_INLINE void gemm_kern_avx512(Index m, Index n, Index k, Scalar *alpha, const Scalar *a, const Scalar *b,
|
|
939
|
+
Scalar *c, Index ldc, Index inc = 1, Index a_stride = -1, Index b_stride = -1,
|
|
940
|
+
Index a_off = 0, Index b_off = 0) {
|
|
941
|
+
if (a_stride == -1) a_stride = k;
|
|
942
|
+
if (b_stride == -1) b_stride = k;
|
|
943
|
+
|
|
944
|
+
gemm_class<Scalar, is_unit_inc> g(m, n, k, ldc, inc, alpha, a, b, c, is_alpha1, is_beta0, a_stride, b_stride, a_off,
|
|
945
|
+
b_off);
|
|
946
|
+
g.template compute_kern<max_a_unroll, max_b_unroll>();
|
|
947
|
+
}
|
|
948
|
+
|
|
949
|
+
// Template specializations of GEBP kernels with nr = 8.
|
|
950
|
+
#if EIGEN_USE_AVX512_GEMM_KERNELS
|
|
951
|
+
// AVX512 specialization of the single-precision GEBP traits: widens the
// rhs panel width (nr) from the generic 4 to 8 when vectorization is
// available, so the nr == 8 packing and kernel specializations below are
// selected.
template <bool ConjLhs_, bool ConjRhs_, int PacketSize_>
class gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Target, PacketSize_>
    : public gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_> {
  using Base = gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_>;

 public:
  // 8 rhs columns per panel when packets are usable; generic 4 otherwise.
  enum { nr = Base::Vectorizable ? 8 : 4 };
};
|
|
959
|
+
|
|
960
|
+
// AVX512 specialization of the double-precision GEBP traits: same as the
// float specialization above — nr == 8 when vectorization is available so
// the nr == 8 packing and kernel specializations are selected.
template <bool ConjLhs_, bool ConjRhs_, int PacketSize_>
class gebp_traits<double, double, ConjLhs_, ConjRhs_, Architecture::Target, PacketSize_>
    : public gebp_traits<double, double, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_> {
  using Base = gebp_traits<double, double, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_>;

 public:
  // 8 rhs columns per panel when packets are usable; generic 4 otherwise.
  enum { nr = Base::Vectorizable ? 8 : 4 };
};
|
|
968
|
+
|
|
969
|
+
// Packs a column-major rhs block into blockB with an interleave factor of
// nr == 8 (out-of-line definition below).  stride/offset implement
// panel-mode padding around each packed column group.
template <typename Scalar, typename Index, typename DataMapper, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, DataMapper, 8, ColMajor, Conjugate, PanelMode> {
  typedef typename packet_traits<Scalar>::type Packet;
  typedef typename DataMapper::LinearMapper LinearMapper;
  enum { PacketSize = packet_traits<Scalar>::size };
  EIGEN_DONT_INLINE void operator()(Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride = 0,
                                    Index offset = 0);
};
|
|
977
|
+
|
|
978
|
+
// Packs `depth` rows of `cols` rhs columns (column-major storage) into
// blockB so the GEBP kernel can stream them: columns are interleaved in
// groups of 8, then 4, then 1, and each stored value is conjugated when
// Conjugate is set for a complex Scalar.  `count` tracks the running write
// position in blockB throughout.
template <typename Scalar, typename Index, typename DataMapper, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, 8, ColMajor, Conjugate, PanelMode>::operator()(
    Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride, Index offset) {
  constexpr int nr = 8;
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
  EIGEN_UNUSED_VARIABLE(stride);
  EIGEN_UNUSED_VARIABLE(offset);
  // stride/offset are only meaningful in PanelMode; otherwise both must be 0.
  eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
  // Conjugates only when Scalar is complex AND conjugation was requested.
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  // Column counts handled by the 8-wide and 4-wide interleaving loops.
  Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
  Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
  Index count = 0;  // running write position in blockB
  // Depth rounded down to a whole number of packets (vectorized portion).
  const Index peeled_k = (depth / PacketSize) * PacketSize;
  if (nr >= 8) {
    // Interleave 8 columns at a time.
    for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
      // skip what we have before
      if (PanelMode) count += 8 * offset;
      const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
      const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
      const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
      const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
      const LinearMapper dm4 = rhs.getLinearMapper(0, j2 + 4);
      const LinearMapper dm5 = rhs.getLinearMapper(0, j2 + 5);
      const LinearMapper dm6 = rhs.getLinearMapper(0, j2 + 6);
      const LinearMapper dm7 = rhs.getLinearMapper(0, j2 + 7);
      Index k = 0;
      if ((PacketSize % 8) == 0)  // TODO enable vectorized transposition for PacketSize==4
      {
        // Vectorized path: load one packet per column, transpose the
        // PacketSize x 8 tile in registers, store interleaved.
        for (; k < peeled_k; k += PacketSize) {
          PacketBlock<Packet, (PacketSize % 8) == 0 ? 8 : PacketSize> kernel;

          kernel.packet[0] = dm0.template loadPacket<Packet>(k);
          kernel.packet[1] = dm1.template loadPacket<Packet>(k);
          kernel.packet[2] = dm2.template loadPacket<Packet>(k);
          kernel.packet[3] = dm3.template loadPacket<Packet>(k);
          kernel.packet[4] = dm4.template loadPacket<Packet>(k);
          kernel.packet[5] = dm5.template loadPacket<Packet>(k);
          kernel.packet[6] = dm6.template loadPacket<Packet>(k);
          kernel.packet[7] = dm7.template loadPacket<Packet>(k);

          ptranspose(kernel);

          // In this branch PacketSize >= 8, so the "% PacketSize" on the
          // indices is a no-op kept for symmetry with the 4-wide path.
          pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel.packet[0]));
          pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel.packet[1 % PacketSize]));
          pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel.packet[2 % PacketSize]));
          pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel.packet[3 % PacketSize]));
          pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel.packet[4 % PacketSize]));
          pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel.packet[5 % PacketSize]));
          pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel.packet[6 % PacketSize]));
          pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel.packet[7 % PacketSize]));
          count += 8 * PacketSize;
        }
      }
      // Scalar tail for the rows not covered by whole packets.
      for (; k < depth; k++) {
        blockB[count + 0] = cj(dm0(k));
        blockB[count + 1] = cj(dm1(k));
        blockB[count + 2] = cj(dm2(k));
        blockB[count + 3] = cj(dm3(k));
        blockB[count + 4] = cj(dm4(k));
        blockB[count + 5] = cj(dm5(k));
        blockB[count + 6] = cj(dm6(k));
        blockB[count + 7] = cj(dm7(k));
        count += 8;
      }
      // skip what we have after
      if (PanelMode) count += 8 * (stride - offset - depth);
    }
  }

  if (nr >= 4) {
    // Interleave the remaining groups of 4 columns.
    for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
      // skip what we have before
      if (PanelMode) count += 4 * offset;
      const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
      const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
      const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
      const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);

      Index k = 0;
      if ((PacketSize % 4) == 0)  // TODO enable vectorized transposition for PacketSize==2 ??
      {
        for (; k < peeled_k; k += PacketSize) {
          PacketBlock<Packet, (PacketSize % 4) == 0 ? 4 : PacketSize> kernel;
          // "% PacketSize" guards the index against a PacketBlock smaller
          // than 4 (the non-(PacketSize%4==0) instantiation of this code).
          kernel.packet[0] = dm0.template loadPacket<Packet>(k);
          kernel.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
          kernel.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
          kernel.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
          ptranspose(kernel);
          pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel.packet[0]));
          pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel.packet[1 % PacketSize]));
          pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel.packet[2 % PacketSize]));
          pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel.packet[3 % PacketSize]));
          count += 4 * PacketSize;
        }
      }
      // Scalar tail rows.
      for (; k < depth; k++) {
        blockB[count + 0] = cj(dm0(k));
        blockB[count + 1] = cj(dm1(k));
        blockB[count + 2] = cj(dm2(k));
        blockB[count + 3] = cj(dm3(k));
        count += 4;
      }
      // skip what we have after
      if (PanelMode) count += 4 * (stride - offset - depth);
    }
  }

  // copy the remaining columns one at a time (nr==1)
  for (Index j2 = packet_cols4; j2 < cols; ++j2) {
    if (PanelMode) count += offset;
    const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
    for (Index k = 0; k < depth; k++) {
      blockB[count] = cj(dm0(k));
      count += 1;
    }
    if (PanelMode) count += (stride - offset - depth);
  }
}
|
|
1096
|
+
|
|
1097
|
+
// Packs a row-major rhs block into blockB with an interleave factor of
// nr == 8.  Because the source is row-major, each group of 8 (or 4)
// consecutive columns of one row is already contiguous, so no transpose is
// needed — the widest available packet (full, half, or quarter) whose size
// matches the group width is used for the copy.
template <typename Scalar, typename Index, typename DataMapper, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, DataMapper, 8, RowMajor, Conjugate, PanelMode> {
  typedef typename packet_traits<Scalar>::type Packet;
  typedef typename unpacket_traits<Packet>::half HalfPacket;
  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
  typedef typename DataMapper::LinearMapper LinearMapper;
  enum {
    PacketSize = packet_traits<Scalar>::size,
    HalfPacketSize = unpacket_traits<HalfPacket>::size,
    QuarterPacketSize = unpacket_traits<QuarterPacket>::size
  };
  EIGEN_DONT_INLINE void operator()(Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride = 0,
                                    Index offset = 0) {
    constexpr int nr = 8;
    EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
    EIGEN_UNUSED_VARIABLE(stride);
    EIGEN_UNUSED_VARIABLE(offset);
    // stride/offset are only meaningful in PanelMode; otherwise both must be 0.
    eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
    // Whether genuinely narrower half/quarter packet types exist for Scalar.
    const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
    const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
    // Conjugates only when Scalar is complex AND conjugation was requested.
    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
    // Column counts handled by the 8-wide and 4-wide loops.
    Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
    Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
    Index count = 0;  // running write position in blockB

    if (nr >= 8) {
      // Copy 8 consecutive columns per row; they are contiguous in the
      // row-major source, so this is a straight (possibly packetized) copy.
      for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
        // skip what we have before
        if (PanelMode) count += 8 * offset;
        for (Index k = 0; k < depth; k++) {
          if (PacketSize == 8) {
            // Packet A = ploadu<Packet>(&rhs.data()[k*rhs.stride() + j2]);
            Packet A = rhs.template loadPacket<Packet>(k, j2);
            pstoreu(blockB + count, cj.pconj(A));
          } else if (HasHalf && HalfPacketSize == 8) {
            HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
            pstoreu(blockB + count, cj.pconj(A));
          } else if (HasQuarter && QuarterPacketSize == 8) {
            QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
            pstoreu(blockB + count, cj.pconj(A));
          } else if (PacketSize == 4) {
            // Two 4-wide packets cover the 8 columns.
            // Packet A = ploadu<Packet>(&rhs.data()[k*rhs.stride() + j2]);
            // Packet B = ploadu<Packet>(&rhs.data()[k*rhs.stride() + j2 + PacketSize]);
            Packet A = rhs.template loadPacket<Packet>(k, j2);
            Packet B = rhs.template loadPacket<Packet>(k, j2 + PacketSize);
            pstoreu(blockB + count, cj.pconj(A));
            pstoreu(blockB + count + PacketSize, cj.pconj(B));
          } else {
            // Scalar fallback when no packet width fits.
            // const Scalar* b0 = &rhs.data()[k*rhs.stride() + j2];
            const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
            blockB[count + 0] = cj(dm0(0));
            blockB[count + 1] = cj(dm0(1));
            blockB[count + 2] = cj(dm0(2));
            blockB[count + 3] = cj(dm0(3));
            blockB[count + 4] = cj(dm0(4));
            blockB[count + 5] = cj(dm0(5));
            blockB[count + 6] = cj(dm0(6));
            blockB[count + 7] = cj(dm0(7));
          }
          count += 8;
        }
        // skip what we have after
        if (PanelMode) count += 8 * (stride - offset - depth);
      }
    }

    if (nr >= 4) {
      // Copy the remaining groups of 4 consecutive columns per row.
      for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
        // skip what we have before
        if (PanelMode) count += 4 * offset;
        for (Index k = 0; k < depth; k++) {
          if (PacketSize == 4) {
            Packet A = rhs.template loadPacket<Packet>(k, j2);
            pstoreu(blockB + count, cj.pconj(A));
            count += PacketSize;
          } else if (HasHalf && HalfPacketSize == 4) {
            HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
            pstoreu(blockB + count, cj.pconj(A));
            count += HalfPacketSize;
          } else if (HasQuarter && QuarterPacketSize == 4) {
            QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
            pstoreu(blockB + count, cj.pconj(A));
            count += QuarterPacketSize;
          } else {
            // Scalar fallback when no packet width fits.
            const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
            blockB[count + 0] = cj(dm0(0));
            blockB[count + 1] = cj(dm0(1));
            blockB[count + 2] = cj(dm0(2));
            blockB[count + 3] = cj(dm0(3));
            count += 4;
          }
        }
        // skip what we have after
        if (PanelMode) count += 4 * (stride - offset - depth);
      }
    }
    // copy the remaining columns one at a time (nr==1)
    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
      if (PanelMode) count += offset;
      for (Index k = 0; k < depth; k++) {
        blockB[count] = cj(rhs(k, j2));
        count += 1;
      }
      if (PanelMode) count += stride - offset - depth;
    }
  }
};
|
|
1204
|
+
|
|
1205
|
+
// GEBP kernel specialization for nr == 8 panels with identical lhs/rhs
// scalar types; it forwards the packed-panel product to the AVX512 gemm
// kernel (out-of-line definition below).
template <typename Scalar, typename Index, typename DataMapper, int mr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<Scalar, Scalar, Index, DataMapper, mr, 8, ConjugateLhs, ConjugateRhs> {
  EIGEN_ALWAYS_INLINE void operator()(const DataMapper &res, const Scalar *blockA, const Scalar *blockB, Index rows,
                                      Index depth, Index cols, Scalar alpha, Index strideA = -1, Index strideB = -1,
                                      Index offsetA = 0, Index offsetB = 0);
};
|
|
1211
|
+
|
|
1212
|
+
// Dispatches the packed-panel product res += alpha * blockA * blockB to
// gemm_kern_avx512.  The runtime conditions (alpha == 1 and unit output
// increment) select among four distinct template instantiations, since
// those flags are compile-time parameters of the kernel; is_beta0 is
// always false here (no overwrite path at this call site).
template <typename Scalar, typename Index, typename DataMapper, int mr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_ALWAYS_INLINE void gebp_kernel<Scalar, Scalar, Index, DataMapper, mr, 8, ConjugateLhs, ConjugateRhs>::operator()(
    const DataMapper &res, const Scalar *blockA, const Scalar *blockB, Index rows, Index depth, Index cols,
    Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
  if (res.incr() == 1) {
    // Contiguous output (is_unit_inc = true).
    if (alpha == 1) {
      gemm_kern_avx512<Scalar, mr, 8, true, false, true>(rows, cols, depth, &alpha, blockA, blockB,
                                                         (Scalar *)res.data(), res.stride(), res.incr(), strideA,
                                                         strideB, offsetA, offsetB);
    } else {
      gemm_kern_avx512<Scalar, mr, 8, false, false, true>(rows, cols, depth, &alpha, blockA, blockB,
                                                          (Scalar *)res.data(), res.stride(), res.incr(), strideA,
                                                          strideB, offsetA, offsetB);
    }
  } else {
    // Strided output (is_unit_inc = false).
    if (alpha == 1) {
      gemm_kern_avx512<Scalar, mr, 8, true, false, false>(rows, cols, depth, &alpha, blockA, blockB,
                                                          (Scalar *)res.data(), res.stride(), res.incr(), strideA,
                                                          strideB, offsetA, offsetB);
    } else {
      gemm_kern_avx512<Scalar, mr, 8, false, false, false>(rows, cols, depth, &alpha, blockA, blockB,
                                                           (Scalar *)res.data(), res.stride(), res.incr(), strideA,
                                                           strideB, offsetA, offsetB);
    }
  }
}
|
|
1238
|
+
#endif // EIGEN_USE_AVX512_GEMM_KERNELS
|
|
1239
|
+
|
|
1240
|
+
} // namespace internal
|
|
1241
|
+
} // namespace Eigen
|
|
1242
|
+
|
|
1243
|
+
#undef SECOND_FETCH
|
|
1244
|
+
|
|
1245
|
+
#endif // EIGEN_CORE_ARCH_AVX512_GEMM_KERNEL_H
|