@smake/eigen 1.0.2 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/eigen/Eigen/AccelerateSupport +52 -0
- package/eigen/Eigen/Cholesky +18 -21
- package/eigen/Eigen/CholmodSupport +28 -28
- package/eigen/Eigen/Core +235 -326
- package/eigen/Eigen/Eigenvalues +16 -14
- package/eigen/Eigen/Geometry +21 -24
- package/eigen/Eigen/Householder +9 -8
- package/eigen/Eigen/IterativeLinearSolvers +8 -4
- package/eigen/Eigen/Jacobi +14 -14
- package/eigen/Eigen/KLUSupport +43 -0
- package/eigen/Eigen/LU +16 -20
- package/eigen/Eigen/MetisSupport +12 -12
- package/eigen/Eigen/OrderingMethods +54 -54
- package/eigen/Eigen/PaStiXSupport +23 -20
- package/eigen/Eigen/PardisoSupport +17 -14
- package/eigen/Eigen/QR +18 -21
- package/eigen/Eigen/QtAlignedMalloc +5 -13
- package/eigen/Eigen/SPQRSupport +21 -14
- package/eigen/Eigen/SVD +23 -18
- package/eigen/Eigen/Sparse +1 -4
- package/eigen/Eigen/SparseCholesky +18 -23
- package/eigen/Eigen/SparseCore +18 -17
- package/eigen/Eigen/SparseLU +12 -8
- package/eigen/Eigen/SparseQR +16 -14
- package/eigen/Eigen/StdDeque +5 -2
- package/eigen/Eigen/StdList +5 -2
- package/eigen/Eigen/StdVector +5 -2
- package/eigen/Eigen/SuperLUSupport +30 -24
- package/eigen/Eigen/ThreadPool +80 -0
- package/eigen/Eigen/UmfPackSupport +19 -17
- package/eigen/Eigen/Version +14 -0
- package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
- package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/LDLT.h +377 -401
- package/eigen/Eigen/src/Cholesky/LLT.h +332 -360
- package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
- package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +620 -521
- package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/ArithmeticSequence.h +239 -0
- package/eigen/Eigen/src/Core/Array.h +341 -294
- package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
- package/eigen/Eigen/src/Core/ArrayWrapper.h +127 -171
- package/eigen/Eigen/src/Core/Assign.h +30 -40
- package/eigen/Eigen/src/Core/AssignEvaluator.h +711 -589
- package/eigen/Eigen/src/Core/Assign_MKL.h +130 -125
- package/eigen/Eigen/src/Core/BandMatrix.h +268 -283
- package/eigen/Eigen/src/Core/Block.h +375 -398
- package/eigen/Eigen/src/Core/CommaInitializer.h +86 -97
- package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
- package/eigen/Eigen/src/Core/CoreEvaluators.h +1356 -1026
- package/eigen/Eigen/src/Core/CoreIterators.h +73 -59
- package/eigen/Eigen/src/Core/CwiseBinaryOp.h +114 -132
- package/eigen/Eigen/src/Core/CwiseNullaryOp.h +726 -617
- package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
- package/eigen/Eigen/src/Core/CwiseUnaryOp.h +56 -68
- package/eigen/Eigen/src/Core/CwiseUnaryView.h +132 -95
- package/eigen/Eigen/src/Core/DenseBase.h +632 -571
- package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -624
- package/eigen/Eigen/src/Core/DenseStorage.h +512 -509
- package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
- package/eigen/Eigen/src/Core/Diagonal.h +169 -210
- package/eigen/Eigen/src/Core/DiagonalMatrix.h +351 -274
- package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
- package/eigen/Eigen/src/Core/Dot.h +172 -222
- package/eigen/Eigen/src/Core/EigenBase.h +75 -85
- package/eigen/Eigen/src/Core/Fill.h +138 -0
- package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
- package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -109
- package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
- package/eigen/Eigen/src/Core/GeneralProduct.h +327 -263
- package/eigen/Eigen/src/Core/GenericPacketMath.h +1472 -360
- package/eigen/Eigen/src/Core/GlobalFunctions.h +194 -151
- package/eigen/Eigen/src/Core/IO.h +147 -139
- package/eigen/Eigen/src/Core/IndexedView.h +321 -0
- package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
- package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/Inverse.h +56 -66
- package/eigen/Eigen/src/Core/Map.h +124 -142
- package/eigen/Eigen/src/Core/MapBase.h +256 -281
- package/eigen/Eigen/src/Core/MathFunctions.h +1620 -938
- package/eigen/Eigen/src/Core/MathFunctionsImpl.h +233 -71
- package/eigen/Eigen/src/Core/Matrix.h +491 -416
- package/eigen/Eigen/src/Core/MatrixBase.h +468 -453
- package/eigen/Eigen/src/Core/NestByValue.h +66 -85
- package/eigen/Eigen/src/Core/NoAlias.h +79 -85
- package/eigen/Eigen/src/Core/NumTraits.h +235 -148
- package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +253 -0
- package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
- package/eigen/Eigen/src/Core/PlainObjectBase.h +871 -894
- package/eigen/Eigen/src/Core/Product.h +260 -139
- package/eigen/Eigen/src/Core/ProductEvaluators.h +863 -714
- package/eigen/Eigen/src/Core/Random.h +161 -136
- package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
- package/eigen/Eigen/src/Core/RealView.h +250 -0
- package/eigen/Eigen/src/Core/Redux.h +366 -336
- package/eigen/Eigen/src/Core/Ref.h +308 -209
- package/eigen/Eigen/src/Core/Replicate.h +94 -106
- package/eigen/Eigen/src/Core/Reshaped.h +398 -0
- package/eigen/Eigen/src/Core/ReturnByValue.h +49 -55
- package/eigen/Eigen/src/Core/Reverse.h +136 -145
- package/eigen/Eigen/src/Core/Select.h +70 -140
- package/eigen/Eigen/src/Core/SelfAdjointView.h +262 -285
- package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
- package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
- package/eigen/Eigen/src/Core/Solve.h +97 -111
- package/eigen/Eigen/src/Core/SolveTriangular.h +131 -129
- package/eigen/Eigen/src/Core/SolverBase.h +138 -101
- package/eigen/Eigen/src/Core/StableNorm.h +156 -160
- package/eigen/Eigen/src/Core/StlIterators.h +619 -0
- package/eigen/Eigen/src/Core/Stride.h +91 -88
- package/eigen/Eigen/src/Core/Swap.h +70 -38
- package/eigen/Eigen/src/Core/Transpose.h +295 -273
- package/eigen/Eigen/src/Core/Transpositions.h +272 -317
- package/eigen/Eigen/src/Core/TriangularMatrix.h +670 -755
- package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
- package/eigen/Eigen/src/Core/VectorwiseOp.h +668 -630
- package/eigen/Eigen/src/Core/Visitor.h +480 -216
- package/eigen/Eigen/src/Core/arch/AVX/Complex.h +407 -293
- package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +79 -388
- package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2935 -491
- package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
- package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +279 -22
- package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +472 -0
- package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +85 -333
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +2490 -649
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
- package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +277 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +521 -298
- package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +39 -280
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +3686 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +205 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +901 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +3391 -723
- package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
- package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +866 -0
- package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +113 -14
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +2634 -0
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +227 -0
- package/eigen/Eigen/src/Core/arch/Default/Half.h +1091 -0
- package/eigen/Eigen/src/Core/arch/Default/Settings.h +11 -13
- package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
- package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +104 -0
- package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1712 -0
- package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
- package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +77 -0
- package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
- package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
- package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
- package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
- package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
- package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
- package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
- package/eigen/Eigen/src/Core/arch/MSA/Complex.h +620 -0
- package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +379 -0
- package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1237 -0
- package/eigen/Eigen/src/Core/arch/NEON/Complex.h +531 -289
- package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +243 -0
- package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +50 -73
- package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +5915 -579
- package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1642 -0
- package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
- package/eigen/Eigen/src/Core/arch/SSE/Complex.h +366 -334
- package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +40 -514
- package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +2164 -675
- package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
- package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +188 -35
- package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +48 -0
- package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +674 -0
- package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +52 -0
- package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +227 -0
- package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +303 -0
- package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +576 -0
- package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +83 -0
- package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +434 -261
- package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +160 -53
- package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +1073 -605
- package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +123 -117
- package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +594 -322
- package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +204 -118
- package/eigen/Eigen/src/Core/functors/StlFunctors.h +110 -97
- package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
- package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1158 -530
- package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2329 -1333
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +328 -364
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +191 -178
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +85 -82
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +396 -542
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
- package/eigen/Eigen/src/Core/products/Parallelizer.h +208 -92
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +331 -375
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +139 -146
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
- package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
- package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -46
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -275
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
- package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +70 -93
- package/eigen/Eigen/src/Core/util/Assert.h +158 -0
- package/eigen/Eigen/src/Core/util/BlasUtil.h +413 -290
- package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +543 -0
- package/eigen/Eigen/src/Core/util/Constants.h +314 -263
- package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -78
- package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
- package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +450 -224
- package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
- package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
- package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +487 -0
- package/eigen/Eigen/src/Core/util/IntegralConstant.h +279 -0
- package/eigen/Eigen/src/Core/util/MKL_support.h +39 -30
- package/eigen/Eigen/src/Core/util/Macros.h +939 -646
- package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
- package/eigen/Eigen/src/Core/util/Memory.h +1042 -650
- package/eigen/Eigen/src/Core/util/Meta.h +618 -426
- package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
- package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
- package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
- package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
- package/eigen/Eigen/src/Core/util/StaticAssert.h +51 -164
- package/eigen/Eigen/src/Core/util/SymbolicIndex.h +445 -0
- package/eigen/Eigen/src/Core/util/XprHelper.h +793 -538
- package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
- package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
- package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
- package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
- package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
- package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +91 -107
- package/eigen/Eigen/src/Eigenvalues/RealQZ.h +539 -606
- package/eigen/Eigen/src/Eigenvalues/RealSchur.h +348 -382
- package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +579 -600
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
- package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +434 -461
- package/eigen/Eigen/src/Geometry/AlignedBox.h +307 -214
- package/eigen/Eigen/src/Geometry/AngleAxis.h +135 -137
- package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
- package/eigen/Eigen/src/Geometry/Homogeneous.h +289 -333
- package/eigen/Eigen/src/Geometry/Hyperplane.h +152 -161
- package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -145
- package/eigen/Eigen/src/Geometry/ParametrizedLine.h +141 -104
- package/eigen/Eigen/src/Geometry/Quaternion.h +595 -497
- package/eigen/Eigen/src/Geometry/Rotation2D.h +110 -108
- package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
- package/eigen/Eigen/src/Geometry/Scaling.h +115 -90
- package/eigen/Eigen/src/Geometry/Transform.h +896 -953
- package/eigen/Eigen/src/Geometry/Translation.h +100 -98
- package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
- package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +154 -0
- package/eigen/Eigen/src/Householder/BlockHouseholder.h +54 -42
- package/eigen/Eigen/src/Householder/Householder.h +104 -122
- package/eigen/Eigen/src/Householder/HouseholderSequence.h +416 -382
- package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +153 -166
- package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +127 -138
- package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +95 -124
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +269 -267
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +246 -259
- package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +218 -217
- package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +80 -103
- package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +59 -63
- package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Jacobi/Jacobi.h +256 -291
- package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/KLUSupport/KLUSupport.h +339 -0
- package/eigen/Eigen/src/LU/Determinant.h +60 -63
- package/eigen/Eigen/src/LU/FullPivLU.h +561 -626
- package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/LU/InverseImpl.h +213 -275
- package/eigen/Eigen/src/LU/PartialPivLU.h +407 -435
- package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
- package/eigen/Eigen/src/LU/arch/InverseSize4.h +353 -0
- package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
- package/eigen/Eigen/src/OrderingMethods/Amd.h +250 -282
- package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +950 -1103
- package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/OrderingMethods/Ordering.h +111 -122
- package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
- package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -429
- package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +494 -473
- package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
- package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +223 -137
- package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +517 -460
- package/eigen/Eigen/src/QR/HouseholderQR.h +412 -278
- package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
- package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +263 -261
- package/eigen/Eigen/src/SVD/BDCSVD.h +872 -679
- package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
- package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SVD/JacobiSVD.h +585 -543
- package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
- package/eigen/Eigen/src/SVD/SVDBase.h +281 -160
- package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +202 -237
- package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +769 -590
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +318 -129
- package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
- package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -236
- package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +140 -184
- package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCore/SparseAssign.h +174 -111
- package/eigen/Eigen/src/SparseCore/SparseBlock.h +408 -477
- package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
- package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +531 -280
- package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +559 -347
- package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
- package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +185 -191
- package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
- package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
- package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
- package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
- package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1614 -1142
- package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -357
- package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
- package/eigen/Eigen/src/SparseCore/SparseProduct.h +100 -91
- package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
- package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
- package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +371 -414
- package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
- package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
- package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
- package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
- package/eigen/Eigen/src/SparseCore/SparseUtil.h +146 -115
- package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
- package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
- package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
- package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseLU/SparseLU.h +814 -618
- package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
- package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
- package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
- package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +273 -255
- package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
- package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
- package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +90 -101
- package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
- package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
- package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +125 -133
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
- package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
- package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
- package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
- package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseQR/SparseQR.h +451 -490
- package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -105
- package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
- package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
- package/eigen/Eigen/src/StlSupport/details.h +48 -50
- package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -732
- package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
- package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
- package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
- package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
- package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
- package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
- package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
- package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
- package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
- package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
- package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
- package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
- package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +480 -380
- package/eigen/Eigen/src/misc/Image.h +41 -43
- package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/misc/Kernel.h +39 -41
- package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
- package/eigen/Eigen/src/misc/blas.h +83 -426
- package/eigen/Eigen/src/misc/lapacke.h +9976 -16182
- package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
- package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
- package/eigen/Eigen/src/plugins/BlockMethods.inc +1370 -0
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
- package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.inc +167 -0
- package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
- package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
- package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
- package/lib/LibEigen.d.ts +4 -0
- package/lib/LibEigen.js +14 -0
- package/lib/index.d.ts +1 -1
- package/lib/index.js +7 -3
- package/package.json +2 -10
- package/eigen/Eigen/CMakeLists.txt +0 -19
- package/eigen/Eigen/src/Core/BooleanRedux.h +0 -164
- package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -103
- package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
- package/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +0 -91
- package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
- package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
- package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
- package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
- package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
- package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
- package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
- package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
- package/eigen/Eigen/src/misc/lapack.h +0 -152
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -332
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -552
- package/eigen/Eigen/src/plugins/BlockMethods.h +0 -1058
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
- package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -163
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -85
- package/lib/eigen.d.ts +0 -2
- package/lib/eigen.js +0 -15
|
@@ -0,0 +1,742 @@
|
|
|
1
|
+
#ifndef EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H
|
|
2
|
+
#define EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H
|
|
3
|
+
|
|
4
|
+
// Loop-unroll hint (factor 8) placed immediately before the `for` loops in this file.
// The pragma spelling differs per compiler: `unroll 8` when EIGEN_COMP_LLVM is set,
// `GCC unroll(8)` otherwise.
#if EIGEN_COMP_LLVM
|
|
5
|
+
#define BFLOAT16_UNROLL _Pragma("unroll 8")
|
|
6
|
+
#else
|
|
7
|
+
#define BFLOAT16_UNROLL _Pragma("GCC unroll(8)")
|
|
8
|
+
#endif
|
|
9
|
+
|
|
10
|
+
namespace Eigen {
|
|
11
|
+
|
|
12
|
+
namespace internal {
|
|
13
|
+
|
|
14
|
+
template <bool zero>
|
|
15
|
+
EIGEN_ALWAYS_INLINE Packet8bf loadBfloat16(const bfloat16* indexA) {
|
|
16
|
+
Packet8bf lhs1 = ploadu<Packet8bf>(indexA);
|
|
17
|
+
if (zero) {
|
|
18
|
+
Packet8bf lhs2 = pset1<Packet8bf>(Eigen::bfloat16(0));
|
|
19
|
+
return vec_mergeh(lhs1.m_val, lhs2.m_val);
|
|
20
|
+
} else {
|
|
21
|
+
return lhs1;
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
template <bool zero>
|
|
26
|
+
EIGEN_ALWAYS_INLINE Packet8bf loadRhsBfloat16(const bfloat16* blockB, Index strideB, Index i) {
|
|
27
|
+
return loadBfloat16<zero>(blockB + strideB * i);
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
template <Index num_acc, Index num_packets, bool zero, bool rhsExtraCols, bool lhsExtraRows, Index num_rhs,
|
|
31
|
+
Index num_lhs>
|
|
32
|
+
EIGEN_ALWAYS_INLINE void KLoop(const bfloat16* indexA, const bfloat16* indexB, __vector_quad (&quad_acc)[num_acc],
|
|
33
|
+
Index strideB, Index k, Index offsetB, Index extra_cols, Index extra_rows) {
|
|
34
|
+
Packet8bf lhs[num_lhs], rhs[num_rhs];
|
|
35
|
+
|
|
36
|
+
BFLOAT16_UNROLL
|
|
37
|
+
for (Index i = 0; i < (num_rhs - (rhsExtraCols ? 1 : 0)); i++) {
|
|
38
|
+
rhs[i] = loadRhsBfloat16<zero>(indexB + k * 4, strideB, i);
|
|
39
|
+
}
|
|
40
|
+
if (rhsExtraCols) {
|
|
41
|
+
rhs[num_rhs - 1] = loadRhsBfloat16<zero>(indexB + k * extra_cols - offsetB, strideB, num_rhs - 1);
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
indexA += k * (lhsExtraRows ? extra_rows : num_packets);
|
|
45
|
+
if (num_lhs == 1) {
|
|
46
|
+
lhs[0] = loadBfloat16<zero>(indexA);
|
|
47
|
+
} else {
|
|
48
|
+
BFLOAT16_UNROLL
|
|
49
|
+
for (Index j = 0; j < num_lhs; j += 2) {
|
|
50
|
+
Packet8bf lhs1 = ploadu<Packet8bf>(indexA + (j + 0) * (zero ? 4 : 8));
|
|
51
|
+
if (zero) {
|
|
52
|
+
Packet8bf lhs2 = pset1<Packet8bf>(Eigen::bfloat16(0));
|
|
53
|
+
lhs[j + 0] = vec_mergeh(lhs1.m_val, lhs2.m_val);
|
|
54
|
+
lhs[j + 1] = vec_mergel(lhs1.m_val, lhs2.m_val);
|
|
55
|
+
} else {
|
|
56
|
+
lhs[j + 0] = lhs1;
|
|
57
|
+
lhs[j + 1] = ploadu<Packet8bf>(indexA + (j + 1) * 8);
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
BFLOAT16_UNROLL
|
|
63
|
+
for (Index i = 0, x = 0; i < num_rhs; i++) {
|
|
64
|
+
BFLOAT16_UNROLL
|
|
65
|
+
for (Index j = 0; j < num_lhs; j++, x++) {
|
|
66
|
+
__builtin_mma_xvbf16ger2pp(&(quad_acc[x]), reinterpret_cast<Packet16uc>(rhs[i].m_val),
|
|
67
|
+
reinterpret_cast<Packet16uc>(lhs[j].m_val));
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
template <Index num_acc>
|
|
73
|
+
EIGEN_ALWAYS_INLINE void zeroAccumulators(__vector_quad (&quad_acc)[num_acc]) {
|
|
74
|
+
BFLOAT16_UNROLL
|
|
75
|
+
for (Index k = 0; k < num_acc; k++) __builtin_mma_xxsetaccz(&(quad_acc[k]));
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
template <Index num_acc>
|
|
79
|
+
EIGEN_ALWAYS_INLINE void disassembleAccumulators(__vector_quad (&quad_acc)[num_acc], Packet4f (&acc)[num_acc][4]) {
|
|
80
|
+
BFLOAT16_UNROLL
|
|
81
|
+
for (Index k = 0; k < num_acc; k++) __builtin_mma_disassemble_acc((void*)acc[k], &(quad_acc[k]));
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
template <Index num_acc, bool rhsExtraCols, bool lhsExtraRows, Index num_rhs, Index num_lhs>
|
|
85
|
+
EIGEN_ALWAYS_INLINE void outputResults(Packet4f (&acc)[num_acc][4], Index rows, const Packet4f pAlpha, float* result,
|
|
86
|
+
const Index extra_cols, Index extra_rows) {
|
|
87
|
+
BFLOAT16_UNROLL
|
|
88
|
+
for (Index i = 0, k = 0; i < num_rhs - (rhsExtraCols ? 1 : 0); i++, result += 4 * rows) {
|
|
89
|
+
BFLOAT16_UNROLL
|
|
90
|
+
for (Index j = 0; j < num_lhs; j++, k++) {
|
|
91
|
+
storeResults<false, lhsExtraRows>(acc[k], rows, pAlpha, result + j * 4, extra_cols, extra_rows);
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
if (rhsExtraCols) {
|
|
95
|
+
storeResults<rhsExtraCols, lhsExtraRows>(acc[num_acc - 1], rows, pAlpha, result, extra_cols, extra_rows);
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
/**
 * Runs the accumulate-and-store sequence for one group of columns.
 *
 * The outer loop runs once per 4 units of `num_packets`. Each pass zeroes
 * `num_acc` accumulators, feeds them with KLoop over the depth two steps at a
 * time (plus one zero-padded step when `depth` is odd), disassembles them and
 * writes them out through outputResults.
 *
 * \tparam multiIter  when false, one lhs packet is used per pass and the lhs
 *                    pointer / result pointer advance by 8 / 4 between passes;
 *                    when true, `num_packets / 4` lhs packets are used per pass
 *                    and instead the rhs pointer and result pointer advance by
 *                    `num_rhs * strideB` and `4 * rows * num_rhs`.
 */
template <const Index num_acc, const Index num_packets, bool rhsExtraCols, bool lhsExtraRows, bool multiIter = false>
EIGEN_ALWAYS_INLINE void colLoopBodyIter(Index depth, Index rows, const Packet4f pAlpha, const bfloat16* indexA,
                                         const bfloat16* indexB, Index strideB, Index offsetB, float* result,
                                         const Index extra_cols, const Index extra_rows) {
  constexpr Index num_lhs = multiIter ? (num_packets / 4) : 1;
  constexpr Index num_rhs = (num_acc + num_lhs - 1) / num_lhs;

  // Per-pass pointer increments.
  constexpr Index lhs_advance = multiIter ? 0 : 8;
  const Index rhs_advance = multiIter ? (num_rhs * strideB) : 0;
  const Index result_advance = multiIter ? (4 * rows * num_rhs) : 4;

  for (Index offset_row = 0; offset_row < num_packets; offset_row += 4) {
    Packet4f acc[num_acc][4];
    __vector_quad quad_acc[num_acc];

    zeroAccumulators<num_acc>(quad_acc);

    // Main depth loop: two depth steps per KLoop call.
    Index k = 0;
    while (k + 2 <= depth) {
      KLoop<num_acc, num_packets, false, rhsExtraCols, lhsExtraRows, num_rhs, num_lhs>(
          indexA, indexB, quad_acc, strideB, k, offsetB, extra_cols, extra_rows);
      k += 2;
    }

    // Odd depth: one remaining step, handled by the zero-padded KLoop variant.
    if ((depth & 1) != 0) {
      const bfloat16* const lhs_tail = indexA - (multiIter ? 0 : offset_row);
      KLoop<num_acc, num_packets, true, rhsExtraCols, lhsExtraRows, num_rhs, num_lhs>(
          lhs_tail, indexB, quad_acc, strideB, k, offsetB, extra_cols, extra_rows);
    }

    disassembleAccumulators<num_acc>(quad_acc, acc);

    outputResults<num_acc, rhsExtraCols, lhsExtraRows, num_rhs, num_lhs>(acc, rows, pAlpha, result, extra_cols,
                                                                         extra_rows);

    indexA += lhs_advance;
    indexB += rhs_advance;
    result += result_advance;
  }
}
|
|
129
|
+
|
|
130
|
+
// Accumulator count used by colLoops() for its main column loop. colLoopBody() only repeats over
// several column groups when it is instantiated with exactly this many accumulators.
#define MAX_BFLOAT16_ACC 8
|
|
131
|
+
|
|
132
|
+
// Processes groups of `num_acc * 4` result columns by calling colLoopBodyIter().
//
// `col` is an in/out column cursor. It is only advanced (and the loop only repeats) when the
// function is instantiated with MAX_BFLOAT16_ACC accumulators and without extra RHS columns;
// every other instantiation runs a single pass and leaves `col` untouched.
template <const Index num_acc, const Index num_packets, bool rhsExtraCols, bool lhsExtraRows>
void colLoopBody(Index& col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA,
                 const bfloat16* indexB, Index strideB, Index offsetB, float* result) {
  // Each accumulator covers 4 columns.
  constexpr Index step = (num_acc * 4);
  constexpr bool multiIters = !rhsExtraCols && (num_acc == MAX_BFLOAT16_ACC);
  constexpr bool normIters = multiIters && ((num_acc % (num_packets / 4)) == 0);

  const Index extra_cols = rhsExtraCols ? (cols & 3) : 0;
  const Index extra_rows = lhsExtraRows ? (rows & 3) : 0;

  for (;;) {
    colLoopBodyIter<num_acc, num_packets, rhsExtraCols, lhsExtraRows, normIters>(
        depth, rows, pAlpha, indexA, indexB, strideB, offsetB, result, extra_cols, extra_rows);

    indexB += strideB * num_acc;
    result += rows * step;

    // Single-pass instantiations stop here without touching the caller's column cursor.
    if (!multiIters) {
      break;
    }
    col += step;
    // Continue only while a full group of `step` columns remains.
    if (cols - col < step) {
      break;
    }
  }
}
|
|
149
|
+
|
|
150
|
+
// Forwards to colLoopBody() for a leftover of `num_acc` full 4-column groups, adding one more
// accumulator when there are extra RHS columns. Does nothing unless `num_acc` is below
// MAX_BFLOAT16_ACC.
template <const Index num_acc, const Index num_packets, bool rhsExtraCols, bool lhsExtraRows>
EIGEN_ALWAYS_INLINE void colLoopBodyExtraN(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha,
                                           const bfloat16* indexA, const bfloat16* blockB, Index strideB, Index offsetB,
                                           float* result) {
  constexpr Index acc_count = num_acc + (rhsExtraCols ? 1 : 0);

  if (num_acc < MAX_BFLOAT16_ACC) {
    colLoopBody<acc_count, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
                                                                    strideB, offsetB, result);
  }
}
|
|
159
|
+
|
|
160
|
+
// Handles the columns left after the main loop: dispatches on the number of remaining full
// 4-column groups (1..7) to the matching colLoopBodyExtraN instantiation. For any other count
// (including zero), a single-accumulator pass is run only when there are extra RHS columns.
template <const Index num_packets, bool rhsExtraCols, bool lhsExtraRows>
void colLoopBodyExtra(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA,
                      const bfloat16* blockB, Index strideB, Index offsetB, float* result) {
  const Index remaining_groups = (cols - col) >> 2;

  switch (remaining_groups) {
    case 1:
      colLoopBodyExtraN<1, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
                                                                    strideB, offsetB, result);
      break;
    case 2:
      colLoopBodyExtraN<2, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
                                                                    strideB, offsetB, result);
      break;
    case 3:
      colLoopBodyExtraN<3, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
                                                                    strideB, offsetB, result);
      break;
    case 4:
      colLoopBodyExtraN<4, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
                                                                    strideB, offsetB, result);
      break;
    case 5:
      colLoopBodyExtraN<5, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
                                                                    strideB, offsetB, result);
      break;
    case 6:
      colLoopBodyExtraN<6, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
                                                                    strideB, offsetB, result);
      break;
    case 7:
      colLoopBodyExtraN<7, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
                                                                    strideB, offsetB, result);
      break;
    default:
      // No full group matched: only a partial (< 4 columns) group can remain.
      if (rhsExtraCols) {
        colLoopBody<1, num_packets, true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB,
                                                        offsetB, result);
      }
      break;
  }
}
|
|
200
|
+
|
|
201
|
+
// Walks all result columns for one block of LHS rows.
//
// If at least MAX_BFLOAT16_ACC * 4 columns exist, they are processed first with the maximum
// accumulator count (passing an RHS offset of 0), and `blockB`/`result` are moved past the columns
// consumed. Whatever remains goes to colLoopBodyExtra(); `offsetB` is only forwarded when `cols`
// is not a multiple of 4.
template <const Index num_packets, bool lhsExtraRows = false>
EIGEN_ALWAYS_INLINE void colLoops(Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA,
                                  const bfloat16* blockB, Index strideB, Index offsetB, float* result) {
  constexpr Index full_width = MAX_BFLOAT16_ACC * 4;
  Index col = 0;

  if (cols >= full_width) {
    colLoopBody<MAX_BFLOAT16_ACC, num_packets, false, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
                                                                    strideB, 0, result);
    // `col` now holds the number of columns consumed by the main loop.
    blockB += (strideB >> 2) * col;
    result += rows * col;
  }

  const bool has_partial_cols = (cols & 3) != 0;
  if (has_partial_cols) {
    colLoopBodyExtra<num_packets, true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB,
                                                      result);
  } else {
    colLoopBodyExtra<num_packets, false, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, 0,
                                                       result);
  }
}
|
|
219
|
+
|
|
220
|
+
// Converts 8 consecutive floats at `res` into one packet of 8 bfloat16 values.
// The floats are read through a __vector_pair, split into two vectors, each converted with
// xvcvspbf16 and then packed together with vec_pack.
EIGEN_ALWAYS_INLINE Packet8bf convertF32toBF16(const float* res) {
  Packet16uc halves[2];

  // The pair load needs a non-const pointer; the source floats are only read.
  float* src = const_cast<float*>(res);
  __vector_pair pair = *reinterpret_cast<__vector_pair*>(src);
  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(halves), &pair);

  Packet16uc first = __builtin_vsx_xvcvspbf16(halves[0]);
  Packet16uc second = __builtin_vsx_xvcvspbf16(halves[1]);

  return vec_pack(reinterpret_cast<Packet4ui>(first), reinterpret_cast<Packet4ui>(second));
}
|
|
228
|
+
|
|
229
|
+
// Converts `size` adjacent columns of the float scratch buffer, starting at column `col`, to
// bfloat16 and stores them through `res`. The indexing treats `result` as holding `rows` floats
// per column. Rows are handled 8 at a time; a final partial store writes the remaining
// `rows & 7` rows.
template <typename DataMapper, const Index size>
EIGEN_ALWAYS_INLINE void convertArrayF32toBF16Col(float* result, Index col, Index rows, const DataMapper& res) {
  const DataMapper res2 = res.getSubMapper(0, col);
  float* column = result + col * rows;
  Index row = 0;

  // Full groups of 8 rows: convert one packet per column, then store the whole block.
  while (row + 8 <= rows) {
    PacketBlock<Packet8bf, size> block;
    BFLOAT16_UNROLL
    for (Index j = 0; j < size; j++) {
      block.packet[j] = convertF32toBF16(column + j * rows);
    }
    res2.template storePacketBlock<Packet8bf, size>(row, 0, block);

    row += 8;
    column += 8;
  }

  if (row >= rows) {
    return;
  }

  // Leftover rows: a full packet is still converted, but only `rows & 7` elements are stored.
  BFLOAT16_UNROLL
  for (Index j = 0; j < size; j++) {
    Packet8bf tail = convertF32toBF16(column + j * rows);
    res2.template storePacketPartial<Packet8bf>(row, j, tail, rows & 7);
  }
}
|
|
252
|
+
|
|
253
|
+
// Converts `size` floats starting at `result + i` to bfloat16 and writes them to `dst` via
// storeBF16fromResult(), while at least `size` elements remain before `rows`.
//
// `i` and `dst` are in/out cursors and are advanced by max(size, 8) elements (scaled by `resInc`
// for `dst`). Only the size == 32 instantiation loops; the others run at most one pass.
template <const Index size, bool non_unit_stride = false>
EIGEN_ALWAYS_INLINE void convertPointerF32toBF16(Index& i, float* result, Index rows, bfloat16*& dst,
                                                 Index resInc = 1) {
  constexpr Index advance = (size >= 8) ? size : 8;
  constexpr bool repeat = (size == 32);

  while (i + size <= rows) {
    float* src = result + i;
    PacketBlock<Packet8bf, (size + 7) / 8> packed;

    // Convert every packet first...
    packed.packet[0] = convertF32toBF16(src + 0);
    if (size >= 16) {
      packed.packet[1] = convertF32toBF16(src + 8);
    }
    if (size >= 32) {
      packed.packet[2] = convertF32toBF16(src + 16);
      packed.packet[3] = convertF32toBF16(src + 24);
    }

    // ...then store them. Only the first store receives the `rows & 7` count.
    storeBF16fromResult<size, non_unit_stride, 0>(dst, packed.packet[0], resInc, rows & 7);
    if (size >= 16) {
      storeBF16fromResult<size, non_unit_stride, 8>(dst, packed.packet[1], resInc);
    }
    if (size >= 32) {
      storeBF16fromResult<size, non_unit_stride, 16>(dst, packed.packet[2], resInc);
      storeBF16fromResult<size, non_unit_stride, 24>(dst, packed.packet[3], resInc);
    }

    i += advance;
    dst += advance * resInc;

    if (!repeat) {
      break;
    }
  }
}
|
|
280
|
+
|
|
281
|
+
template <bool non_unit_stride = false>
|
|
282
|
+
EIGEN_ALWAYS_INLINE void convertArrayPointerF32toBF16(float* result, Index rows, bfloat16* dst, Index resInc = 1) {
|
|
283
|
+
Index i = 0;
|
|
284
|
+
convertPointerF32toBF16<32, non_unit_stride>(i, result, rows, dst, resInc);
|
|
285
|
+
convertPointerF32toBF16<16, non_unit_stride>(i, result, rows, dst, resInc);
|
|
286
|
+
convertPointerF32toBF16<8, non_unit_stride>(i, result, rows, dst, resInc);
|
|
287
|
+
convertPointerF32toBF16<1, non_unit_stride>(i, result, rows, dst, resInc);
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
template <typename DataMapper>
|
|
291
|
+
EIGEN_ALWAYS_INLINE void convertArrayF32toBF16(float* result, Index cols, Index rows, const DataMapper& res) {
|
|
292
|
+
Index col;
|
|
293
|
+
for (col = 0; col + 4 <= cols; col += 4) {
|
|
294
|
+
convertArrayF32toBF16Col<DataMapper, 4>(result, col, rows, res);
|
|
295
|
+
}
|
|
296
|
+
// extra cols
|
|
297
|
+
switch (cols - col) {
|
|
298
|
+
case 1:
|
|
299
|
+
convertArrayF32toBF16Col<DataMapper, 1>(result, col, rows, res);
|
|
300
|
+
break;
|
|
301
|
+
case 2:
|
|
302
|
+
convertArrayF32toBF16Col<DataMapper, 2>(result, col, rows, res);
|
|
303
|
+
break;
|
|
304
|
+
case 3:
|
|
305
|
+
convertArrayF32toBF16Col<DataMapper, 3>(result, col, rows, res);
|
|
306
|
+
break;
|
|
307
|
+
}
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
template <Index size>
|
|
311
|
+
EIGEN_ALWAYS_INLINE void calcColLoops(const bfloat16*& indexA, Index& row, Index depth, Index cols, Index rows,
|
|
312
|
+
const Packet4f pAlpha, const bfloat16* indexB, Index strideB, Index offsetA,
|
|
313
|
+
Index offsetB, Index bigSuffix, float* result) {
|
|
314
|
+
if ((size == 16) || (rows & size)) {
|
|
315
|
+
indexA += size * offsetA;
|
|
316
|
+
colLoops<size>(depth, cols, rows, pAlpha, indexA, indexB, strideB, offsetB, result + row);
|
|
317
|
+
row += size;
|
|
318
|
+
indexA += bigSuffix * size / 16;
|
|
319
|
+
}
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
template <typename DataMapper>
|
|
323
|
+
void gemmMMAbfloat16(const DataMapper& res, const bfloat16* indexA, const bfloat16* indexB, Index rows, Index depth,
|
|
324
|
+
Index cols, bfloat16 alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
|
|
325
|
+
float falpha = Eigen::bfloat16_impl::bfloat16_to_float(alpha);
|
|
326
|
+
const Packet4f pAlpha = pset1<Packet4f>(falpha);
|
|
327
|
+
ei_declare_aligned_stack_constructed_variable(float, result, cols* rows, 0);
|
|
328
|
+
|
|
329
|
+
convertArrayBF16toF32<DataMapper>(result, cols, rows, res);
|
|
330
|
+
|
|
331
|
+
if (strideA == -1) strideA = depth;
|
|
332
|
+
if (strideB == -1) strideB = depth;
|
|
333
|
+
// Packing is done in blocks.
|
|
334
|
+
// There's 4 possible sizes of blocks
|
|
335
|
+
// Blocks of 8 columns with 16 elements (8x16)
|
|
336
|
+
// Blocks of 8 columns with 8 elements (8x8). This happens when there's 16 > rows >= 8
|
|
337
|
+
// Blocks of 8 columns with 4 elements (8x4). This happens when there's 8 > rows >= 4
|
|
338
|
+
// Blocks of 8 columns with < 4 elements. This happens when there's less than 4 remaining rows
|
|
339
|
+
|
|
340
|
+
// Loop for LHS standard block (8x16)
|
|
341
|
+
Index bigSuffix = (2 * 8) * (strideA - offsetA);
|
|
342
|
+
indexB += 4 * offsetB;
|
|
343
|
+
strideB *= 4;
|
|
344
|
+
offsetB *= 3;
|
|
345
|
+
|
|
346
|
+
Index row = 0;
|
|
347
|
+
while (row + 16 <= rows) {
|
|
348
|
+
calcColLoops<16>(indexA, row, depth, cols, rows, pAlpha, indexB, strideB, offsetA, offsetB, bigSuffix, result);
|
|
349
|
+
}
|
|
350
|
+
// LHS (8x8) block
|
|
351
|
+
calcColLoops<8>(indexA, row, depth, cols, rows, pAlpha, indexB, strideB, offsetA, offsetB, bigSuffix, result);
|
|
352
|
+
// LHS (8x4) block
|
|
353
|
+
calcColLoops<4>(indexA, row, depth, cols, rows, pAlpha, indexB, strideB, offsetA, offsetB, bigSuffix, result);
|
|
354
|
+
// extra rows
|
|
355
|
+
if (rows & 3) {
|
|
356
|
+
// This index is the beginning of remaining block.
|
|
357
|
+
colLoops<4, true>(depth, cols, rows, pAlpha, indexA, indexB, strideB, offsetB, result + row);
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
// Convert back to bfloat16
|
|
361
|
+
convertArrayF32toBF16<DataMapper>(result, cols, rows, res);
|
|
362
|
+
}
|
|
363
|
+
|
|
364
|
+
#undef MAX_BFLOAT16_ACC
|
|
365
|
+
|
|
366
|
+
#if !EIGEN_ALTIVEC_DISABLE_MMA
|
|
367
|
+
template <Index num_acc, typename LhsMapper, bool zero>
|
|
368
|
+
EIGEN_ALWAYS_INLINE void loadVecLoop(Index k, LhsMapper& lhs, Packet8bf (&a0)[num_acc], Packet8bf b1) {
|
|
369
|
+
a0[k + 0] = lhs.template loadPacket<Packet8bf>(k * 4, 0);
|
|
370
|
+
if (!zero) {
|
|
371
|
+
b1 = lhs.template loadPacket<Packet8bf>(k * 4, 1);
|
|
372
|
+
}
|
|
373
|
+
if (num_acc > (k + 1)) {
|
|
374
|
+
a0[k + 1] = vec_mergel(a0[k + 0].m_val, b1.m_val);
|
|
375
|
+
}
|
|
376
|
+
a0[k + 0] = vec_mergeh(a0[k + 0].m_val, b1.m_val);
|
|
377
|
+
}
|
|
378
|
+
|
|
379
|
+
template <Index num_acc>
|
|
380
|
+
EIGEN_ALWAYS_INLINE void multVec(__vector_quad (&quad_acc)[num_acc], Packet8bf (&a0)[num_acc], Packet8bf b0) {
|
|
381
|
+
BFLOAT16_UNROLL
|
|
382
|
+
for (Index k = 0; k < num_acc; k++) {
|
|
383
|
+
__builtin_mma_xvbf16ger2pp(&(quad_acc[k]), reinterpret_cast<Packet16uc>(b0.m_val),
|
|
384
|
+
reinterpret_cast<Packet16uc>(a0[k].m_val));
|
|
385
|
+
}
|
|
386
|
+
}
|
|
387
|
+
|
|
388
|
+
template <Index num_acc, typename LhsMapper, typename RhsMapper, bool zero, bool linear>
|
|
389
|
+
EIGEN_ALWAYS_INLINE void vecColLoop(Index j, LhsMapper& lhs, RhsMapper& rhs, __vector_quad (&quad_acc)[num_acc]) {
|
|
390
|
+
Packet8bf a0[num_acc];
|
|
391
|
+
Packet8bf b1 = pset1<Packet8bf>(Eigen::bfloat16(0));
|
|
392
|
+
Packet8bf b0 = loadColData<RhsMapper, linear>(rhs, j);
|
|
393
|
+
|
|
394
|
+
if (zero) {
|
|
395
|
+
b0 = vec_mergeh(b0.m_val, b1.m_val);
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
using LhsSubMapper = typename LhsMapper::SubMapper;
|
|
399
|
+
|
|
400
|
+
LhsSubMapper lhs2 = lhs.getSubMapper(0, j);
|
|
401
|
+
BFLOAT16_UNROLL
|
|
402
|
+
for (Index k = 0; k < num_acc; k += 2) {
|
|
403
|
+
loadVecLoop<num_acc, LhsSubMapper, zero>(k, lhs2, a0, b1);
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
multVec<num_acc>(quad_acc, a0, b0);
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
// Accumulator count used by the GEMV kernels' main loops (calcVecColLoops / calcVecLoops); the
// loop bodies only repeat when instantiated with exactly this many accumulators.
#define MAX_BFLOAT16_VEC_ACC 8
|
|
410
|
+
|
|
411
|
+
// Column-major GEMV kernel for a group of `num_acc * 4` rows starting at `row`.
// Columns [0, cend) are consumed two at a time, with a final "zero" call for an odd `cend`.
//
// `row` is an in/out cursor. It is only advanced (and the loop only repeats) when instantiated
// with MAX_BFLOAT16_VEC_ACC accumulators and without extra rows.
template <const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
void colVecColLoopBody(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
                       float* result) {
  typedef typename LhsMapper::SubMapper LhsSubMapper;

  constexpr Index step = (num_acc * 4);
  constexpr bool multiIters = !extraRows && (num_acc == MAX_BFLOAT16_VEC_ACC);
  const Index extra_rows = extraRows ? (rows & 3) : 0;

  for (;;) {
    Packet4f acc[num_acc][4];
    __vector_quad quad_acc[num_acc];

    zeroAccumulators<num_acc>(quad_acc);

    LhsSubMapper lhs_rows = lhs.getSubMapper(row, 0);

    Index j = 0;
    while (j + 2 <= cend) {
      vecColLoop<num_acc, LhsSubMapper, RhsMapper, false, linear>(j, lhs_rows, rhs, quad_acc);
      j += 2;
    }
    if ((cend & 1) != 0) {
      // Odd column count: process the last column on its own.
      vecColLoop<num_acc, LhsSubMapper, RhsMapper, true, linear>(cend - 1, lhs_rows, rhs, quad_acc);
    }

    disassembleAccumulators<num_acc>(quad_acc, acc);

    outputVecColResults<num_acc, extraRows>(acc, result, pAlpha, extra_rows);

    result += step;

    // Single-pass instantiations stop here without touching the caller's row cursor.
    if (!multiIters) {
      break;
    }
    row += step;
    // Continue only while a full group of `step` rows remains.
    if (rows - row < step) {
      break;
    }
  }
}
|
|
441
|
+
|
|
442
|
+
// Forwards to colVecColLoopBody() for a leftover of `num_acc` full 4-row groups, adding one more
// accumulator when there are extra rows. Does nothing unless `num_acc` is below
// MAX_BFLOAT16_VEC_ACC.
template <const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
EIGEN_ALWAYS_INLINE void colVecColLoopBodyExtraN(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs,
                                                 const Packet4f pAlpha, float* result) {
  constexpr Index acc_count = num_acc + (extraRows ? 1 : 0);

  if (num_acc < MAX_BFLOAT16_VEC_ACC) {
    colVecColLoopBody<acc_count, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
  }
}
|
|
450
|
+
|
|
451
|
+
template <typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
|
|
452
|
+
EIGEN_ALWAYS_INLINE void colVecColLoopBodyExtra(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs,
|
|
453
|
+
const Packet4f pAlpha, float* result) {
|
|
454
|
+
switch ((rows - row) >> 2) {
|
|
455
|
+
case 7:
|
|
456
|
+
colVecColLoopBodyExtraN<7, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
|
|
457
|
+
break;
|
|
458
|
+
case 6:
|
|
459
|
+
colVecColLoopBodyExtraN<6, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
|
|
460
|
+
break;
|
|
461
|
+
case 5:
|
|
462
|
+
colVecColLoopBodyExtraN<5, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
|
|
463
|
+
break;
|
|
464
|
+
case 4:
|
|
465
|
+
colVecColLoopBodyExtraN<4, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
|
|
466
|
+
break;
|
|
467
|
+
case 3:
|
|
468
|
+
colVecColLoopBodyExtraN<3, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
|
|
469
|
+
break;
|
|
470
|
+
case 2:
|
|
471
|
+
colVecColLoopBodyExtraN<2, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
|
|
472
|
+
break;
|
|
473
|
+
case 1:
|
|
474
|
+
colVecColLoopBodyExtraN<1, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
|
|
475
|
+
break;
|
|
476
|
+
default:
|
|
477
|
+
if (extraRows) {
|
|
478
|
+
colVecColLoopBody<1, LhsMapper, RhsMapper, true, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
|
|
479
|
+
}
|
|
480
|
+
break;
|
|
481
|
+
}
|
|
482
|
+
}
|
|
483
|
+
|
|
484
|
+
template <typename LhsMapper, typename RhsMapper, bool linear>
|
|
485
|
+
EIGEN_ALWAYS_INLINE void calcVecColLoops(Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
|
|
486
|
+
float* result) {
|
|
487
|
+
Index row = 0;
|
|
488
|
+
if (rows >= (MAX_BFLOAT16_VEC_ACC * 4)) {
|
|
489
|
+
colVecColLoopBody<MAX_BFLOAT16_VEC_ACC, LhsMapper, RhsMapper, false, linear>(row, cend, rows, lhs, rhs, pAlpha,
|
|
490
|
+
result);
|
|
491
|
+
result += row;
|
|
492
|
+
}
|
|
493
|
+
if (rows & 3) {
|
|
494
|
+
colVecColLoopBodyExtra<LhsMapper, RhsMapper, true, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
|
|
495
|
+
} else {
|
|
496
|
+
colVecColLoopBodyExtra<LhsMapper, RhsMapper, false, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
|
|
497
|
+
}
|
|
498
|
+
}
|
|
499
|
+
|
|
500
|
+
template <typename RhsMapper, typename LhsMapper, typename = void>
|
|
501
|
+
struct UseMMAStride : std::false_type {
|
|
502
|
+
static EIGEN_ALWAYS_INLINE void run(Index j2, Index jend, Index rows, LhsMapper& lhs, RhsMapper& rhs, Packet4f pAlpha,
|
|
503
|
+
float* result) {
|
|
504
|
+
using RhsSubMapper = typename RhsMapper::SubMapper;
|
|
505
|
+
|
|
506
|
+
RhsSubMapper rhs2 = rhs.getSubMapper(j2, 0);
|
|
507
|
+
calcVecColLoops<LhsMapper, RhsSubMapper, false>(jend - j2, rows, lhs, rhs2, pAlpha, result);
|
|
508
|
+
}
|
|
509
|
+
};
|
|
510
|
+
|
|
511
|
+
template <typename RhsMapper, typename LhsMapper>
|
|
512
|
+
struct UseMMAStride<RhsMapper, LhsMapper,
|
|
513
|
+
std::enable_if_t<std::is_member_function_pointer<decltype(&RhsMapper::stride)>::value>>
|
|
514
|
+
: std::true_type {
|
|
515
|
+
static EIGEN_ALWAYS_INLINE void run(Index j2, Index jend, Index rows, LhsMapper& lhs, RhsMapper& rhs, Packet4f pAlpha,
|
|
516
|
+
float* result) {
|
|
517
|
+
using RhsSubMapper = typename RhsMapper::SubMapper;
|
|
518
|
+
|
|
519
|
+
RhsSubMapper rhs2 = rhs.getSubMapper(j2, 0);
|
|
520
|
+
if (rhs.stride() == 1) {
|
|
521
|
+
calcVecColLoops<LhsMapper, RhsSubMapper, true>(jend - j2, rows, lhs, rhs2, pAlpha, result);
|
|
522
|
+
} else {
|
|
523
|
+
calcVecColLoops<LhsMapper, RhsSubMapper, false>(jend - j2, rows, lhs, rhs2, pAlpha, result);
|
|
524
|
+
}
|
|
525
|
+
}
|
|
526
|
+
};
|
|
527
|
+
|
|
528
|
+
template <typename LhsMapper, typename RhsMapper>
|
|
529
|
+
void gemvMMA_bfloat16_col(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs, bfloat16* res,
|
|
530
|
+
Index resIncr, bfloat16 alpha) {
|
|
531
|
+
EIGEN_UNUSED_VARIABLE(resIncr);
|
|
532
|
+
eigen_internal_assert(resIncr == 1);
|
|
533
|
+
|
|
534
|
+
// The following copy tells the compiler that lhs's attributes are not modified outside this function
|
|
535
|
+
// This helps GCC to generate proper code.
|
|
536
|
+
LhsMapper lhs(alhs);
|
|
537
|
+
RhsMapper rhs2(rhs);
|
|
538
|
+
|
|
539
|
+
const Index lhsStride = lhs.stride();
|
|
540
|
+
|
|
541
|
+
// TODO: improve the following heuristic:
|
|
542
|
+
const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(bfloat16) < 16000 ? 16 : 8);
|
|
543
|
+
float falpha = Eigen::bfloat16_impl::bfloat16_to_float(alpha);
|
|
544
|
+
Packet4f pAlpha = pset1<Packet4f>(falpha);
|
|
545
|
+
|
|
546
|
+
ei_declare_aligned_stack_constructed_variable(float, result, rows, 0);
|
|
547
|
+
|
|
548
|
+
convertArrayPointerBF16toF32(result, 1, rows, res);
|
|
549
|
+
|
|
550
|
+
for (Index j2 = 0; j2 < cols; j2 += block_cols) {
|
|
551
|
+
Index jend = numext::mini(j2 + block_cols, cols);
|
|
552
|
+
|
|
553
|
+
using LhsSubMapper = typename LhsMapper::SubMapper;
|
|
554
|
+
|
|
555
|
+
LhsSubMapper lhs2 = lhs.getSubMapper(0, j2);
|
|
556
|
+
UseMMAStride<RhsMapper, LhsSubMapper>::run(j2, jend, rows, lhs2, rhs2, pAlpha, result);
|
|
557
|
+
}
|
|
558
|
+
|
|
559
|
+
convertArrayPointerF32toBF16(result, rows, res);
|
|
560
|
+
}
|
|
561
|
+
|
|
562
|
+
// Byte-permute control used with vec_perm() in preduxVecResults2(). Indices 0x0c-0x0f pick bytes
// 12-15 of the first vec_perm operand and 0x1c-0x1f pick bytes 12-15 of the second one; the
// 8-byte pattern is repeated twice to fill the 16-byte vector.
static Packet16uc p16uc_ELEMENT_VEC3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
|
|
563
|
+
0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f};
|
|
564
|
+
|
|
565
|
+
template <Index num_acc>
|
|
566
|
+
EIGEN_ALWAYS_INLINE void preduxVecResults2(Packet4f (&acc)[num_acc][4], Index k) {
|
|
567
|
+
if (num_acc > (k + 1)) {
|
|
568
|
+
acc[k][0] = vec_mergeh(acc[k][0], acc[k + 1][0]);
|
|
569
|
+
acc[k][1] = vec_mergeo(acc[k][1], acc[k + 1][1]);
|
|
570
|
+
acc[k][2] = vec_mergel(acc[k][2], acc[k + 1][2]);
|
|
571
|
+
acc[k][3] = vec_perm(acc[k][3], acc[k + 1][3], p16uc_ELEMENT_VEC3);
|
|
572
|
+
|
|
573
|
+
acc[k][0] = (acc[k][0] + acc[k][2]) + (acc[k][1] + acc[k][3]);
|
|
574
|
+
} else {
|
|
575
|
+
acc[k][0] = vec_mergeh(acc[k][0], acc[k][1]);
|
|
576
|
+
acc[k][0] += vec_mergel(acc[k][2], acc[k][3]);
|
|
577
|
+
#ifdef _BIG_ENDIAN
|
|
578
|
+
acc[k][0] += vec_sld(acc[k][0], acc[k][0], 12);
|
|
579
|
+
#else
|
|
580
|
+
acc[k][0] += vec_sld(acc[k][0], acc[k][0], 4);
|
|
581
|
+
#endif
|
|
582
|
+
}
|
|
583
|
+
}
|
|
584
|
+
|
|
585
|
+
template <Index num_acc>
|
|
586
|
+
EIGEN_ALWAYS_INLINE void preduxVecResults(Packet4f (&acc)[num_acc][4]) {
|
|
587
|
+
BFLOAT16_UNROLL
|
|
588
|
+
for (Index k = 0; k < num_acc; k += 4) {
|
|
589
|
+
preduxVecResults2<num_acc>(acc, k + 0);
|
|
590
|
+
if (num_acc > (k + 2)) {
|
|
591
|
+
preduxVecResults2<num_acc>(acc, k + 2);
|
|
592
|
+
acc[k + 0][0] = reinterpret_cast<Packet4f>(
|
|
593
|
+
vec_mergeh(reinterpret_cast<Packet2ul>(acc[k + 0][0]), reinterpret_cast<Packet2ul>(acc[k + 2][0])));
|
|
594
|
+
}
|
|
595
|
+
}
|
|
596
|
+
}
|
|
597
|
+
|
|
598
|
+
template <Index num_acc, typename LhsMapper, typename RhsMapper, bool extra>
|
|
599
|
+
EIGEN_ALWAYS_INLINE void multVecLoop(__vector_quad (&quad_acc)[num_acc], const LhsMapper& lhs, RhsMapper& rhs, Index j,
|
|
600
|
+
Index extra_cols) {
|
|
601
|
+
Packet8bf a0[num_acc], b0;
|
|
602
|
+
|
|
603
|
+
if (extra) {
|
|
604
|
+
b0 = rhs.template loadPacketPartial<Packet8bf>(j, extra_cols);
|
|
605
|
+
} else {
|
|
606
|
+
b0 = rhs.template loadPacket<Packet8bf>(j);
|
|
607
|
+
}
|
|
608
|
+
|
|
609
|
+
const LhsMapper lhs2 = lhs.getSubMapper(0, j);
|
|
610
|
+
BFLOAT16_UNROLL
|
|
611
|
+
for (Index k = 0; k < num_acc; k++) {
|
|
612
|
+
if (extra) {
|
|
613
|
+
a0[k] = lhs2.template loadPacketPartial<Packet8bf>(k, 0, extra_cols);
|
|
614
|
+
} else {
|
|
615
|
+
a0[k] = lhs2.template loadPacket<Packet8bf>(k, 0);
|
|
616
|
+
}
|
|
617
|
+
}
|
|
618
|
+
|
|
619
|
+
multVec<num_acc>(quad_acc, a0, b0);
|
|
620
|
+
}
|
|
621
|
+
|
|
622
|
+
template <Index num_acc, typename LhsMapper, typename RhsMapper>
|
|
623
|
+
EIGEN_ALWAYS_INLINE void vecLoop(Index cols, const LhsMapper& lhs, RhsMapper& rhs, __vector_quad (&quad_acc)[num_acc],
|
|
624
|
+
Index extra_cols) {
|
|
625
|
+
Index j = 0;
|
|
626
|
+
for (; j + 8 <= cols; j += 8) {
|
|
627
|
+
multVecLoop<num_acc, LhsMapper, RhsMapper, false>(quad_acc, lhs, rhs, j, extra_cols);
|
|
628
|
+
}
|
|
629
|
+
|
|
630
|
+
if (extra_cols) {
|
|
631
|
+
multVecLoop<num_acc, LhsMapper, RhsMapper, true>(quad_acc, lhs, rhs, j, extra_cols);
|
|
632
|
+
}
|
|
633
|
+
}
|
|
634
|
+
|
|
635
|
+
// Row-major GEMV kernel for `num_acc` rows starting at `row`: one accumulator per row, reduced by
// preduxVecResults() and written out by outputVecResults().
//
// `row` is an in/out cursor. It is only advanced (and the loop only repeats) when instantiated
// with MAX_BFLOAT16_VEC_ACC accumulators.
template <const Index num_acc, typename LhsMapper, typename RhsMapper>
void colVecLoopBody(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
                    float* result) {
  constexpr bool multiIters = (num_acc == MAX_BFLOAT16_VEC_ACC);
  // Columns that do not fill a whole 8-element packet.
  const Index extra_cols = (cols & 7);

  for (;;) {
    Packet4f acc[num_acc][4];
    __vector_quad quad_acc[num_acc];

    zeroAccumulators<num_acc>(quad_acc);

    const LhsMapper lhs_rows = lhs.getSubMapper(row, 0);
    vecLoop<num_acc, LhsMapper, RhsMapper>(cols, lhs_rows, rhs, quad_acc, extra_cols);

    disassembleAccumulators<num_acc>(quad_acc, acc);

    preduxVecResults<num_acc>(acc);

    outputVecResults<num_acc>(acc, result, pAlpha);

    result += num_acc;

    // Single-pass instantiations stop here without touching the caller's row cursor.
    if (!multiIters) {
      break;
    }
    row += num_acc;
    // Continue only while a full group of `num_acc` rows remains.
    if (rows - row < num_acc) {
      break;
    }
  }
}
|
|
659
|
+
|
|
660
|
+
// Forwards to colVecLoopBody() for a leftover of `num_acc` rows. Does nothing unless `num_acc` is
// below MAX_BFLOAT16_VEC_ACC.
template <const Index num_acc, typename LhsMapper, typename RhsMapper>
EIGEN_ALWAYS_INLINE void colVecLoopBodyExtraN(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs,
                                              const Packet4f pAlpha, float* result) {
  if (num_acc < MAX_BFLOAT16_VEC_ACC) {
    colVecLoopBody<num_acc, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
  }
}
|
|
667
|
+
|
|
668
|
+
template <typename LhsMapper, typename RhsMapper>
|
|
669
|
+
EIGEN_ALWAYS_INLINE void colVecLoopBodyExtra(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs,
|
|
670
|
+
const Packet4f pAlpha, float* result) {
|
|
671
|
+
switch (rows - row) {
|
|
672
|
+
case 7:
|
|
673
|
+
colVecLoopBodyExtraN<7, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
|
|
674
|
+
break;
|
|
675
|
+
case 6:
|
|
676
|
+
colVecLoopBodyExtraN<6, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
|
|
677
|
+
break;
|
|
678
|
+
case 5:
|
|
679
|
+
colVecLoopBodyExtraN<5, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
|
|
680
|
+
break;
|
|
681
|
+
case 4:
|
|
682
|
+
colVecLoopBodyExtraN<4, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
|
|
683
|
+
break;
|
|
684
|
+
case 3:
|
|
685
|
+
colVecLoopBodyExtraN<3, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
|
|
686
|
+
break;
|
|
687
|
+
case 2:
|
|
688
|
+
colVecLoopBodyExtraN<2, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
|
|
689
|
+
break;
|
|
690
|
+
case 1:
|
|
691
|
+
colVecLoopBodyExtraN<1, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
|
|
692
|
+
break;
|
|
693
|
+
}
|
|
694
|
+
}
|
|
695
|
+
|
|
696
|
+
template <typename LhsMapper, typename RhsMapper>
|
|
697
|
+
EIGEN_ALWAYS_INLINE void calcVecLoops(Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
|
|
698
|
+
float* result) {
|
|
699
|
+
Index row = 0;
|
|
700
|
+
if (rows >= MAX_BFLOAT16_VEC_ACC) {
|
|
701
|
+
colVecLoopBody<MAX_BFLOAT16_VEC_ACC, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
|
|
702
|
+
result += row;
|
|
703
|
+
}
|
|
704
|
+
colVecLoopBodyExtra<LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
|
|
705
|
+
}
|
|
706
|
+
|
|
707
|
+
template <typename LhsMapper, typename RhsMapper>
|
|
708
|
+
EIGEN_STRONG_INLINE void gemvMMA_bfloat16_row(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
|
|
709
|
+
bfloat16* res, Index resIncr, bfloat16 alpha) {
|
|
710
|
+
typedef typename RhsMapper::LinearMapper LinearMapper;
|
|
711
|
+
|
|
712
|
+
// The following copy tells the compiler that lhs's attributes are not modified outside this function
|
|
713
|
+
// This helps GCC to generate proper code.
|
|
714
|
+
LhsMapper lhs(alhs);
|
|
715
|
+
LinearMapper rhs2 = rhs.getLinearMapper(0, 0);
|
|
716
|
+
|
|
717
|
+
eigen_internal_assert(rhs.stride() == 1);
|
|
718
|
+
|
|
719
|
+
float falpha = Eigen::bfloat16_impl::bfloat16_to_float(alpha);
|
|
720
|
+
const Packet4f pAlpha = pset1<Packet4f>(falpha);
|
|
721
|
+
|
|
722
|
+
ei_declare_aligned_stack_constructed_variable(float, result, rows, 0);
|
|
723
|
+
if (resIncr == 1) {
|
|
724
|
+
convertArrayPointerBF16toF32(result, 1, rows, res);
|
|
725
|
+
} else {
|
|
726
|
+
convertArrayPointerBF16toF32<true>(result, 1, rows, res, resIncr);
|
|
727
|
+
}
|
|
728
|
+
calcVecLoops<LhsMapper, LinearMapper>(cols, rows, lhs, rhs2, pAlpha, result);
|
|
729
|
+
if (resIncr == 1) {
|
|
730
|
+
convertArrayPointerF32toBF16(result, rows, res);
|
|
731
|
+
} else {
|
|
732
|
+
convertArrayPointerF32toBF16<true>(result, rows, res, resIncr);
|
|
733
|
+
}
|
|
734
|
+
}
|
|
735
|
+
#endif
|
|
736
|
+
|
|
737
|
+
#undef MAX_BFLOAT16_VEC_ACC
|
|
738
|
+
#undef BFLOAT16_UNROLL
|
|
739
|
+
|
|
740
|
+
} // namespace internal
|
|
741
|
+
} // namespace Eigen
|
|
742
|
+
#endif // EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H
|