@smake/eigen 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/eigen/Eigen/AccelerateSupport +52 -0
- package/eigen/Eigen/Cholesky +18 -20
- package/eigen/Eigen/CholmodSupport +28 -28
- package/eigen/Eigen/Core +187 -120
- package/eigen/Eigen/Eigenvalues +16 -13
- package/eigen/Eigen/Geometry +18 -18
- package/eigen/Eigen/Householder +9 -7
- package/eigen/Eigen/IterativeLinearSolvers +8 -4
- package/eigen/Eigen/Jacobi +14 -13
- package/eigen/Eigen/KLUSupport +23 -21
- package/eigen/Eigen/LU +15 -16
- package/eigen/Eigen/MetisSupport +12 -12
- package/eigen/Eigen/OrderingMethods +54 -51
- package/eigen/Eigen/PaStiXSupport +23 -21
- package/eigen/Eigen/PardisoSupport +17 -14
- package/eigen/Eigen/QR +18 -20
- package/eigen/Eigen/QtAlignedMalloc +5 -12
- package/eigen/Eigen/SPQRSupport +21 -14
- package/eigen/Eigen/SVD +23 -17
- package/eigen/Eigen/Sparse +1 -2
- package/eigen/Eigen/SparseCholesky +18 -15
- package/eigen/Eigen/SparseCore +18 -17
- package/eigen/Eigen/SparseLU +9 -9
- package/eigen/Eigen/SparseQR +16 -14
- package/eigen/Eigen/StdDeque +5 -2
- package/eigen/Eigen/StdList +5 -2
- package/eigen/Eigen/StdVector +5 -2
- package/eigen/Eigen/SuperLUSupport +30 -24
- package/eigen/Eigen/ThreadPool +80 -0
- package/eigen/Eigen/UmfPackSupport +19 -17
- package/eigen/Eigen/Version +14 -0
- package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
- package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/LDLT.h +366 -405
- package/eigen/Eigen/src/Cholesky/LLT.h +323 -367
- package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
- package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +585 -529
- package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/ArithmeticSequence.h +143 -317
- package/eigen/Eigen/src/Core/Array.h +329 -370
- package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
- package/eigen/Eigen/src/Core/ArrayWrapper.h +126 -170
- package/eigen/Eigen/src/Core/Assign.h +30 -40
- package/eigen/Eigen/src/Core/AssignEvaluator.h +651 -604
- package/eigen/Eigen/src/Core/Assign_MKL.h +125 -120
- package/eigen/Eigen/src/Core/BandMatrix.h +267 -282
- package/eigen/Eigen/src/Core/Block.h +371 -390
- package/eigen/Eigen/src/Core/CommaInitializer.h +85 -100
- package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
- package/eigen/Eigen/src/Core/CoreEvaluators.h +1214 -937
- package/eigen/Eigen/src/Core/CoreIterators.h +72 -63
- package/eigen/Eigen/src/Core/CwiseBinaryOp.h +112 -129
- package/eigen/Eigen/src/Core/CwiseNullaryOp.h +676 -702
- package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
- package/eigen/Eigen/src/Core/CwiseUnaryOp.h +55 -67
- package/eigen/Eigen/src/Core/CwiseUnaryView.h +127 -92
- package/eigen/Eigen/src/Core/DenseBase.h +630 -658
- package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -628
- package/eigen/Eigen/src/Core/DenseStorage.h +511 -590
- package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
- package/eigen/Eigen/src/Core/Diagonal.h +168 -207
- package/eigen/Eigen/src/Core/DiagonalMatrix.h +346 -317
- package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
- package/eigen/Eigen/src/Core/Dot.h +167 -217
- package/eigen/Eigen/src/Core/EigenBase.h +74 -85
- package/eigen/Eigen/src/Core/Fill.h +138 -0
- package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
- package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -113
- package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
- package/eigen/Eigen/src/Core/GeneralProduct.h +315 -261
- package/eigen/Eigen/src/Core/GenericPacketMath.h +1182 -520
- package/eigen/Eigen/src/Core/GlobalFunctions.h +193 -157
- package/eigen/Eigen/src/Core/IO.h +131 -156
- package/eigen/Eigen/src/Core/IndexedView.h +209 -125
- package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
- package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/Inverse.h +50 -59
- package/eigen/Eigen/src/Core/Map.h +123 -141
- package/eigen/Eigen/src/Core/MapBase.h +255 -282
- package/eigen/Eigen/src/Core/MathFunctions.h +1247 -1201
- package/eigen/Eigen/src/Core/MathFunctionsImpl.h +162 -99
- package/eigen/Eigen/src/Core/Matrix.h +463 -494
- package/eigen/Eigen/src/Core/MatrixBase.h +468 -470
- package/eigen/Eigen/src/Core/NestByValue.h +58 -52
- package/eigen/Eigen/src/Core/NoAlias.h +79 -86
- package/eigen/Eigen/src/Core/NumTraits.h +206 -206
- package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +163 -142
- package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
- package/eigen/Eigen/src/Core/PlainObjectBase.h +858 -972
- package/eigen/Eigen/src/Core/Product.h +246 -130
- package/eigen/Eigen/src/Core/ProductEvaluators.h +779 -671
- package/eigen/Eigen/src/Core/Random.h +153 -164
- package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
- package/eigen/Eigen/src/Core/RealView.h +250 -0
- package/eigen/Eigen/src/Core/Redux.h +334 -314
- package/eigen/Eigen/src/Core/Ref.h +259 -257
- package/eigen/Eigen/src/Core/Replicate.h +92 -104
- package/eigen/Eigen/src/Core/Reshaped.h +215 -271
- package/eigen/Eigen/src/Core/ReturnByValue.h +47 -55
- package/eigen/Eigen/src/Core/Reverse.h +133 -148
- package/eigen/Eigen/src/Core/Select.h +68 -140
- package/eigen/Eigen/src/Core/SelfAdjointView.h +254 -290
- package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
- package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
- package/eigen/Eigen/src/Core/Solve.h +88 -102
- package/eigen/Eigen/src/Core/SolveTriangular.h +126 -124
- package/eigen/Eigen/src/Core/SolverBase.h +132 -133
- package/eigen/Eigen/src/Core/StableNorm.h +113 -147
- package/eigen/Eigen/src/Core/StlIterators.h +404 -248
- package/eigen/Eigen/src/Core/Stride.h +90 -92
- package/eigen/Eigen/src/Core/Swap.h +70 -39
- package/eigen/Eigen/src/Core/Transpose.h +258 -295
- package/eigen/Eigen/src/Core/Transpositions.h +270 -333
- package/eigen/Eigen/src/Core/TriangularMatrix.h +642 -743
- package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
- package/eigen/Eigen/src/Core/VectorwiseOp.h +653 -704
- package/eigen/Eigen/src/Core/Visitor.h +464 -308
- package/eigen/Eigen/src/Core/arch/AVX/Complex.h +380 -187
- package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +65 -163
- package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2145 -638
- package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
- package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +253 -60
- package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +278 -228
- package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +48 -269
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1597 -754
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
- package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +229 -41
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +420 -184
- package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +40 -49
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2962 -2213
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +196 -212
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +713 -441
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2380 -1362
- package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
- package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +390 -224
- package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +78 -67
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1784 -799
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +167 -50
- package/eigen/Eigen/src/Core/arch/Default/Half.h +528 -379
- package/eigen/Eigen/src/Core/arch/Default/Settings.h +10 -12
- package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
- package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +41 -40
- package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +550 -523
- package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
- package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +27 -30
- package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +8 -8
- package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
- package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
- package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
- package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
- package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
- package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
- package/eigen/Eigen/src/Core/arch/MSA/Complex.h +54 -82
- package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +84 -92
- package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +51 -47
- package/eigen/Eigen/src/Core/arch/NEON/Complex.h +454 -306
- package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +175 -115
- package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +23 -30
- package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4366 -2857
- package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +616 -393
- package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
- package/eigen/Eigen/src/Core/arch/SSE/Complex.h +350 -198
- package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +38 -149
- package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +1791 -912
- package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
- package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +128 -40
- package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +10 -6
- package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +156 -234
- package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +6 -3
- package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +27 -32
- package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +119 -117
- package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +325 -419
- package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +15 -17
- package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +325 -181
- package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +94 -83
- package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +811 -458
- package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +121 -124
- package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +576 -370
- package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +194 -109
- package/eigen/Eigen/src/Core/functors/StlFunctors.h +95 -112
- package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
- package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1038 -749
- package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1883 -1375
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +312 -370
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +189 -176
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +84 -81
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +292 -337
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
- package/eigen/Eigen/src/Core/products/Parallelizer.h +207 -105
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +327 -388
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +138 -147
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
- package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
- package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -47
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -277
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
- package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +68 -94
- package/eigen/Eigen/src/Core/util/Assert.h +158 -0
- package/eigen/Eigen/src/Core/util/BlasUtil.h +342 -303
- package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +348 -317
- package/eigen/Eigen/src/Core/util/Constants.h +297 -262
- package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -90
- package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
- package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +449 -247
- package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
- package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
- package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +417 -116
- package/eigen/Eigen/src/Core/util/IntegralConstant.h +211 -204
- package/eigen/Eigen/src/Core/util/MKL_support.h +39 -37
- package/eigen/Eigen/src/Core/util/Macros.h +655 -773
- package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
- package/eigen/Eigen/src/Core/util/Memory.h +970 -748
- package/eigen/Eigen/src/Core/util/Meta.h +581 -633
- package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
- package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
- package/eigen/Eigen/src/Core/util/ReshapedHelper.h +17 -17
- package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
- package/eigen/Eigen/src/Core/util/StaticAssert.h +50 -166
- package/eigen/Eigen/src/Core/util/SymbolicIndex.h +377 -225
- package/eigen/Eigen/src/Core/util/XprHelper.h +784 -547
- package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
- package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
- package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
- package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
- package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
- package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +89 -105
- package/eigen/Eigen/src/Eigenvalues/RealQZ.h +537 -607
- package/eigen/Eigen/src/Eigenvalues/RealSchur.h +342 -381
- package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +541 -595
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
- package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +430 -462
- package/eigen/Eigen/src/Geometry/AlignedBox.h +226 -227
- package/eigen/Eigen/src/Geometry/AngleAxis.h +131 -133
- package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
- package/eigen/Eigen/src/Geometry/Homogeneous.h +285 -333
- package/eigen/Eigen/src/Geometry/Hyperplane.h +151 -160
- package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -146
- package/eigen/Eigen/src/Geometry/ParametrizedLine.h +127 -127
- package/eigen/Eigen/src/Geometry/Quaternion.h +566 -506
- package/eigen/Eigen/src/Geometry/Rotation2D.h +107 -105
- package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
- package/eigen/Eigen/src/Geometry/Scaling.h +113 -106
- package/eigen/Eigen/src/Geometry/Transform.h +858 -936
- package/eigen/Eigen/src/Geometry/Translation.h +94 -92
- package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
- package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +90 -104
- package/eigen/Eigen/src/Householder/BlockHouseholder.h +51 -46
- package/eigen/Eigen/src/Householder/Householder.h +102 -124
- package/eigen/Eigen/src/Householder/HouseholderSequence.h +412 -453
- package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +149 -162
- package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +124 -119
- package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +92 -104
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +251 -243
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +224 -228
- package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +178 -227
- package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +79 -84
- package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +54 -60
- package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Jacobi/Jacobi.h +252 -308
- package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/KLUSupport/KLUSupport.h +208 -227
- package/eigen/Eigen/src/LU/Determinant.h +50 -69
- package/eigen/Eigen/src/LU/FullPivLU.h +545 -596
- package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/LU/InverseImpl.h +206 -285
- package/eigen/Eigen/src/LU/PartialPivLU.h +390 -428
- package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
- package/eigen/Eigen/src/LU/arch/InverseSize4.h +72 -70
- package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
- package/eigen/Eigen/src/OrderingMethods/Amd.h +243 -265
- package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +831 -1004
- package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/OrderingMethods/Ordering.h +112 -119
- package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
- package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -430
- package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +479 -479
- package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
- package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +166 -153
- package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +495 -475
- package/eigen/Eigen/src/QR/HouseholderQR.h +394 -285
- package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
- package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +244 -264
- package/eigen/Eigen/src/SVD/BDCSVD.h +817 -713
- package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
- package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SVD/JacobiSVD.h +577 -543
- package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
- package/eigen/Eigen/src/SVD/SVDBase.h +242 -182
- package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +200 -235
- package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +765 -594
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +308 -94
- package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
- package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -252
- package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +134 -178
- package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCore/SparseAssign.h +149 -140
- package/eigen/Eigen/src/SparseCore/SparseBlock.h +403 -440
- package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
- package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +525 -303
- package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +555 -339
- package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
- package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +169 -197
- package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
- package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
- package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
- package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
- package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1603 -1245
- package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -350
- package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
- package/eigen/Eigen/src/SparseCore/SparseProduct.h +94 -97
- package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
- package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
- package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +370 -416
- package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
- package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
- package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
- package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
- package/eigen/Eigen/src/SparseCore/SparseUtil.h +138 -115
- package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
- package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
- package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
- package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseLU/SparseLU.h +756 -710
- package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
- package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
- package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
- package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +245 -301
- package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
- package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
- package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +89 -100
- package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
- package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
- package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +124 -132
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
- package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
- package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
- package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
- package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseQR/SparseQR.h +450 -502
- package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -93
- package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
- package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
- package/eigen/Eigen/src/StlSupport/details.h +48 -50
- package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -730
- package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
- package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
- package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
- package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
- package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
- package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
- package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
- package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
- package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
- package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
- package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
- package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
- package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +428 -464
- package/eigen/Eigen/src/misc/Image.h +41 -43
- package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/misc/Kernel.h +39 -41
- package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
- package/eigen/Eigen/src/misc/blas.h +83 -426
- package/eigen/Eigen/src/misc/lapacke.h +9972 -16179
- package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
- package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
- package/eigen/Eigen/src/plugins/{BlockMethods.h → BlockMethods.inc} +434 -506
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
- package/eigen/Eigen/src/plugins/{CommonCwiseUnaryOps.h → CommonCwiseUnaryOps.inc} +58 -68
- package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
- package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
- package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
- package/package.json +1 -1
- package/eigen/COPYING.APACHE +0 -203
- package/eigen/COPYING.BSD +0 -26
- package/eigen/COPYING.GPL +0 -674
- package/eigen/COPYING.LGPL +0 -502
- package/eigen/COPYING.MINPACK +0 -51
- package/eigen/COPYING.MPL2 +0 -373
- package/eigen/COPYING.README +0 -18
- package/eigen/Eigen/src/Core/BooleanRedux.h +0 -162
- package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -258
- package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +0 -120
- package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +0 -694
- package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
- package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
- package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
- package/eigen/Eigen/src/misc/lapack.h +0 -152
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -358
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -696
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
- package/eigen/Eigen/src/plugins/IndexedViewMethods.h +0 -262
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -95
- package/eigen/Eigen/src/plugins/ReshapedMethods.h +0 -149
- package/eigen/README.md +0 -5
|
@@ -16,26 +16,69 @@ limitations under the License.
|
|
|
16
16
|
#ifndef EIGEN_BFLOAT16_H
|
|
17
17
|
#define EIGEN_BFLOAT16_H
|
|
18
18
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
19
|
+
// IWYU pragma: private
|
|
20
|
+
#include "../../InternalHeaderCheck.h"
|
|
21
|
+
|
|
22
|
+
#if defined(EIGEN_HAS_HIP_BF16)
|
|
23
|
+
// When compiling with GPU support, the "hip_bfloat16" base class as well as
|
|
24
|
+
// some other routines are defined in the GPU compiler header files
|
|
25
|
+
// (hip_bfloat16.h), and they are not tagged constexpr.
|
|
26
|
+
// As a consequence, we get compile failures when compiling Eigen with
|
|
27
|
+
// GPU support. Hence the need to disable EIGEN_CONSTEXPR when building
|
|
28
|
+
// Eigen with GPU support
|
|
29
|
+
#pragma push_macro("EIGEN_CONSTEXPR")
|
|
30
|
+
#undef EIGEN_CONSTEXPR
|
|
31
|
+
#define EIGEN_CONSTEXPR
|
|
32
|
+
#endif
|
|
33
|
+
|
|
34
|
+
#define BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, METHOD) \
|
|
35
|
+
template <> \
|
|
36
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED PACKET_BF16 METHOD<PACKET_BF16>( \
|
|
37
|
+
const PACKET_BF16& _x) { \
|
|
38
|
+
return F32ToBf16(METHOD<PACKET_F>(Bf16ToF32(_x))); \
|
|
24
39
|
}
|
|
25
40
|
|
|
41
|
+
// Only use HIP GPU bf16 in kernels
|
|
42
|
+
#if defined(EIGEN_HAS_HIP_BF16) && defined(EIGEN_GPU_COMPILE_PHASE)
|
|
43
|
+
#define EIGEN_USE_HIP_BF16
|
|
44
|
+
#endif
|
|
45
|
+
|
|
26
46
|
namespace Eigen {
|
|
27
47
|
|
|
28
48
|
struct bfloat16;
|
|
29
49
|
|
|
50
|
+
namespace numext {
|
|
51
|
+
template <>
|
|
52
|
+
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bit_cast<Eigen::bfloat16, uint16_t>(const uint16_t& src);
|
|
53
|
+
|
|
54
|
+
template <>
|
|
55
|
+
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast<uint16_t, Eigen::bfloat16>(const Eigen::bfloat16& src);
|
|
56
|
+
} // namespace numext
|
|
30
57
|
namespace bfloat16_impl {
|
|
31
58
|
|
|
59
|
+
#if defined(EIGEN_USE_HIP_BF16)
|
|
60
|
+
|
|
61
|
+
struct __bfloat16_raw : public hip_bfloat16 {
|
|
62
|
+
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw() {}
|
|
63
|
+
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw(hip_bfloat16 hb) : hip_bfloat16(hb) {}
|
|
64
|
+
explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw(unsigned short raw) : hip_bfloat16(raw) {}
|
|
65
|
+
};
|
|
66
|
+
|
|
67
|
+
#else
|
|
68
|
+
|
|
32
69
|
// Make our own __bfloat16_raw definition.
|
|
33
70
|
struct __bfloat16_raw {
|
|
71
|
+
#if defined(EIGEN_HAS_HIP_BF16) && !defined(EIGEN_GPU_COMPILE_PHASE)
|
|
72
|
+
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw() {}
|
|
73
|
+
#else
|
|
34
74
|
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw() : value(0) {}
|
|
75
|
+
#endif
|
|
35
76
|
explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw(unsigned short raw) : value(raw) {}
|
|
36
77
|
unsigned short value;
|
|
37
78
|
};
|
|
38
79
|
|
|
80
|
+
#endif // defined(EIGEN_USE_HIP_BF16)
|
|
81
|
+
|
|
39
82
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(unsigned short value);
|
|
40
83
|
template <bool AssumeArgumentIsNormalOrInfinityOrZero>
|
|
41
84
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff);
|
|
@@ -52,11 +95,10 @@ struct bfloat16_base : public __bfloat16_raw {
|
|
|
52
95
|
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16_base(const __bfloat16_raw& h) : __bfloat16_raw(h) {}
|
|
53
96
|
};
|
|
54
97
|
|
|
55
|
-
}
|
|
98
|
+
} // namespace bfloat16_impl
|
|
56
99
|
|
|
57
100
|
// Class definition.
|
|
58
101
|
struct bfloat16 : public bfloat16_impl::bfloat16_base {
|
|
59
|
-
|
|
60
102
|
typedef bfloat16_impl::__bfloat16_raw __bfloat16_raw;
|
|
61
103
|
|
|
62
104
|
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16() {}
|
|
@@ -66,16 +108,17 @@ struct bfloat16 : public bfloat16_impl::bfloat16_base {
|
|
|
66
108
|
explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(bool b)
|
|
67
109
|
: bfloat16_impl::bfloat16_base(bfloat16_impl::raw_uint16_to_bfloat16(b ? 0x3f80 : 0)) {}
|
|
68
110
|
|
|
69
|
-
template<class T>
|
|
111
|
+
template <class T>
|
|
70
112
|
explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(T val)
|
|
71
|
-
: bfloat16_impl::bfloat16_base(
|
|
113
|
+
: bfloat16_impl::bfloat16_base(
|
|
114
|
+
bfloat16_impl::float_to_bfloat16_rtne<internal::is_integral<T>::value>(static_cast<float>(val))) {}
|
|
72
115
|
|
|
73
116
|
explicit EIGEN_DEVICE_FUNC bfloat16(float f)
|
|
74
117
|
: bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<false>(f)) {}
|
|
75
118
|
|
|
76
119
|
// Following the convention of numpy, converting between complex and
|
|
77
120
|
// float will lead to loss of imag value.
|
|
78
|
-
template<typename RealScalar>
|
|
121
|
+
template <typename RealScalar>
|
|
79
122
|
explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(const std::complex<RealScalar>& val)
|
|
80
123
|
: bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<false>(static_cast<float>(val.real()))) {}
|
|
81
124
|
|
|
@@ -83,57 +126,122 @@ struct bfloat16 : public bfloat16_impl::bfloat16_base {
|
|
|
83
126
|
return bfloat16_impl::bfloat16_to_float(*this);
|
|
84
127
|
}
|
|
85
128
|
};
|
|
86
|
-
} // namespace Eigen
|
|
87
129
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
static const bool
|
|
94
|
-
static const bool
|
|
95
|
-
static const bool
|
|
96
|
-
static const bool
|
|
97
|
-
static const bool
|
|
98
|
-
static const
|
|
99
|
-
static const bool
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
static const
|
|
103
|
-
static const bool
|
|
104
|
-
|
|
105
|
-
static const
|
|
106
|
-
static const
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
static const
|
|
110
|
-
static const
|
|
111
|
-
static const int
|
|
112
|
-
static const
|
|
113
|
-
static const
|
|
114
|
-
|
|
115
|
-
static
|
|
116
|
-
static
|
|
117
|
-
static
|
|
118
|
-
static
|
|
119
|
-
static
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
static
|
|
123
|
-
|
|
130
|
+
// TODO(majnemer): Get rid of this once we can rely on C++17 inline variables to
|
|
131
|
+
// solve the ODR issue.
|
|
132
|
+
namespace bfloat16_impl {
|
|
133
|
+
template <typename = void>
|
|
134
|
+
struct numeric_limits_bfloat16_impl {
|
|
135
|
+
static EIGEN_CONSTEXPR const bool is_specialized = true;
|
|
136
|
+
static EIGEN_CONSTEXPR const bool is_signed = true;
|
|
137
|
+
static EIGEN_CONSTEXPR const bool is_integer = false;
|
|
138
|
+
static EIGEN_CONSTEXPR const bool is_exact = false;
|
|
139
|
+
static EIGEN_CONSTEXPR const bool has_infinity = true;
|
|
140
|
+
static EIGEN_CONSTEXPR const bool has_quiet_NaN = true;
|
|
141
|
+
static EIGEN_CONSTEXPR const bool has_signaling_NaN = true;
|
|
142
|
+
EIGEN_DIAGNOSTICS(push)
|
|
143
|
+
EIGEN_DISABLE_DEPRECATED_WARNING
|
|
144
|
+
static EIGEN_CONSTEXPR const std::float_denorm_style has_denorm = std::denorm_present;
|
|
145
|
+
static EIGEN_CONSTEXPR const bool has_denorm_loss = false;
|
|
146
|
+
EIGEN_DIAGNOSTICS(pop)
|
|
147
|
+
static EIGEN_CONSTEXPR const std::float_round_style round_style = std::numeric_limits<float>::round_style;
|
|
148
|
+
static EIGEN_CONSTEXPR const bool is_iec559 = true;
|
|
149
|
+
// The C++ standard defines this as "true if the set of values representable
|
|
150
|
+
// by the type is finite." BFloat16 has finite precision.
|
|
151
|
+
static EIGEN_CONSTEXPR const bool is_bounded = true;
|
|
152
|
+
static EIGEN_CONSTEXPR const bool is_modulo = false;
|
|
153
|
+
static EIGEN_CONSTEXPR const int digits = 8;
|
|
154
|
+
static EIGEN_CONSTEXPR const int digits10 = 2;
|
|
155
|
+
static EIGEN_CONSTEXPR const int max_digits10 = 4;
|
|
156
|
+
static EIGEN_CONSTEXPR const int radix = std::numeric_limits<float>::radix;
|
|
157
|
+
static EIGEN_CONSTEXPR const int min_exponent = std::numeric_limits<float>::min_exponent;
|
|
158
|
+
static EIGEN_CONSTEXPR const int min_exponent10 = std::numeric_limits<float>::min_exponent10;
|
|
159
|
+
static EIGEN_CONSTEXPR const int max_exponent = std::numeric_limits<float>::max_exponent;
|
|
160
|
+
static EIGEN_CONSTEXPR const int max_exponent10 = std::numeric_limits<float>::max_exponent10;
|
|
161
|
+
static EIGEN_CONSTEXPR const bool traps = std::numeric_limits<float>::traps;
|
|
162
|
+
// IEEE754: "The implementer shall choose how tininess is detected, but shall
|
|
163
|
+
// detect tininess in the same way for all operations in radix two"
|
|
164
|
+
static EIGEN_CONSTEXPR const bool tinyness_before = std::numeric_limits<float>::tinyness_before;
|
|
165
|
+
|
|
166
|
+
static EIGEN_CONSTEXPR Eigen::bfloat16(min)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0080); }
|
|
167
|
+
static EIGEN_CONSTEXPR Eigen::bfloat16 lowest() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0xff7f); }
|
|
168
|
+
static EIGEN_CONSTEXPR Eigen::bfloat16(max)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f7f); }
|
|
169
|
+
static EIGEN_CONSTEXPR Eigen::bfloat16 epsilon() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3c00); }
|
|
170
|
+
static EIGEN_CONSTEXPR Eigen::bfloat16 round_error() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3f00); }
|
|
171
|
+
static EIGEN_CONSTEXPR Eigen::bfloat16 infinity() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f80); }
|
|
172
|
+
static EIGEN_CONSTEXPR Eigen::bfloat16 quiet_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7fc0); }
|
|
173
|
+
static EIGEN_CONSTEXPR Eigen::bfloat16 signaling_NaN() {
|
|
174
|
+
return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7fa0);
|
|
175
|
+
}
|
|
176
|
+
static EIGEN_CONSTEXPR Eigen::bfloat16 denorm_min() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0001); }
|
|
124
177
|
};
|
|
125
178
|
|
|
179
|
+
template <typename T>
|
|
180
|
+
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_specialized;
|
|
181
|
+
template <typename T>
|
|
182
|
+
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_signed;
|
|
183
|
+
template <typename T>
|
|
184
|
+
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_integer;
|
|
185
|
+
template <typename T>
|
|
186
|
+
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_exact;
|
|
187
|
+
template <typename T>
|
|
188
|
+
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::has_infinity;
|
|
189
|
+
template <typename T>
|
|
190
|
+
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::has_quiet_NaN;
|
|
191
|
+
template <typename T>
|
|
192
|
+
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::has_signaling_NaN;
|
|
193
|
+
EIGEN_DIAGNOSTICS(push)
|
|
194
|
+
EIGEN_DISABLE_DEPRECATED_WARNING
|
|
195
|
+
template <typename T>
|
|
196
|
+
EIGEN_CONSTEXPR const std::float_denorm_style numeric_limits_bfloat16_impl<T>::has_denorm;
|
|
197
|
+
template <typename T>
|
|
198
|
+
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::has_denorm_loss;
|
|
199
|
+
EIGEN_DIAGNOSTICS(pop)
|
|
200
|
+
template <typename T>
|
|
201
|
+
EIGEN_CONSTEXPR const std::float_round_style numeric_limits_bfloat16_impl<T>::round_style;
|
|
202
|
+
template <typename T>
|
|
203
|
+
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_iec559;
|
|
204
|
+
template <typename T>
|
|
205
|
+
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_bounded;
|
|
206
|
+
template <typename T>
|
|
207
|
+
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_modulo;
|
|
208
|
+
template <typename T>
|
|
209
|
+
EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::digits;
|
|
210
|
+
template <typename T>
|
|
211
|
+
EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::digits10;
|
|
212
|
+
template <typename T>
|
|
213
|
+
EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::max_digits10;
|
|
214
|
+
template <typename T>
|
|
215
|
+
EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::radix;
|
|
216
|
+
template <typename T>
|
|
217
|
+
EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::min_exponent;
|
|
218
|
+
template <typename T>
|
|
219
|
+
EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::min_exponent10;
|
|
220
|
+
template <typename T>
|
|
221
|
+
EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::max_exponent;
|
|
222
|
+
template <typename T>
|
|
223
|
+
EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::max_exponent10;
|
|
224
|
+
template <typename T>
|
|
225
|
+
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::traps;
|
|
226
|
+
template <typename T>
|
|
227
|
+
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::tinyness_before;
|
|
228
|
+
} // end namespace bfloat16_impl
|
|
229
|
+
} // end namespace Eigen
|
|
230
|
+
|
|
231
|
+
namespace std {
|
|
126
232
|
// If std::numeric_limits<T> is specialized, should also specialize
|
|
127
233
|
// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
|
|
128
234
|
// std::numeric_limits<const volatile T>
|
|
129
235
|
// https://stackoverflow.com/a/16519653/
|
|
130
|
-
template<>
|
|
131
|
-
|
|
132
|
-
template<>
|
|
133
|
-
|
|
134
|
-
template<>
|
|
135
|
-
|
|
136
|
-
|
|
236
|
+
template <>
|
|
237
|
+
class numeric_limits<Eigen::bfloat16> : public Eigen::bfloat16_impl::numeric_limits_bfloat16_impl<> {};
|
|
238
|
+
template <>
|
|
239
|
+
class numeric_limits<const Eigen::bfloat16> : public numeric_limits<Eigen::bfloat16> {};
|
|
240
|
+
template <>
|
|
241
|
+
class numeric_limits<volatile Eigen::bfloat16> : public numeric_limits<Eigen::bfloat16> {};
|
|
242
|
+
template <>
|
|
243
|
+
class numeric_limits<const volatile Eigen::bfloat16> : public numeric_limits<Eigen::bfloat16> {};
|
|
244
|
+
} // end namespace std
|
|
137
245
|
|
|
138
246
|
namespace Eigen {
|
|
139
247
|
|
|
@@ -142,15 +250,15 @@ namespace bfloat16_impl {
|
|
|
142
250
|
// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
|
|
143
251
|
// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
|
|
144
252
|
// of the functions, while the latter can only deal with one of them.
|
|
145
|
-
#if !defined(EIGEN_HAS_NATIVE_BF16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
|
|
253
|
+
#if !defined(EIGEN_HAS_NATIVE_BF16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for bfloat16 floats
|
|
146
254
|
|
|
147
255
|
#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
|
|
148
256
|
// We need to provide emulated *host-side* BF16 operators for clang.
|
|
149
257
|
#pragma push_macro("EIGEN_DEVICE_FUNC")
|
|
150
258
|
#undef EIGEN_DEVICE_FUNC
|
|
151
|
-
#if defined(
|
|
259
|
+
#if (defined(EIGEN_HAS_GPU_BF16) && defined(EIGEN_HAS_NATIVE_BF16))
|
|
152
260
|
#define EIGEN_DEVICE_FUNC __host__
|
|
153
|
-
#else
|
|
261
|
+
#else // both host and device need emulated ops.
|
|
154
262
|
#define EIGEN_DEVICE_FUNC __host__ __device__
|
|
155
263
|
#endif
|
|
156
264
|
#endif
|
|
@@ -158,42 +266,41 @@ namespace bfloat16_impl {
|
|
|
158
266
|
// Definitions for CPUs, mostly working through conversion
|
|
159
267
|
// to/from fp32.
|
|
160
268
|
|
|
161
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator
|
|
269
|
+
// Adds two bfloat16 values: the sum is formed in float and converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator+(const bfloat16& a, const bfloat16& b) {
  const float sum = static_cast<float>(a) + static_cast<float>(b);
  return bfloat16(sum);
}
|
|
164
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator
|
|
272
|
+
// Adds an int to a bfloat16: both operands are converted to float first.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator+(const bfloat16& a, const int& b) {
  const float sum = static_cast<float>(a) + static_cast<float>(b);
  return bfloat16(sum);
}
|
|
167
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator
|
|
275
|
+
// Adds a bfloat16 to an int: both operands are converted to float first.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator+(const int& a, const bfloat16& b) {
  const float sum = static_cast<float>(a) + static_cast<float>(b);
  return bfloat16(sum);
}
|
|
170
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator
|
|
278
|
+
// Multiplies two bfloat16 values: the product is formed in float and converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator*(const bfloat16& a, const bfloat16& b) {
  const float product = static_cast<float>(a) * static_cast<float>(b);
  return bfloat16(product);
}
|
|
173
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator
|
|
281
|
+
// Subtracts b from a: the difference is formed in float and converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator-(const bfloat16& a, const bfloat16& b) {
  const float difference = static_cast<float>(a) - static_cast<float>(b);
  return bfloat16(difference);
}
|
|
176
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator
|
|
284
|
+
// Divides a by b: the quotient is formed in float and converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator/(const bfloat16& a, const bfloat16& b) {
  const float quotient = static_cast<float>(a) / static_cast<float>(b);
  return bfloat16(quotient);
}
|
|
179
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
return result;
|
|
287
|
+
// Unary negation. Works directly on the 16-bit representation: XOR with
// 0x8000 flips the top (sign) bit and leaves every other bit untouched, so no
// round trip through float is needed.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator-(const bfloat16& a) {
  // Use the numext-qualified integer type for the cast as well as for the
  // local, instead of relying on an unqualified global `uint16_t`.
  numext::uint16_t x = numext::bit_cast<numext::uint16_t>(a) ^ 0x8000;
  return numext::bit_cast<bfloat16>(x);
}
|
|
184
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator
|
|
291
|
+
// In-place addition: computes a + b in float, stores the converted result in
// a, and returns a reference to a.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator+=(bfloat16& a, const bfloat16& b) {
  const float sum = static_cast<float>(a) + static_cast<float>(b);
  a = bfloat16(sum);
  return a;
}
|
|
188
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator
|
|
295
|
+
// In-place multiplication: computes a * b in float, stores the converted
// result in a, and returns a reference to a.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator*=(bfloat16& a, const bfloat16& b) {
  const float product = static_cast<float>(a) * static_cast<float>(b);
  a = bfloat16(product);
  return a;
}
|
|
192
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator
|
|
299
|
+
// In-place subtraction: computes a - b in float, stores the converted result
// in a, and returns a reference to a.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator-=(bfloat16& a, const bfloat16& b) {
  const float difference = static_cast<float>(a) - static_cast<float>(b);
  a = bfloat16(difference);
  return a;
}
|
|
196
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator
|
|
303
|
+
// In-place division: computes a / b in float, stores the converted result in
// a, and returns a reference to a.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator/=(bfloat16& a, const bfloat16& b) {
  const float quotient = static_cast<float>(a) / static_cast<float>(b);
  a = bfloat16(quotient);
  return a;
}
|
|
@@ -215,22 +322,22 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator--(bfloat16& a, int) {
|
|
|
215
322
|
--a;
|
|
216
323
|
return original_value;
|
|
217
324
|
}
|
|
218
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator
|
|
219
|
-
return numext::equal_strict(float(a),float(b));
|
|
325
|
+
// Equality: compares the float values of both operands with numext::equal_strict.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const bfloat16& a, const bfloat16& b) {
  const float lhs = static_cast<float>(a);
  const float rhs = static_cast<float>(b);
  return numext::equal_strict(lhs, rhs);
}
|
|
221
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator
|
|
328
|
+
// Inequality: compares the float values of both operands with
// numext::not_equal_strict.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const bfloat16& a, const bfloat16& b) {
  const float lhs = static_cast<float>(a);
  const float rhs = static_cast<float>(b);
  return numext::not_equal_strict(lhs, rhs);
}
|
|
224
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator
|
|
331
|
+
// Less-than, evaluated on the float values of the operands.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const bfloat16& a, const bfloat16& b) {
  const float lhs = static_cast<float>(a);
  const float rhs = static_cast<float>(b);
  return lhs < rhs;
}
|
|
227
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator
|
|
334
|
+
// Less-than-or-equal, evaluated on the float values of the operands.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const bfloat16& a, const bfloat16& b) {
  const float lhs = static_cast<float>(a);
  const float rhs = static_cast<float>(b);
  return lhs <= rhs;
}
|
|
230
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator
|
|
337
|
+
// Greater-than, evaluated on the float values of the operands.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const bfloat16& a, const bfloat16& b) {
  const float lhs = static_cast<float>(a);
  const float rhs = static_cast<float>(b);
  return lhs > rhs;
}
|
|
233
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator
|
|
340
|
+
// Greater-than-or-equal, evaluated on the float values of the operands.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const bfloat16& a, const bfloat16& b) {
  const float lhs = static_cast<float>(a);
  const float rhs = static_cast<float>(b);
  return lhs >= rhs;
}
|
|
236
343
|
|
|
@@ -241,49 +348,59 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const bfloat16& a, const
|
|
|
241
348
|
|
|
242
349
|
// Division by an index. Do it in full float precision to avoid accuracy
|
|
243
350
|
// issues in converting the denominator to bfloat16.
|
|
244
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator
|
|
351
|
+
// Divides a bfloat16 by an Index. The denominator is converted straight to
// float (not to bfloat16), so the division itself runs at float precision.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator/(const bfloat16& a, Index b) {
  const float numerator = static_cast<float>(a);
  const float denominator = static_cast<float>(b);
  return bfloat16(numerator / denominator);
}
|
|
247
354
|
|
|
248
355
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw truncate_to_bfloat16(const float v) {
|
|
356
|
+
#if defined(EIGEN_USE_HIP_BF16)
|
|
357
|
+
return __bfloat16_raw(__bfloat16_raw::round_to_bfloat16(v, __bfloat16_raw::truncate));
|
|
358
|
+
#else
|
|
249
359
|
__bfloat16_raw output;
|
|
250
|
-
if (
|
|
251
|
-
output.value = std::signbit(v) ? 0xFFC0: 0x7FC0;
|
|
360
|
+
if (numext::isnan EIGEN_NOT_A_MACRO(v)) {
|
|
361
|
+
output.value = std::signbit(v) ? 0xFFC0 : 0x7FC0;
|
|
252
362
|
return output;
|
|
253
363
|
}
|
|
254
|
-
|
|
255
|
-
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
|
256
|
-
output.value = p[0];
|
|
257
|
-
#else
|
|
258
|
-
output.value = p[1];
|
|
259
|
-
#endif
|
|
364
|
+
output.value = static_cast<numext::uint16_t>(numext::bit_cast<numext::uint32_t>(v) >> 16);
|
|
260
365
|
return output;
|
|
366
|
+
#endif
|
|
261
367
|
}
|
|
262
368
|
|
|
263
369
|
// Wraps a raw 16-bit pattern in a __bfloat16_raw without any numeric
// conversion. The default build constructs the raw type from the value; the
// EIGEN_USE_HIP_BF16 build assigns the pattern to the `data` member instead.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(numext::uint16_t value) {
#if !defined(EIGEN_USE_HIP_BF16)
  return __bfloat16_raw(value);
#else
  __bfloat16_raw raw;
  raw.data = value;
  return raw;
#endif
}
|
|
266
378
|
|
|
267
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR numext::uint16_t raw_bfloat16_as_uint16(
|
|
379
|
+
// Returns the raw 16-bit pattern stored in a __bfloat16_raw. The member is
// named `value` in the default build and `data` under EIGEN_USE_HIP_BF16.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR numext::uint16_t raw_bfloat16_as_uint16(
    const __bfloat16_raw& bf) {
#if !defined(EIGEN_USE_HIP_BF16)
  return bf.value;
#else
  return bf.data;
#endif
}
|
|
270
387
|
|
|
271
388
|
// float_to_bfloat16_rtne template specialization that does not make any
|
|
272
389
|
// assumption about the value of its function argument (ff).
|
|
273
390
|
template <>
|
|
274
391
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<false>(float ff) {
|
|
275
|
-
#if
|
|
276
|
-
|
|
392
|
+
#if defined(EIGEN_USE_HIP_BF16)
|
|
393
|
+
return __bfloat16_raw(__bfloat16_raw::round_to_bfloat16(ff));
|
|
277
394
|
#else
|
|
278
395
|
__bfloat16_raw output;
|
|
279
396
|
|
|
280
|
-
if (
|
|
397
|
+
if (numext::isnan EIGEN_NOT_A_MACRO(ff)) {
|
|
281
398
|
// If the value is a NaN, squash it to a qNaN with msb of fraction set,
|
|
282
399
|
// this makes sure after truncation we don't end up with an inf.
|
|
283
400
|
//
|
|
284
401
|
// qNaN magic: All exponent bits set + most significant bit of fraction
|
|
285
402
|
// set.
|
|
286
|
-
output.value = std::signbit(ff) ? 0xFFC0: 0x7FC0;
|
|
403
|
+
output.value = std::signbit(ff) ? 0xFFC0 : 0x7FC0;
|
|
287
404
|
} else {
|
|
288
405
|
// Fast rounding algorithm that rounds a half value to nearest even. This
|
|
289
406
|
// reduces expected error when we convert a large number of floats. Here
|
|
@@ -446,134 +563,99 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<fals
|
|
|
446
563
|
// type to bfloat16.
|
|
447
564
|
template <>
|
|
448
565
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<true>(float ff) {
|
|
449
|
-
#if
|
|
450
|
-
|
|
566
|
+
#if defined(EIGEN_USE_HIP_BF16)
|
|
567
|
+
return __bfloat16_raw(__bfloat16_raw::round_to_bfloat16(ff));
|
|
451
568
|
#else
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
569
|
+
numext::uint32_t input = numext::bit_cast<numext::uint32_t>(ff);
|
|
570
|
+
__bfloat16_raw output;
|
|
571
|
+
|
|
572
|
+
// Least significant bit of resulting bfloat.
|
|
573
|
+
numext::uint32_t lsb = (input >> 16) & 1;
|
|
574
|
+
numext::uint32_t rounding_bias = 0x7fff + lsb;
|
|
575
|
+
input += rounding_bias;
|
|
576
|
+
output.value = static_cast<numext::uint16_t>(input >> 16);
|
|
577
|
+
return output;
|
|
461
578
|
#endif
|
|
462
579
|
}
|
|
463
580
|
|
|
464
581
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float bfloat16_to_float(__bfloat16_raw h) {
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
|
468
|
-
q[0] = h.value;
|
|
582
|
+
#if defined(EIGEN_USE_HIP_BF16)
|
|
583
|
+
return static_cast<float>(h);
|
|
469
584
|
#else
|
|
470
|
-
|
|
585
|
+
return numext::bit_cast<float>(static_cast<numext::uint32_t>(h.value) << 16);
|
|
471
586
|
#endif
|
|
472
|
-
return result;
|
|
473
587
|
}
|
|
588
|
+
|
|
474
589
|
// --- standard functions ---
|
|
475
590
|
|
|
476
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool
|
|
591
|
+
// Reports whether a is infinite. The default build tests the float value of
// a; the EIGEN_USE_HIP_BF16 build passes a through unchanged.
// The name is parenthesised, presumably to keep a function-like `isinf` macro
// from expanding here.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const bfloat16& a) {
  EIGEN_USING_STD(isinf);
#if !defined(EIGEN_USE_HIP_BF16)
  return (isinf)(static_cast<float>(a));
#else
  return (isinf)(a);  // Uses HIP hip_bfloat16 isinf operator
#endif
}
|
|
480
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool
|
|
599
|
+
// Reports whether a is NaN. The default build tests the float value of a; the
// EIGEN_USE_HIP_BF16 build passes a through unchanged.
// The name is parenthesised, presumably to keep a function-like `isnan` macro
// from expanding here.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const bfloat16& a) {
  EIGEN_USING_STD(isnan);
#if !defined(EIGEN_USE_HIP_BF16)
  return (isnan)(static_cast<float>(a));
#else
  return (isnan)(a);  // Uses HIP hip_bfloat16 isnan operator
#endif
}
|
|
484
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool
|
|
485
|
-
return !(isinf EIGEN_NOT_A_MACRO
|
|
607
|
+
// Reports whether a is finite, i.e. neither infinite nor NaN.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isfinite)(const bfloat16& a) {
  // De Morgan form of "not inf and not nan"; the NaN test is still skipped
  // when the value is already known to be infinite.
  return !((isinf EIGEN_NOT_A_MACRO(a)) || (isnan EIGEN_NOT_A_MACRO(a)));
}
|
|
487
610
|
|
|
488
611
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 abs(const bfloat16& a) {
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
}
|
|
493
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16
|
|
494
|
-
|
|
495
|
-
}
|
|
496
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16
|
|
497
|
-
|
|
498
|
-
}
|
|
499
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log(const bfloat16& a) {
|
|
500
|
-
return bfloat16(::logf(float(a)));
|
|
501
|
-
}
|
|
502
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log1p(const bfloat16& a) {
|
|
503
|
-
return bfloat16(numext::log1p(float(a)));
|
|
504
|
-
}
|
|
505
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log10(const bfloat16& a) {
|
|
506
|
-
return bfloat16(::log10f(float(a)));
|
|
507
|
-
}
|
|
612
|
+
numext::uint16_t x = numext::bit_cast<numext::uint16_t>(a) & 0x7FFF;
|
|
613
|
+
return numext::bit_cast<bfloat16>(x);
|
|
614
|
+
}
|
|
615
|
+
// exp for bfloat16: evaluated in float via ::expf, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 exp(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(::expf(arg));
}
|
|
616
|
+
// exp2 for bfloat16: evaluated in float via ::exp2f, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 exp2(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(::exp2f(arg));
}
|
|
617
|
+
// expm1 for bfloat16: evaluated in float via numext::expm1, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 expm1(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(numext::expm1(arg));
}
|
|
618
|
+
// log for bfloat16: evaluated in float via ::logf, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(::logf(arg));
}
|
|
619
|
+
// log1p for bfloat16: evaluated in float via numext::log1p, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log1p(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(numext::log1p(arg));
}
|
|
620
|
+
// log10 for bfloat16: evaluated in float via ::log10f, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log10(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(::log10f(arg));
}
|
|
508
621
|
// log2 for bfloat16, computed in float as EIGEN_LOG2E * ln(a) and then
// converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log2(const bfloat16& a) {
  const float log2_of_e = static_cast<float>(EIGEN_LOG2E);
  const float natural_log = ::logf(static_cast<float>(a));
  return bfloat16(log2_of_e * natural_log);
}
|
|
511
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sqrt(const bfloat16& a) {
|
|
512
|
-
return bfloat16(::sqrtf(float(a)));
|
|
513
|
-
}
|
|
624
|
+
// sqrt for bfloat16: evaluated in float via ::sqrtf, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sqrt(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(::sqrtf(arg));
}
|
|
514
625
|
// pow for bfloat16: a raised to the power b, evaluated in float via ::powf and
// then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 pow(const bfloat16& a, const bfloat16& b) {
  const float base = static_cast<float>(a);
  const float exponent = static_cast<float>(b);
  return bfloat16(::powf(base, exponent));
}
|
|
517
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16
|
|
518
|
-
return bfloat16(::
|
|
519
|
-
}
|
|
520
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16
|
|
521
|
-
|
|
522
|
-
}
|
|
523
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16
|
|
524
|
-
|
|
525
|
-
}
|
|
526
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16
|
|
527
|
-
|
|
528
|
-
}
|
|
529
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16
|
|
530
|
-
|
|
531
|
-
}
|
|
532
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16
|
|
533
|
-
|
|
534
|
-
}
|
|
535
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16
|
|
536
|
-
|
|
537
|
-
}
|
|
538
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cosh(const bfloat16& a) {
|
|
539
|
-
return bfloat16(::coshf(float(a)));
|
|
540
|
-
}
|
|
541
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tanh(const bfloat16& a) {
|
|
542
|
-
return bfloat16(::tanhf(float(a)));
|
|
543
|
-
}
|
|
544
|
-
#if EIGEN_HAS_CXX11_MATH
|
|
545
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asinh(const bfloat16& a) {
|
|
546
|
-
return bfloat16(::asinhf(float(a)));
|
|
547
|
-
}
|
|
548
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acosh(const bfloat16& a) {
|
|
549
|
-
return bfloat16(::acoshf(float(a)));
|
|
550
|
-
}
|
|
551
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atanh(const bfloat16& a) {
|
|
552
|
-
return bfloat16(::atanhf(float(a)));
|
|
553
|
-
}
|
|
554
|
-
#endif
|
|
555
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 floor(const bfloat16& a) {
|
|
556
|
-
return bfloat16(::floorf(float(a)));
|
|
557
|
-
}
|
|
558
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 ceil(const bfloat16& a) {
|
|
559
|
-
return bfloat16(::ceilf(float(a)));
|
|
560
|
-
}
|
|
561
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 rint(const bfloat16& a) {
|
|
562
|
-
return bfloat16(::rintf(float(a)));
|
|
563
|
-
}
|
|
564
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 round(const bfloat16& a) {
|
|
565
|
-
return bfloat16(::roundf(float(a)));
|
|
566
|
-
}
|
|
628
|
+
// atan2 for bfloat16: evaluated in float via ::atan2f(a, b), then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atan2(const bfloat16& a, const bfloat16& b) {
  const float first = static_cast<float>(a);
  const float second = static_cast<float>(b);
  return bfloat16(::atan2f(first, second));
}
|
|
631
|
+
// sin for bfloat16: evaluated in float via ::sinf, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sin(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(::sinf(arg));
}
|
|
632
|
+
// cos for bfloat16: evaluated in float via ::cosf, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cos(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(::cosf(arg));
}
|
|
633
|
+
// tan for bfloat16: evaluated in float via ::tanf, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tan(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(::tanf(arg));
}
|
|
634
|
+
// asin for bfloat16: evaluated in float via ::asinf, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asin(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(::asinf(arg));
}
|
|
635
|
+
// acos for bfloat16: evaluated in float via ::acosf, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acos(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(::acosf(arg));
}
|
|
636
|
+
// atan for bfloat16: evaluated in float via ::atanf, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atan(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(::atanf(arg));
}
|
|
637
|
+
// sinh for bfloat16: evaluated in float via ::sinhf, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sinh(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(::sinhf(arg));
}
|
|
638
|
+
// cosh for bfloat16: evaluated in float via ::coshf, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cosh(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(::coshf(arg));
}
|
|
639
|
+
// tanh for bfloat16: evaluated in float via ::tanhf, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tanh(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(::tanhf(arg));
}
|
|
640
|
+
// asinh for bfloat16: evaluated in float via ::asinhf, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asinh(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(::asinhf(arg));
}
|
|
641
|
+
// acosh for bfloat16: evaluated in float via ::acoshf, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acosh(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(::acoshf(arg));
}
|
|
642
|
+
// atanh for bfloat16: evaluated in float via ::atanhf, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atanh(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(::atanhf(arg));
}
|
|
643
|
+
// floor for bfloat16: evaluated in float via ::floorf, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 floor(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(::floorf(arg));
}
|
|
644
|
+
// ceil for bfloat16: evaluated in float via ::ceilf, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 ceil(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(::ceilf(arg));
}
|
|
645
|
+
// rint for bfloat16: evaluated in float via ::rintf, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 rint(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(::rintf(arg));
}
|
|
646
|
+
// round for bfloat16: evaluated in float via ::roundf, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 round(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(::roundf(arg));
}
|
|
647
|
+
// trunc for bfloat16: evaluated in float via ::truncf, then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 trunc(const bfloat16& a) {
  const float arg = static_cast<float>(a);
  return bfloat16(::truncf(arg));
}
|
|
567
648
|
// fmod for bfloat16: evaluated in float via ::fmodf(a, b), then converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmod(const bfloat16& a, const bfloat16& b) {
  const float dividend = static_cast<float>(a);
  const float divisor = static_cast<float>(b);
  return bfloat16(::fmodf(dividend, divisor));
}
|
|
570
651
|
|
|
571
|
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16
|
|
652
|
+
// Returns b when b compares strictly less than a (as floats), otherwise a.
// One of the original operands is returned as-is; no new value is computed.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16(min)(const bfloat16& a, const bfloat16& b) {
  const float lhs = static_cast<float>(a);
  const float rhs = static_cast<float>(b);
  if (rhs < lhs) {
    return b;
  }
  return a;
}
|
|
576
|
-
|
|
657
|
+
|
|
658
|
+
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16(max)(const bfloat16& a, const bfloat16& b) {
|
|
577
659
|
const float f1 = static_cast<float>(a);
|
|
578
660
|
const float f2 = static_cast<float>(b);
|
|
579
661
|
return f1 < f2 ? b : a;
|
|
@@ -584,56 +666,59 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmin(const bfloat16& a, const bfl
|
|
|
584
666
|
const float f2 = static_cast<float>(b);
|
|
585
667
|
return bfloat16(::fminf(f1, f2));
|
|
586
668
|
}
|
|
669
|
+
|
|
587
670
|
// fmax for bfloat16: both operands are converted to float, ::fmaxf picks the
// result, and that result is converted back.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmax(const bfloat16& a, const bfloat16& b) {
  const float lhs = static_cast<float>(a);
  const float rhs = static_cast<float>(b);
  const float larger = ::fmaxf(lhs, rhs);
  return bfloat16(larger);
}
|
|
592
675
|
|
|
676
|
+
// Fused multiply-add a * b + c for bfloat16, emulated by running numext::fma
// on the float values of the operands and converting the result back.
EIGEN_DEVICE_FUNC inline bfloat16 fma(const bfloat16& a, const bfloat16& b, const bfloat16& c) {
  const float fa = static_cast<float>(a);
  const float fb = static_cast<float>(b);
  const float fc = static_cast<float>(c);
  return bfloat16(numext::fma(fa, fb, fc));
}
|
|
680
|
+
|
|
593
681
|
#ifndef EIGEN_NO_IO
|
|
594
|
-
EIGEN_ALWAYS_INLINE std::ostream& operator
|
|
682
|
+
// Streams a bfloat16 by writing its float value; returns the stream so
// insertions can be chained.
EIGEN_ALWAYS_INLINE std::ostream& operator<<(std::ostream& os, const bfloat16& v) {
  const float as_float = static_cast<float>(v);
  os << as_float;
  return os;
}
|
|
598
686
|
#endif
|
|
599
687
|
|
|
600
|
-
}
|
|
688
|
+
} // namespace bfloat16_impl
|
|
601
689
|
|
|
602
690
|
namespace internal {
|
|
603
691
|
|
|
604
|
-
template<>
|
|
605
|
-
struct
|
|
606
|
-
{
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
692
|
+
template <>
|
|
693
|
+
struct is_arithmetic<bfloat16> {
|
|
694
|
+
enum { value = true };
|
|
695
|
+
};
|
|
696
|
+
|
|
697
|
+
template <>
|
|
698
|
+
struct random_impl<bfloat16> {
|
|
699
|
+
enum : int { MantissaBits = 7 };
|
|
700
|
+
using Impl = random_impl<float>;
|
|
701
|
+
static EIGEN_DEVICE_FUNC inline bfloat16 run(const bfloat16& x, const bfloat16& y) {
|
|
702
|
+
float result = Impl::run(x, y, MantissaBits);
|
|
703
|
+
return bfloat16(result);
|
|
610
704
|
}
|
|
611
|
-
static inline bfloat16 run()
|
|
612
|
-
|
|
613
|
-
return
|
|
705
|
+
static EIGEN_DEVICE_FUNC inline bfloat16 run() {
|
|
706
|
+
float result = Impl::run(MantissaBits);
|
|
707
|
+
return bfloat16(result);
|
|
614
708
|
}
|
|
615
709
|
};
|
|
616
710
|
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
} // namespace internal
|
|
711
|
+
} // namespace internal
|
|
620
712
|
|
|
621
|
-
template<>
|
|
622
|
-
|
|
623
|
-
{
|
|
624
|
-
enum {
|
|
625
|
-
IsSigned = true,
|
|
626
|
-
IsInteger = false,
|
|
627
|
-
IsComplex = false,
|
|
628
|
-
RequireInitialization = false
|
|
629
|
-
};
|
|
713
|
+
template <>
|
|
714
|
+
struct NumTraits<Eigen::bfloat16> : GenericNumTraits<Eigen::bfloat16> {
|
|
715
|
+
enum { IsSigned = true, IsInteger = false, IsComplex = false, RequireInitialization = false };
|
|
630
716
|
|
|
631
717
|
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 epsilon() {
|
|
632
718
|
return bfloat16_impl::raw_uint16_to_bfloat16(0x3c00);
|
|
633
719
|
}
|
|
634
720
|
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 dummy_precision() {
|
|
635
721
|
return bfloat16_impl::raw_uint16_to_bfloat16(0x3D4D); // bfloat16(5e-2f);
|
|
636
|
-
|
|
637
722
|
}
|
|
638
723
|
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 highest() {
|
|
639
724
|
return bfloat16_impl::raw_uint16_to_bfloat16(0x7F7F);
|
|
@@ -649,32 +734,33 @@ template<> struct NumTraits<Eigen::bfloat16>
|
|
|
649
734
|
}
|
|
650
735
|
};
|
|
651
736
|
|
|
652
|
-
}
|
|
737
|
+
} // namespace Eigen
|
|
738
|
+
|
|
739
|
+
#if defined(EIGEN_HAS_HIP_BF16)
|
|
740
|
+
#pragma pop_macro("EIGEN_CONSTEXPR")
|
|
741
|
+
#endif
|
|
653
742
|
|
|
654
743
|
namespace Eigen {
|
|
655
744
|
namespace numext {
|
|
656
745
|
|
|
657
|
-
template<>
|
|
658
|
-
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
|
659
|
-
bool (isnan)(const Eigen::bfloat16& h) {
|
|
746
|
+
template <>
|
|
747
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(const Eigen::bfloat16& h) {
|
|
660
748
|
return (bfloat16_impl::isnan)(h);
|
|
661
749
|
}
|
|
662
750
|
|
|
663
|
-
template<>
|
|
664
|
-
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
|
665
|
-
bool (isinf)(const Eigen::bfloat16& h) {
|
|
751
|
+
template <>
|
|
752
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(const Eigen::bfloat16& h) {
|
|
666
753
|
return (bfloat16_impl::isinf)(h);
|
|
667
754
|
}
|
|
668
755
|
|
|
669
|
-
template<>
|
|
670
|
-
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
|
671
|
-
bool (isfinite)(const Eigen::bfloat16& h) {
|
|
756
|
+
template <>
|
|
757
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(const Eigen::bfloat16& h) {
|
|
672
758
|
return (bfloat16_impl::isfinite)(h);
|
|
673
759
|
}
|
|
674
760
|
|
|
675
761
|
template <>
|
|
676
762
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bit_cast<Eigen::bfloat16, uint16_t>(const uint16_t& src) {
|
|
677
|
-
return Eigen::
|
|
763
|
+
return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(src);
|
|
678
764
|
}
|
|
679
765
|
|
|
680
766
|
template <>
|
|
@@ -682,6 +768,37 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast<uint16_t, Eigen::bfloat1
|
|
|
682
768
|
return Eigen::bfloat16_impl::raw_bfloat16_as_uint16(src);
|
|
683
769
|
}
|
|
684
770
|
|
|
771
|
+
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 nextafter(const bfloat16& from, const bfloat16& to) {
|
|
772
|
+
if (numext::isnan EIGEN_NOT_A_MACRO(from)) {
|
|
773
|
+
return from;
|
|
774
|
+
}
|
|
775
|
+
if (numext::isnan EIGEN_NOT_A_MACRO(to)) {
|
|
776
|
+
return to;
|
|
777
|
+
}
|
|
778
|
+
if (from == to) {
|
|
779
|
+
return to;
|
|
780
|
+
}
|
|
781
|
+
uint16_t from_bits = numext::bit_cast<uint16_t>(from);
|
|
782
|
+
bool from_sign = from_bits >> 15;
|
|
783
|
+
// Whether we are adjusting toward the infinity with the same sign as from.
|
|
784
|
+
bool toward_inf = (to > from) == !from_sign;
|
|
785
|
+
if (toward_inf) {
|
|
786
|
+
++from_bits;
|
|
787
|
+
} else if ((from_bits & 0x7fff) == 0) {
|
|
788
|
+
// Adjusting away from inf, but from is zero, so just toggle the sign.
|
|
789
|
+
from_bits ^= 0x8000;
|
|
790
|
+
} else {
|
|
791
|
+
--from_bits;
|
|
792
|
+
}
|
|
793
|
+
return numext::bit_cast<bfloat16>(from_bits);
|
|
794
|
+
}
|
|
795
|
+
|
|
796
|
+
// Specialize multiply-add to match packet operations and reduce conversions to/from float.
|
|
797
|
+
template<>
|
|
798
|
+
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 madd<Eigen::bfloat16>(const Eigen::bfloat16& x, const Eigen::bfloat16& y, const Eigen::bfloat16& z) {
|
|
799
|
+
return Eigen::bfloat16(static_cast<float>(x) * static_cast<float>(y) + static_cast<float>(z));
|
|
800
|
+
}
|
|
801
|
+
|
|
685
802
|
} // namespace numext
|
|
686
803
|
} // namespace Eigen
|
|
687
804
|
|
|
@@ -693,8 +810,57 @@ struct hash<Eigen::bfloat16> {
|
|
|
693
810
|
return static_cast<std::size_t>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(a));
|
|
694
811
|
}
|
|
695
812
|
};
|
|
696
|
-
}
|
|
813
|
+
} // namespace std
|
|
697
814
|
#endif
|
|
698
815
|
|
|
816
|
+
// Add the missing shfl* intrinsics.
|
|
817
|
+
// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300.
|
|
818
|
+
// CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
|
|
819
|
+
//
|
|
820
|
+
// HIP and CUDA prior to SDK 9.0 define
|
|
821
|
+
// __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
|
|
822
|
+
// CUDA since 9.0 deprecates those and instead defines
|
|
823
|
+
// __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
|
|
824
|
+
// with native support for __half and __nv_bfloat16
|
|
825
|
+
//
|
|
826
|
+
// Note that the following are __device__ - only functions.
|
|
827
|
+
#if defined(EIGEN_HIPCC)
|
|
828
|
+
|
|
829
|
+
#if defined(EIGEN_HAS_HIP_BF16)
|
|
830
|
+
|
|
831
|
+
__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl(Eigen::bfloat16 var, int srcLane, int width = warpSize) {
|
|
832
|
+
const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
|
|
833
|
+
return Eigen::numext::bit_cast<Eigen::bfloat16>(static_cast<Eigen::numext::uint16_t>(__shfl(ivar, srcLane, width)));
|
|
834
|
+
}
|
|
835
|
+
|
|
836
|
+
__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl_up(Eigen::bfloat16 var, unsigned int delta,
|
|
837
|
+
int width = warpSize) {
|
|
838
|
+
const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
|
|
839
|
+
return Eigen::numext::bit_cast<Eigen::bfloat16>(static_cast<Eigen::numext::uint16_t>(__shfl_up(ivar, delta, width)));
|
|
840
|
+
}
|
|
841
|
+
|
|
842
|
+
__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl_down(Eigen::bfloat16 var, unsigned int delta,
|
|
843
|
+
int width = warpSize) {
|
|
844
|
+
const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
|
|
845
|
+
return Eigen::numext::bit_cast<Eigen::bfloat16>(
|
|
846
|
+
static_cast<Eigen::numext::uint16_t>(__shfl_down(ivar, delta, width)));
|
|
847
|
+
}
|
|
848
|
+
|
|
849
|
+
__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl_xor(Eigen::bfloat16 var, int laneMask, int width = warpSize) {
|
|
850
|
+
const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
|
|
851
|
+
return Eigen::numext::bit_cast<Eigen::bfloat16>(
|
|
852
|
+
static_cast<Eigen::numext::uint16_t>(__shfl_xor(ivar, laneMask, width)));
|
|
853
|
+
}
|
|
854
|
+
|
|
855
|
+
#endif // HIP
|
|
856
|
+
|
|
857
|
+
#endif // __shfl*
|
|
858
|
+
|
|
859
|
+
#if defined(EIGEN_HIPCC)
|
|
860
|
+
EIGEN_STRONG_INLINE __device__ Eigen::bfloat16 __ldg(const Eigen::bfloat16* ptr) {
|
|
861
|
+
return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(
|
|
862
|
+
__ldg(Eigen::numext::bit_cast<const Eigen::numext::uint16_t*>(ptr)));
|
|
863
|
+
}
|
|
864
|
+
#endif // __ldg
|
|
699
865
|
|
|
700
|
-
#endif
|
|
866
|
+
#endif // EIGEN_BFLOAT16_H
|