RubyGems - tomoto - Versions diffs - 0.2.2 → 0.3.0 - Mend

tomoto 0.2.2 → 0.3.0

Files changed (369) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +10 -0
data/README.md +8 -10
data/ext/tomoto/ct.cpp +11 -11
data/ext/tomoto/dmr.cpp +14 -13
data/ext/tomoto/dt.cpp +14 -14
data/ext/tomoto/extconf.rb +7 -5
data/ext/tomoto/gdmr.cpp +7 -7
data/ext/tomoto/hdp.cpp +9 -9
data/ext/tomoto/hlda.cpp +13 -13
data/ext/tomoto/hpa.cpp +5 -5
data/ext/tomoto/lda.cpp +42 -39
data/ext/tomoto/llda.cpp +6 -6
data/ext/tomoto/mglda.cpp +15 -15
data/ext/tomoto/pa.cpp +6 -6
data/ext/tomoto/plda.cpp +6 -6
data/ext/tomoto/slda.cpp +8 -8
data/ext/tomoto/{ext.cpp → tomoto.cpp} +8 -8
data/ext/tomoto/utils.h +16 -70
data/lib/tomoto/version.rb +1 -1
data/lib/tomoto.rb +5 -1
data/vendor/EigenRand/EigenRand/Core.h +10 -10
data/vendor/EigenRand/EigenRand/Dists/Basic.h +208 -9
data/vendor/EigenRand/EigenRand/Dists/Discrete.h +52 -31
data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +9 -8
data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +28 -21
data/vendor/EigenRand/EigenRand/EigenRand +11 -6
data/vendor/EigenRand/EigenRand/Macro.h +13 -7
data/vendor/EigenRand/EigenRand/MorePacketMath.h +348 -740
data/vendor/EigenRand/EigenRand/MvDists/Multinomial.h +5 -3
data/vendor/EigenRand/EigenRand/MvDists/MvNormal.h +9 -3
data/vendor/EigenRand/EigenRand/PacketFilter.h +11 -253
data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +21 -47
data/vendor/EigenRand/EigenRand/RandUtils.h +50 -344
data/vendor/EigenRand/EigenRand/arch/AVX/MorePacketMath.h +619 -0
data/vendor/EigenRand/EigenRand/arch/AVX/PacketFilter.h +149 -0
data/vendor/EigenRand/EigenRand/arch/AVX/RandUtils.h +228 -0
data/vendor/EigenRand/EigenRand/arch/NEON/MorePacketMath.h +473 -0
data/vendor/EigenRand/EigenRand/arch/NEON/PacketFilter.h +142 -0
data/vendor/EigenRand/EigenRand/arch/NEON/RandUtils.h +126 -0
data/vendor/EigenRand/EigenRand/arch/SSE/MorePacketMath.h +501 -0
data/vendor/EigenRand/EigenRand/arch/SSE/PacketFilter.h +133 -0
data/vendor/EigenRand/EigenRand/arch/SSE/RandUtils.h +120 -0
data/vendor/EigenRand/EigenRand/doc.h +24 -12
data/vendor/EigenRand/README.md +57 -4
data/vendor/eigen/COPYING.APACHE +203 -0
data/vendor/eigen/COPYING.BSD +1 -1
data/vendor/eigen/COPYING.MINPACK +51 -52
data/vendor/eigen/Eigen/Cholesky +0 -1
data/vendor/eigen/Eigen/Core +112 -265
data/vendor/eigen/Eigen/Eigenvalues +2 -3
data/vendor/eigen/Eigen/Geometry +5 -8
data/vendor/eigen/Eigen/Householder +0 -1
data/vendor/eigen/Eigen/Jacobi +0 -1
data/vendor/eigen/Eigen/KLUSupport +41 -0
data/vendor/eigen/Eigen/LU +2 -5
data/vendor/eigen/Eigen/OrderingMethods +0 -3
data/vendor/eigen/Eigen/PaStiXSupport +1 -0
data/vendor/eigen/Eigen/PardisoSupport +0 -0
data/vendor/eigen/Eigen/QR +2 -3
data/vendor/eigen/Eigen/QtAlignedMalloc +0 -1
data/vendor/eigen/Eigen/SVD +0 -1
data/vendor/eigen/Eigen/Sparse +0 -2
data/vendor/eigen/Eigen/SparseCholesky +0 -8
data/vendor/eigen/Eigen/SparseLU +4 -0
data/vendor/eigen/Eigen/SparseQR +0 -1
data/vendor/eigen/Eigen/src/Cholesky/LDLT.h +42 -27
data/vendor/eigen/Eigen/src/Cholesky/LLT.h +39 -23
data/vendor/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +90 -47
data/vendor/eigen/Eigen/src/Core/ArithmeticSequence.h +413 -0
data/vendor/eigen/Eigen/src/Core/Array.h +99 -11
data/vendor/eigen/Eigen/src/Core/ArrayBase.h +3 -3
data/vendor/eigen/Eigen/src/Core/ArrayWrapper.h +21 -21
data/vendor/eigen/Eigen/src/Core/Assign.h +1 -1
data/vendor/eigen/Eigen/src/Core/AssignEvaluator.h +125 -50
data/vendor/eigen/Eigen/src/Core/Assign_MKL.h +10 -10
data/vendor/eigen/Eigen/src/Core/BandMatrix.h +16 -16
data/vendor/eigen/Eigen/src/Core/Block.h +56 -60
data/vendor/eigen/Eigen/src/Core/BooleanRedux.h +29 -31
data/vendor/eigen/Eigen/src/Core/CommaInitializer.h +7 -3
data/vendor/eigen/Eigen/src/Core/CoreEvaluators.h +325 -272
data/vendor/eigen/Eigen/src/Core/CoreIterators.h +5 -0
data/vendor/eigen/Eigen/src/Core/CwiseBinaryOp.h +21 -22
data/vendor/eigen/Eigen/src/Core/CwiseNullaryOp.h +153 -18
data/vendor/eigen/Eigen/src/Core/CwiseUnaryOp.h +6 -6
data/vendor/eigen/Eigen/src/Core/CwiseUnaryView.h +14 -10
data/vendor/eigen/Eigen/src/Core/DenseBase.h +132 -42
data/vendor/eigen/Eigen/src/Core/DenseCoeffsBase.h +25 -21
data/vendor/eigen/Eigen/src/Core/DenseStorage.h +153 -71
data/vendor/eigen/Eigen/src/Core/Diagonal.h +21 -23
data/vendor/eigen/Eigen/src/Core/DiagonalMatrix.h +50 -2
data/vendor/eigen/Eigen/src/Core/DiagonalProduct.h +1 -1
data/vendor/eigen/Eigen/src/Core/Dot.h +10 -10
data/vendor/eigen/Eigen/src/Core/EigenBase.h +10 -9
data/vendor/eigen/Eigen/src/Core/ForceAlignedAccess.h +8 -4
data/vendor/eigen/Eigen/src/Core/Fuzzy.h +3 -3
data/vendor/eigen/Eigen/src/Core/GeneralProduct.h +20 -10
data/vendor/eigen/Eigen/src/Core/GenericPacketMath.h +599 -152
data/vendor/eigen/Eigen/src/Core/GlobalFunctions.h +40 -33
data/vendor/eigen/Eigen/src/Core/IO.h +40 -7
data/vendor/eigen/Eigen/src/Core/IndexedView.h +237 -0
data/vendor/eigen/Eigen/src/Core/Inverse.h +9 -10
data/vendor/eigen/Eigen/src/Core/Map.h +7 -7
data/vendor/eigen/Eigen/src/Core/MapBase.h +10 -3
data/vendor/eigen/Eigen/src/Core/MathFunctions.h +767 -125
data/vendor/eigen/Eigen/src/Core/MathFunctionsImpl.h +118 -19
data/vendor/eigen/Eigen/src/Core/Matrix.h +131 -25
data/vendor/eigen/Eigen/src/Core/MatrixBase.h +21 -3
data/vendor/eigen/Eigen/src/Core/NestByValue.h +25 -50
data/vendor/eigen/Eigen/src/Core/NoAlias.h +4 -3
data/vendor/eigen/Eigen/src/Core/NumTraits.h +107 -20
data/vendor/eigen/Eigen/src/Core/PartialReduxEvaluator.h +232 -0
data/vendor/eigen/Eigen/src/Core/PermutationMatrix.h +3 -31
data/vendor/eigen/Eigen/src/Core/PlainObjectBase.h +152 -59
data/vendor/eigen/Eigen/src/Core/Product.h +30 -25
data/vendor/eigen/Eigen/src/Core/ProductEvaluators.h +192 -125
data/vendor/eigen/Eigen/src/Core/Random.h +37 -1
data/vendor/eigen/Eigen/src/Core/Redux.h +180 -170
data/vendor/eigen/Eigen/src/Core/Ref.h +121 -23
data/vendor/eigen/Eigen/src/Core/Replicate.h +8 -8
data/vendor/eigen/Eigen/src/Core/Reshaped.h +454 -0
data/vendor/eigen/Eigen/src/Core/ReturnByValue.h +7 -5
data/vendor/eigen/Eigen/src/Core/Reverse.h +18 -12
data/vendor/eigen/Eigen/src/Core/Select.h +8 -6
data/vendor/eigen/Eigen/src/Core/SelfAdjointView.h +33 -20
data/vendor/eigen/Eigen/src/Core/Solve.h +14 -14
data/vendor/eigen/Eigen/src/Core/SolveTriangular.h +16 -16
data/vendor/eigen/Eigen/src/Core/SolverBase.h +41 -3
data/vendor/eigen/Eigen/src/Core/StableNorm.h +100 -70
data/vendor/eigen/Eigen/src/Core/StlIterators.h +463 -0
data/vendor/eigen/Eigen/src/Core/Stride.h +9 -4
data/vendor/eigen/Eigen/src/Core/Swap.h +5 -4
data/vendor/eigen/Eigen/src/Core/Transpose.h +88 -27
data/vendor/eigen/Eigen/src/Core/Transpositions.h +26 -47
data/vendor/eigen/Eigen/src/Core/TriangularMatrix.h +93 -75
data/vendor/eigen/Eigen/src/Core/VectorBlock.h +5 -5
data/vendor/eigen/Eigen/src/Core/VectorwiseOp.h +159 -70
data/vendor/eigen/Eigen/src/Core/Visitor.h +137 -29
data/vendor/eigen/Eigen/src/Core/arch/AVX/Complex.h +50 -129
data/vendor/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +126 -337
data/vendor/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +1092 -155
data/vendor/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +65 -1
data/vendor/eigen/Eigen/src/Core/arch/AVX512/Complex.h +422 -0
data/vendor/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +207 -236
data/vendor/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1482 -495
data/vendor/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +89 -0
data/vendor/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +152 -165
data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +19 -251
data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2937 -0
data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +221 -0
data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +629 -0
data/vendor/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2042 -392
data/vendor/eigen/Eigen/src/Core/arch/CUDA/Complex.h +235 -80
data/vendor/eigen/Eigen/src/Core/arch/Default/BFloat16.h +700 -0
data/vendor/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +102 -14
data/vendor/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1649 -0
data/vendor/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +110 -0
data/vendor/eigen/Eigen/src/Core/arch/Default/Half.h +942 -0
data/vendor/eigen/Eigen/src/Core/arch/Default/Settings.h +1 -1
data/vendor/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +120 -0
data/vendor/eigen/Eigen/src/Core/arch/{CUDA → GPU}/MathFunctions.h +16 -4
data/vendor/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1685 -0
data/vendor/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +80 -0
data/vendor/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
data/vendor/eigen/Eigen/src/Core/arch/MSA/Complex.h +648 -0
data/vendor/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +387 -0
data/vendor/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1233 -0
data/vendor/eigen/Eigen/src/Core/arch/NEON/Complex.h +313 -219
data/vendor/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +183 -0
data/vendor/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +54 -70
data/vendor/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4376 -549
data/vendor/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1419 -0
data/vendor/eigen/Eigen/src/Core/arch/SSE/Complex.h +59 -179
data/vendor/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +65 -428
data/vendor/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +893 -283
data/vendor/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +65 -0
data/vendor/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +44 -0
data/vendor/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +752 -0
data/vendor/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +49 -0
data/vendor/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +232 -0
data/vendor/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +301 -0
data/vendor/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +670 -0
data/vendor/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +694 -0
data/vendor/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +85 -0
data/vendor/eigen/Eigen/src/Core/arch/ZVector/Complex.h +212 -183
data/vendor/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +101 -5
data/vendor/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +510 -395
data/vendor/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +11 -2
data/vendor/eigen/Eigen/src/Core/functors/BinaryFunctors.h +112 -46
data/vendor/eigen/Eigen/src/Core/functors/NullaryFunctors.h +31 -30
data/vendor/eigen/Eigen/src/Core/functors/StlFunctors.h +32 -2
data/vendor/eigen/Eigen/src/Core/functors/UnaryFunctors.h +355 -16
data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1075 -586
data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +49 -24
data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +41 -35
data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +6 -6
data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +4 -2
data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +382 -483
data/vendor/eigen/Eigen/src/Core/products/Parallelizer.h +22 -5
data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +53 -30
data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +16 -8
data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +8 -6
data/vendor/eigen/Eigen/src/Core/products/SelfadjointProduct.h +4 -4
data/vendor/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +5 -4
data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +33 -27
data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +14 -12
data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +36 -34
data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +8 -4
data/vendor/eigen/Eigen/src/Core/products/TriangularSolverVector.h +13 -10
data/vendor/eigen/Eigen/src/Core/util/BlasUtil.h +304 -119
data/vendor/eigen/Eigen/src/Core/util/ConfigureVectorization.h +512 -0
data/vendor/eigen/Eigen/src/Core/util/Constants.h +25 -9
data/vendor/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +26 -3
data/vendor/eigen/Eigen/src/Core/util/ForwardDeclarations.h +29 -9
data/vendor/eigen/Eigen/src/Core/util/IndexedViewHelper.h +186 -0
data/vendor/eigen/Eigen/src/Core/util/IntegralConstant.h +272 -0
data/vendor/eigen/Eigen/src/Core/util/MKL_support.h +8 -1
data/vendor/eigen/Eigen/src/Core/util/Macros.h +709 -246
data/vendor/eigen/Eigen/src/Core/util/Memory.h +222 -52
data/vendor/eigen/Eigen/src/Core/util/Meta.h +355 -77
data/vendor/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +5 -1
data/vendor/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
data/vendor/eigen/Eigen/src/Core/util/StaticAssert.h +8 -5
data/vendor/eigen/Eigen/src/Core/util/SymbolicIndex.h +293 -0
data/vendor/eigen/Eigen/src/Core/util/XprHelper.h +65 -30
data/vendor/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +1 -1
data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +7 -4
data/vendor/eigen/Eigen/src/Eigenvalues/EigenSolver.h +2 -2
data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +1 -1
data/vendor/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +2 -2
data/vendor/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +2 -2
data/vendor/eigen/Eigen/src/Eigenvalues/RealQZ.h +9 -6
data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur.h +21 -9
data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +77 -43
data/vendor/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +20 -15
data/vendor/eigen/Eigen/src/Geometry/AlignedBox.h +99 -5
data/vendor/eigen/Eigen/src/Geometry/AngleAxis.h +4 -4
data/vendor/eigen/Eigen/src/Geometry/EulerAngles.h +3 -3
data/vendor/eigen/Eigen/src/Geometry/Homogeneous.h +15 -11
data/vendor/eigen/Eigen/src/Geometry/Hyperplane.h +1 -1
data/vendor/eigen/Eigen/src/Geometry/OrthoMethods.h +3 -2
data/vendor/eigen/Eigen/src/Geometry/ParametrizedLine.h +39 -2
data/vendor/eigen/Eigen/src/Geometry/Quaternion.h +70 -14
data/vendor/eigen/Eigen/src/Geometry/Rotation2D.h +3 -3
data/vendor/eigen/Eigen/src/Geometry/Scaling.h +23 -5
data/vendor/eigen/Eigen/src/Geometry/Transform.h +88 -67
data/vendor/eigen/Eigen/src/Geometry/Translation.h +6 -12
data/vendor/eigen/Eigen/src/Geometry/Umeyama.h +1 -1
data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +168 -0
data/vendor/eigen/Eigen/src/Householder/BlockHouseholder.h +9 -2
data/vendor/eigen/Eigen/src/Householder/Householder.h +8 -4
data/vendor/eigen/Eigen/src/Householder/HouseholderSequence.h +123 -48
data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +15 -15
data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +7 -23
data/vendor/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +5 -22
data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +41 -47
data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +51 -60
data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +70 -20
data/vendor/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +2 -20
data/vendor/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +11 -9
data/vendor/eigen/Eigen/src/Jacobi/Jacobi.h +31 -10
data/vendor/eigen/Eigen/src/KLUSupport/KLUSupport.h +358 -0
data/vendor/eigen/Eigen/src/LU/Determinant.h +35 -19
data/vendor/eigen/Eigen/src/LU/FullPivLU.h +29 -43
data/vendor/eigen/Eigen/src/LU/InverseImpl.h +25 -8
data/vendor/eigen/Eigen/src/LU/PartialPivLU.h +71 -58
data/vendor/eigen/Eigen/src/LU/arch/InverseSize4.h +351 -0
data/vendor/eigen/Eigen/src/OrderingMethods/Amd.h +7 -17
data/vendor/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +297 -277
data/vendor/eigen/Eigen/src/OrderingMethods/Ordering.h +6 -10
data/vendor/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +1 -1
data/vendor/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +9 -7
data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR.h +41 -20
data/vendor/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +100 -27
data/vendor/eigen/Eigen/src/QR/FullPivHouseholderQR.h +59 -22
data/vendor/eigen/Eigen/src/QR/HouseholderQR.h +48 -23
data/vendor/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +25 -3
data/vendor/eigen/Eigen/src/SVD/BDCSVD.h +183 -63
data/vendor/eigen/Eigen/src/SVD/JacobiSVD.h +22 -14
data/vendor/eigen/Eigen/src/SVD/SVDBase.h +83 -22
data/vendor/eigen/Eigen/src/SVD/UpperBidiagonalization.h +3 -3
data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +17 -9
data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +12 -37
data/vendor/eigen/Eigen/src/SparseCore/AmbiVector.h +3 -2
data/vendor/eigen/Eigen/src/SparseCore/CompressedStorage.h +16 -0
data/vendor/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +6 -6
data/vendor/eigen/Eigen/src/SparseCore/SparseAssign.h +81 -27
data/vendor/eigen/Eigen/src/SparseCore/SparseBlock.h +25 -57
data/vendor/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +40 -11
data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +11 -15
data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +4 -2
data/vendor/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +30 -8
data/vendor/eigen/Eigen/src/SparseCore/SparseMatrix.h +126 -11
data/vendor/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +5 -12
data/vendor/eigen/Eigen/src/SparseCore/SparseProduct.h +13 -1
data/vendor/eigen/Eigen/src/SparseCore/SparseRef.h +7 -7
data/vendor/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +5 -2
data/vendor/eigen/Eigen/src/SparseCore/SparseUtil.h +8 -0
data/vendor/eigen/Eigen/src/SparseCore/SparseVector.h +1 -1
data/vendor/eigen/Eigen/src/SparseCore/SparseView.h +1 -0
data/vendor/eigen/Eigen/src/SparseLU/SparseLU.h +162 -12
data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +1 -1
data/vendor/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +76 -2
data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +2 -2
data/vendor/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +1 -1
data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +1 -1
data/vendor/eigen/Eigen/src/SparseQR/SparseQR.h +19 -6
data/vendor/eigen/Eigen/src/StlSupport/StdDeque.h +2 -12
data/vendor/eigen/Eigen/src/StlSupport/StdList.h +2 -2
data/vendor/eigen/Eigen/src/StlSupport/StdVector.h +2 -2
data/vendor/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +6 -8
data/vendor/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +175 -39
data/vendor/eigen/Eigen/src/misc/lapacke.h +5 -4
data/vendor/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +28 -2
data/vendor/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +155 -11
data/vendor/eigen/Eigen/src/plugins/BlockMethods.h +626 -242
data/vendor/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +14 -0
data/vendor/eigen/Eigen/src/plugins/IndexedViewMethods.h +262 -0
data/vendor/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +4 -4
data/vendor/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +10 -0
data/vendor/eigen/Eigen/src/plugins/ReshapedMethods.h +149 -0
data/vendor/eigen/README.md +2 -0
data/vendor/eigen/bench/btl/README +1 -1
data/vendor/eigen/bench/tensors/README +6 -7
data/vendor/eigen/ci/README.md +56 -0
data/vendor/eigen/demos/mix_eigen_and_c/README +1 -1
data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +213 -158
data/vendor/eigen/unsupported/README.txt +1 -1
data/vendor/tomotopy/README.kr.rst +78 -0
data/vendor/tomotopy/README.rst +75 -0
data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +2 -2
data/vendor/tomotopy/src/Labeling/Phraser.hpp +4 -4
data/vendor/tomotopy/src/TopicModel/CTModel.hpp +7 -3
data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +7 -3
data/vendor/tomotopy/src/TopicModel/DTModel.hpp +6 -3
data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +2 -2
data/vendor/tomotopy/src/TopicModel/HDP.h +1 -0
data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +57 -6
data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +6 -3
data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +3 -2
data/vendor/tomotopy/src/TopicModel/LDA.h +3 -3
data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +5 -5
data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +50 -19
data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +6 -2
data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +3 -2
data/vendor/tomotopy/src/TopicModel/PAModel.hpp +1 -1
data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +6 -2
data/vendor/tomotopy/src/TopicModel/PT.h +3 -1
data/vendor/tomotopy/src/TopicModel/PTModel.hpp +36 -3
data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +6 -3
data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +55 -26
data/vendor/tomotopy/src/Utils/AliasMethod.hpp +5 -4
data/vendor/tomotopy/src/Utils/Dictionary.h +2 -2
data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +36 -1
data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +1 -1
data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +1 -1
data/vendor/tomotopy/src/Utils/exception.h +6 -0
data/vendor/tomotopy/src/Utils/math.h +2 -2
data/vendor/tomotopy/src/Utils/sample.hpp +14 -12
data/vendor/tomotopy/src/Utils/serializer.hpp +30 -5
data/vendor/tomotopy/src/Utils/sse_gamma.h +0 -3
metadata +64 -18
data/vendor/eigen/Eigen/CMakeLists.txt +0 -19
data/vendor/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -674
data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
data/vendor/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
data/vendor/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338

data/vendor/eigen/Eigen/src/Core/arch/Default/BFloat16.h ADDED Viewed

@@ -0,0 +1,700 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef EIGEN_BFLOAT16_H
+#define EIGEN_BFLOAT16_H
+// Generates a full specialization of the function template METHOD for the
+// packet type PACKET_BF16. The specialization converts its argument with
+// Bf16ToF32, calls METHOD<PACKET_F> on the result and converts the value
+// back with F32ToBf16.
+// NOTE(review): Bf16ToF32 / F32ToBf16 are not defined in this part of the
+// header; presumably the architecture-specific packet code that expands the
+// macro provides them - confirm at the use sites.
+#define BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, METHOD)         \
+  template <>                                                       \
+  EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED  \
+  PACKET_BF16 METHOD<PACKET_BF16>(const PACKET_BF16& _x) {          \
+    return F32ToBf16(METHOD<PACKET_F>(Bf16ToF32(_x)));              \
+  }
+namespace Eigen {
+struct bfloat16;
+namespace bfloat16_impl {
+// Our own definition of __bfloat16_raw: plain storage for the bit pattern of
+// a bfloat16 value, kept in an unsigned short.
+struct __bfloat16_raw {
+  // The stored bit pattern.
+  unsigned short value;
+  // Default construction yields the all-zero pattern.
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw() : value(0) {}
+  // Adopts an existing pattern; explicit so that integers never convert
+  // to a raw value silently.
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw(unsigned short bits) : value(bits) {}
+};
+// Forward declarations of the conversion helpers, so that the bfloat16 class
+// that follows can use them in its constructors and conversion operator.
+// Wraps an unsigned 16-bit pattern in a __bfloat16_raw.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(unsigned short value);
+// float -> bfloat16 conversion ("rtne": round to nearest even). The boolean
+// template argument states whether the caller guarantees that the argument
+// is a normal number, an infinity or zero.
+template <bool AssumeArgumentIsNormalOrInfinityOrZero>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff);
+// Forward declarations of template specializations, to avoid Visual C++ 2019 errors, saying:
+// > error C2908: explicit specialization; 'float_to_bfloat16_rtne' has already been instantiated
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<false>(float ff);
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<true>(float ff);
+// bfloat16 -> float conversion; used by bfloat16::operator float().
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float bfloat16_to_float(__bfloat16_raw h);
+// Thin layer over __bfloat16_raw whose only addition is an implicit
+// constructor from the raw representation.
+struct bfloat16_base : public __bfloat16_raw {
+  // Default construction defers to __bfloat16_raw's default constructor.
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16_base() : __bfloat16_raw() {}
+  // Copies the bit pattern held by an existing raw value.
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16_base(const __bfloat16_raw& raw) : __bfloat16_raw(raw) {}
+};
+} // namespace bfloat16_impl
+// Class definition.
+// bfloat16 value type: stores its bits through bfloat16_impl::bfloat16_base
+// and converts to and from float with the bfloat16_impl helpers.
+struct bfloat16 : public bfloat16_impl::bfloat16_base {
+  typedef bfloat16_impl::__bfloat16_raw __bfloat16_raw;
+  // Default construction; the base class constructor sets up the storage.
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16() {}
+  // Implicit: takes over the bits of a raw value unchanged.
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(const __bfloat16_raw& raw) : bfloat16_impl::bfloat16_base(raw) {}
+  // true maps to the pattern 0x3f80, false to the all-zero pattern.
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(bool flag)
+      : bfloat16_impl::bfloat16_base(bfloat16_impl::raw_uint16_to_bfloat16(flag ? 0x3f80 : 0)) {}
+  // Generic numeric conversion: the argument is cast to float and then
+  // rounded. For integral T the rounding helper is instantiated with its
+  // "argument is normal, infinity or zero" flag set. Note that an integer
+  // argument is always treated as a number, never as a bit pattern.
+  template<class T>
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(T number)
+      : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<internal::is_integral<T>::value>(static_cast<float>(number))) {}
+  // Conversion from float; no assumption is made about the argument.
+  explicit EIGEN_DEVICE_FUNC bfloat16(float x)
+      : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<false>(x)) {}
+  // Following the convention of numpy, converting a complex number keeps
+  // only its real part; the imaginary part is dropped.
+  template<typename RealScalar>
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(const std::complex<RealScalar>& z)
+      : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<false>(static_cast<float>(z.real()))) {}
+  // Implicit conversion to float is allowed because it is lossless.
+  EIGEN_DEVICE_FUNC operator float() const {  // NOLINT: Allow implicit conversion to float, because it is lossless.
+    return bfloat16_impl::bfloat16_to_float(*this);
+  }
+};
+} // namespace Eigen
+namespace std {
+// std::numeric_limits specialization for Eigen::bfloat16. The exponent
+// related constants are forwarded from numeric_limits<float>; every
+// value-returning member is built from an explicit 16-bit pattern through
+// raw_uint16_to_bfloat16 (1 sign bit, 8 exponent bits, 7 fraction bits).
+template<>
+struct numeric_limits<Eigen::bfloat16> {
+  static const bool is_specialized = true;
+  static const bool is_signed = true;
+  static const bool is_integer = false;
+  static const bool is_exact = false;
+  static const bool has_infinity = true;
+  static const bool has_quiet_NaN = true;
+  static const bool has_signaling_NaN = true;
+  // NOTE(review): reported as denorm_absent although denorm_min() below
+  // returns a subnormal bit pattern - confirm which one is intended.
+  static const float_denorm_style has_denorm = std::denorm_absent;
+  static const bool has_denorm_loss = false;
+  static const std::float_round_style round_style = numeric_limits<float>::round_style;
+  static const bool is_iec559 = false;
+  static const bool is_bounded = true;
+  static const bool is_modulo = false;
+  // 7 stored fraction bits plus the implicit leading bit.
+  static const int digits = 8;
+  static const int digits10 = 2;
+  static const int max_digits10 = 4;
+  static const int radix = 2;
+  static const int min_exponent = numeric_limits<float>::min_exponent;
+  static const int min_exponent10 = numeric_limits<float>::min_exponent10;
+  static const int max_exponent = numeric_limits<float>::max_exponent;
+  static const int max_exponent10 = numeric_limits<float>::max_exponent10;
+  static const bool traps = numeric_limits<float>::traps;
+  static const bool tinyness_before = numeric_limits<float>::tinyness_before;
+  // Smallest positive normal value (exponent field 1, zero fraction).
+  static Eigen::bfloat16 (min)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0080); }
+  // Most negative finite value.
+  static Eigen::bfloat16 lowest() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0xff7f); }
+  // Largest finite value.
+  static Eigen::bfloat16 (max)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f7f); }
+  // 2^-7, the distance between 1 and the next representable value.
+  static Eigen::bfloat16 epsilon() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3c00); }
+  // 0.5, encoded as 0x3f00. The pattern must go through
+  // raw_uint16_to_bfloat16: Eigen::bfloat16(0x3f00) would select the explicit
+  // integer constructor and convert the number 16128 instead.
+  static Eigen::bfloat16 round_error() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3f00); }
+  static Eigen::bfloat16 infinity() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f80); }
+  // NaN patterns: all exponent bits set; the quiet NaN also has the most
+  // significant fraction bit set, the signaling NaN has it clear.
+  static Eigen::bfloat16 quiet_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7fc0); }
+  static Eigen::bfloat16 signaling_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f81); }
+  // Smallest positive subnormal bit pattern.
+  static Eigen::bfloat16 denorm_min() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0001); }
+};
+// If std::numeric_limits<T> is specialized, should also specialize
+// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
+// std::numeric_limits<const volatile T>
+// https://stackoverflow.com/a/16519653/
+// Each cv-qualified specialization simply inherits every member from
+// numeric_limits<Eigen::bfloat16>.
+template<>
+struct numeric_limits<const Eigen::bfloat16> : numeric_limits<Eigen::bfloat16> {};
+template<>
+struct numeric_limits<volatile Eigen::bfloat16> : numeric_limits<Eigen::bfloat16> {};
+template<>
+struct numeric_limits<const volatile Eigen::bfloat16> : numeric_limits<Eigen::bfloat16> {};
+} // namespace std
+namespace Eigen {
+namespace bfloat16_impl {
+// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
+// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
+// of the functions, while the latter can only deal with one of them.
+#if !defined(EIGEN_HAS_NATIVE_BF16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for bfloat16 floats
+#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
+// We need to provide emulated *host-side* BF16 operators for clang.
+#pragma push_macro("EIGEN_DEVICE_FUNC")
+#undef EIGEN_DEVICE_FUNC
+#if defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_NATIVE_BF16)
+#define EIGEN_DEVICE_FUNC __host__
+#else // both host and device need emulated ops.
+#define EIGEN_DEVICE_FUNC __host__ __device__
+#endif
+#endif
+// Definitions for CPUs, mostly working through conversion
+// to/from fp32.
+// Sum of two bfloat16 values: added as floats, then converted back.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const bfloat16& a, const bfloat16& b) {
+  const float sum = static_cast<float>(a) + static_cast<float>(b);
+  return bfloat16(sum);
+}
+// bfloat16 + int: the integer is converted to float before the addition.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const bfloat16& a, const int& b) {
+  const float lhs = static_cast<float>(a);
+  const float rhs = static_cast<float>(b);
+  return bfloat16(lhs + rhs);
+}
+// int + bfloat16: mirror image of the overload above.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const int& a, const bfloat16& b) {
+  const float lhs = static_cast<float>(a);
+  const float rhs = static_cast<float>(b);
+  return bfloat16(lhs + rhs);
+}
+// Product of two bfloat16 values: multiplied as floats, then converted back.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator * (const bfloat16& a, const bfloat16& b) {
+  const float product = static_cast<float>(a) * static_cast<float>(b);
+  return bfloat16(product);
+}
+// Difference of two bfloat16 values: subtracted as floats, then converted back.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator - (const bfloat16& a, const bfloat16& b) {
+  const float difference = static_cast<float>(a) - static_cast<float>(b);
+  return bfloat16(difference);
+}
+// Quotient of two bfloat16 values: divided as floats, then converted back.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator / (const bfloat16& a, const bfloat16& b) {
+  const float quotient = static_cast<float>(a) / static_cast<float>(b);
+  return bfloat16(quotient);
+}
+// Negation: toggles the top bit (0x8000) of the stored pattern directly,
+// without going through float.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator - (const bfloat16& a) {
+  bfloat16 negated = a;
+  negated.value ^= 0x8000;
+  return negated;
+}
+// In-place addition: computes the float sum, stores it back into `a` and
+// returns a reference to `a`.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator += (bfloat16& a, const bfloat16& b) {
+  const float sum = static_cast<float>(a) + static_cast<float>(b);
+  a = bfloat16(sum);
+  return a;
+}
+// In-place multiplication: computes the float product, stores it back into
+// `a` and returns a reference to `a`.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator *= (bfloat16& a, const bfloat16& b) {
+  const float product = static_cast<float>(a) * static_cast<float>(b);
+  a = bfloat16(product);
+  return a;
+}
+// In-place subtraction: computes the float difference, stores it back into
+// `a` and returns a reference to `a`.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator -= (bfloat16& a, const bfloat16& b) {
+  const float difference = static_cast<float>(a) - static_cast<float>(b);
+  a = bfloat16(difference);
+  return a;
+}
+// In-place division: computes the float quotient, stores it back into `a`
+// and returns a reference to `a`.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator /= (bfloat16& a, const bfloat16& b) {
+  const float quotient = static_cast<float>(a) / static_cast<float>(b);
+  a = bfloat16(quotient);
+  return a;
+}
+// Pre-increment: adds one to `a` and returns a copy of the updated value
+// (by value, not by reference).
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator++(bfloat16& a) {
+  const bfloat16 one(1);
+  a += one;
+  return a;
+}
+// Pre-decrement: subtracts one from `a` and returns a copy of the updated
+// value (by value, not by reference).
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator--(bfloat16& a) {
+  const bfloat16 one(1);
+  a -= one;
+  return a;
+}
+// Post-increment: increments `a` and returns the value it held beforehand.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator++(bfloat16& a, int) {
+  const bfloat16 previous = a;
+  ++a;
+  return previous;
+}
+// Post-decrement: decrements `a` and returns the value it held beforehand.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator--(bfloat16& a, int) {
+  const bfloat16 previous = a;
+  --a;
+  return previous;
+}
+// Equality: both operands are converted to float and compared with
+// numext::equal_strict.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const bfloat16& a, const bfloat16& b) {
+  const float lhs = a;
+  const float rhs = b;
+  return numext::equal_strict(lhs, rhs);
+}
+// Inequality: both operands are converted to float and compared with
+// numext::not_equal_strict.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const bfloat16& a, const bfloat16& b) {
+  const float lhs = a;
+  const float rhs = b;
+  return numext::not_equal_strict(lhs, rhs);
+}
+// Less-than, evaluated on the float values of both operands.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const bfloat16& a, const bfloat16& b) {
+  const float lhs = a;
+  const float rhs = b;
+  return lhs < rhs;
+}
+// Less-than-or-equal, evaluated on the float values of both operands.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const bfloat16& a, const bfloat16& b) {
+  const float lhs = a;
+  const float rhs = b;
+  return lhs <= rhs;
+}
+// Greater-than, evaluated on the float values of both operands.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const bfloat16& a, const bfloat16& b) {
+  const float lhs = a;
+  const float rhs = b;
+  return lhs > rhs;
+}
+// Greater-than-or-equal, evaluated on the float values of both operands.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const bfloat16& a, const bfloat16& b) {
+  const float lhs = a;
+  const float rhs = b;
+  return lhs >= rhs;
+}
+#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
+#pragma pop_macro("EIGEN_DEVICE_FUNC")
+#endif
+#endif  // Emulate support for bfloat16 floats
+// Divides a bfloat16 by an Index. Both operands are converted to float and the
+// division is carried out there, so the denominator is never first squeezed
+// into bfloat16 (which would lose accuracy).
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator / (const bfloat16& a, Index b) {
+  const float numerator = static_cast<float>(a);
+  const float denominator = static_cast<float>(b);
+  return bfloat16(numerator / denominator);
+}
+// Converts a float to bfloat16 by truncation: the result is the upper 16 bits
+// of the float's 32-bit pattern, with the lower 16 bits discarded.
+//
+// A NaN input is replaced by a quiet NaN of the same sign (0xFFC0 / 0x7FC0),
+// because dropping the low fraction bits of an arbitrary NaN could otherwise
+// leave an all-zero fraction, i.e. an infinity.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw truncate_to_bfloat16(const float v) {
+  __bfloat16_raw output;
+  if (Eigen::numext::isnan EIGEN_NOT_A_MACRO(v)) {
+    output.value = std::signbit(v) ? 0xFFC0: 0x7FC0;
+    return output;
+  }
+  // Obtain the bit pattern with numext::bit_cast instead of reading the float
+  // through a uint16_t pointer: the pointer form violates the strict-aliasing
+  // rules and needed a compile-time endianness switch to pick the right half.
+  // Shifting the 32-bit value selects the high half on any byte order.
+  output.value = static_cast<numext::uint16_t>(numext::bit_cast<numext::uint32_t>(v) >> 16);
+  return output;
+}
+// Wraps a 16-bit pattern in a __bfloat16_raw. The bits are passed to the
+// __bfloat16_raw constructor as-is; no float conversion is performed here.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(numext::uint16_t value) {
+  return __bfloat16_raw(value);
+}
+// Returns the 16-bit pattern stored in a __bfloat16_raw (its `value` member),
+// without any numeric conversion.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR numext::uint16_t raw_bfloat16_as_uint16(const __bfloat16_raw& bf) {
+  return bf.value;
+}
+// float_to_bfloat16_rtne specialization that makes no assumption about the
+// value of its argument (ff). NaN is handled here; every other input is
+// forwarded to float_to_bfloat16_rtne<true>, which does the actual rounding.
+//
+// NaN handling
+// ------------
+// A NaN is squashed to a quiet NaN of the same sign: all exponent bits set
+// plus the most significant fraction bit (0x7FC0 / 0xFFC0). This guarantees
+// that the conversion can never produce an infinity from a NaN.
+//
+// Rounding (round half to nearest even)
+// -------------------------------------
+// View the float32 as
+//
+//   Sign |  Exp (8 bits) | Frac (23 bits)
+//    S     EEEEEEEE         FFFFFFLRTTTTTTTTTTTTTTT
+//
+//   S: sign bit.                     E: exponent bits.
+//   F: first 6 bits of the fraction.
+//   L: 7th fraction bit = least significant bit of the bfloat16 obtained by
+//      truncation.
+//   R: rounding bit (8th fraction bit).
+//   T: sticky bits (remaining 15 fraction bits).
+//
+// The algorithm adds the rounding bias 0x7fff + L to the 32-bit pattern and
+// then drops the low 16 bits. Case by case:
+//
+//   #  L  R  T          bias    outcome
+//   1  0  0  any        0x7fff  Round down (below half). The low 16 bits are
+//                               at most 0x7fff, so the addition may set R but
+//                               cannot carry into L. Inf and -Inf (all
+//                               fraction bits 0) land here and stay Inf.
+//   2  1  0  any        0x8000  Round down (below half). T is unchanged, R
+//                               becomes 1, L stays 1.
+//   3  0  1  all zero   0x7fff  Exact tie, result already even: T becomes all
+//                               ones, no carry, R stays 1, L stays 0.
+//   4  1  1  any        0x8000  Round up (tie or above, towards the even
+//                               neighbour). The carry out of R clears L and
+//                               propagates into F; if every F bit is 1 it
+//                               continues into the exponent, giving the
+//                               smallest value of the next exponent. The
+//                               exponent cannot be all ones here: that would
+//                               be NaN (handled separately) or Inf (case 1).
+//   5  0  1  any bit 1  0x7fff  Round up (above half). The carry out of T
+//                               clears R and sets L.
+//
+// Examples (float32 pattern -> bfloat16 pattern):
+//
+//   0x00028000 -> 0x0002   exact half, already even (case 3): the low 16 bits
+//                          are dropped and nothing carries into F or L.
+//   0x00018000 -> 0x0002   exact half, rounds to the next even value (case 4).
+//   0x007FFFFF -> 0x0080   largest denormal rounds to the smallest normal
+//                          (case 4, carry reaches the exponent).
+//   0x7F7FFFFF -> 0x7F80   largest normal rounds to +Inf (case 4, carry
+//                          reaches the exponent).
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<false>(float ff) {
+#if (defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_HIP_BF16))
+  // Nothing to do here
+#else
+  if (!(Eigen::numext::isnan EIGEN_NOT_A_MACRO(ff))) {
+    // At this point, ff is not a NaN, which is all the <true> specialization
+    // needs for its bias-and-truncate rounding to be valid.
+    return float_to_bfloat16_rtne<true>(ff);
+  }
+  // qNaN magic: all exponent bits set + most significant fraction bit set,
+  // with the sign of the input preserved.
+  __bfloat16_raw quiet_nan;
+  quiet_nan.value = std::signbit(ff) ? 0xFFC0: 0x7FC0;
+  return quiet_nan;
+#endif
+}
+// float_to_bfloat16_rtne specialization for callers that guarantee ff is a
+// normal floating point number, +/-infinity, or zero. It skips the NaN check,
+// which improves the runtime performance of integer -> bfloat16 conversion.
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<true>(float ff) {
+#if (defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_HIP_BF16))
+    // Nothing to do here
+#else
+    const numext::uint32_t bits = numext::bit_cast<numext::uint32_t>(ff);
+    // Bit 16 of the float pattern is the least significant bit of the
+    // resulting bfloat. The bias is 0x7fff when that bit is 0 and 0x8000 when
+    // it is 1, which makes exact ties round towards the even result.
+    const numext::uint32_t bias = 0x7fff + ((bits >> 16) & 1);
+    __bfloat16_raw output;
+    // Add the bias, then keep only the upper 16 bits.
+    output.value = static_cast<numext::uint16_t>((bits + bias) >> 16);
+    return output;
+#endif
+}
+// Widens a bfloat16 bit pattern to float: the 16 stored bits become the upper
+// half of the float's 32-bit pattern and the lower half is zero.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float bfloat16_to_float(__bfloat16_raw h) {
+    // Build the 32-bit pattern as an integer and convert it with
+    // numext::bit_cast. Writing into the float through an unsigned short
+    // pointer violates the strict-aliasing rules and needed a compile-time
+    // endianness switch to pick the right half; the shift places the bits in
+    // the high half on any byte order.
+    return numext::bit_cast<float>(static_cast<numext::uint32_t>(h.value) << 16);
+}
+// --- standard functions ---
+// True when the float value of `a` is an infinity. The name is parenthesized
+// so that a function-like macro called isinf cannot expand here.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const bfloat16& a) {
+  EIGEN_USING_STD(isinf);
+  const float as_float = static_cast<float>(a);
+  return (isinf)(as_float);
+}
+// True when the float value of `a` is a NaN. The name is parenthesized so that
+// a function-like macro called isnan cannot expand here.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const bfloat16& a) {
+  EIGEN_USING_STD(isnan);
+  const float as_float = static_cast<float>(a);
+  return (isnan)(as_float);
+}
+// True when `a` is neither an infinity nor a NaN.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const bfloat16& a) {
+  if (isinf EIGEN_NOT_A_MACRO (a)) {
+    return false;
+  }
+  return !(isnan EIGEN_NOT_A_MACRO (a));
+}
+// Absolute value, computed directly on the bit pattern: masking with 0x7FFF
+// clears the top (sign) bit and leaves the other 15 bits untouched.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 abs(const bfloat16& a) {
+  bfloat16 magnitude;
+  magnitude.value = a.value & 0x7FFF;
+  return magnitude;
+}
+// Exponential: evaluated with ::expf on the float value, then converted back
+// to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 exp(const bfloat16& a) {
+  const float x = static_cast<float>(a);
+  return bfloat16(::expf(x));
+}
+// exp(a) - 1: evaluated with numext::expm1 on the float value, then converted
+// back to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 expm1(const bfloat16& a) {
+  const float x = static_cast<float>(a);
+  return bfloat16(numext::expm1(x));
+}
+// Natural logarithm: evaluated with ::logf on the float value, then converted
+// back to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log(const bfloat16& a) {
+  const float x = static_cast<float>(a);
+  return bfloat16(::logf(x));
+}
+// log(1 + a): evaluated with numext::log1p on the float value, then converted
+// back to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log1p(const bfloat16& a) {
+  const float x = static_cast<float>(a);
+  return bfloat16(numext::log1p(x));
+}
+// Base-10 logarithm: evaluated with ::log10f on the float value, then
+// converted back to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log10(const bfloat16& a) {
+  const float x = static_cast<float>(a);
+  return bfloat16(::log10f(x));
+}
+// Base-2 logarithm, obtained as EIGEN_LOG2E * ln(a) in float and then
+// converted back to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log2(const bfloat16& a) {
+  const float log2_of_e = static_cast<float>(EIGEN_LOG2E);
+  const float natural_log = ::logf(static_cast<float>(a));
+  return bfloat16(log2_of_e * natural_log);
+}
+// Square root: evaluated with ::sqrtf on the float value, then converted back
+// to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sqrt(const bfloat16& a) {
+  const float x = static_cast<float>(a);
+  return bfloat16(::sqrtf(x));
+}
+// a raised to the power b: evaluated with ::powf on the float values, then
+// converted back to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 pow(const bfloat16& a, const bfloat16& b) {
+  const float base = static_cast<float>(a);
+  const float exponent = static_cast<float>(b);
+  return bfloat16(::powf(base, exponent));
+}
+// Sine: evaluated with ::sinf on the float value, then converted back to
+// bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sin(const bfloat16& a) {
+  const float x = static_cast<float>(a);
+  return bfloat16(::sinf(x));
+}
+// Cosine: evaluated with ::cosf on the float value, then converted back to
+// bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cos(const bfloat16& a) {
+  const float x = static_cast<float>(a);
+  return bfloat16(::cosf(x));
+}
+// Tangent: evaluated with ::tanf on the float value, then converted back to
+// bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tan(const bfloat16& a) {
+  const float x = static_cast<float>(a);
+  return bfloat16(::tanf(x));
+}
+// Arc sine: evaluated with ::asinf on the float value, then converted back to
+// bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asin(const bfloat16& a) {
+  const float x = static_cast<float>(a);
+  return bfloat16(::asinf(x));
+}
+// Arc cosine: evaluated with ::acosf on the float value, then converted back
+// to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acos(const bfloat16& a) {
+  const float x = static_cast<float>(a);
+  return bfloat16(::acosf(x));
+}
+// Arc tangent: evaluated with ::atanf on the float value, then converted back
+// to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atan(const bfloat16& a) {
+  const float x = static_cast<float>(a);
+  return bfloat16(::atanf(x));
+}
+// Hyperbolic sine: evaluated with ::sinhf on the float value, then converted
+// back to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sinh(const bfloat16& a) {
+  const float x = static_cast<float>(a);
+  return bfloat16(::sinhf(x));
+}
+// Hyperbolic cosine: evaluated with ::coshf on the float value, then converted
+// back to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cosh(const bfloat16& a) {
+  const float x = static_cast<float>(a);
+  return bfloat16(::coshf(x));
+}
+// Hyperbolic tangent: evaluated with ::tanhf on the float value, then
+// converted back to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tanh(const bfloat16& a) {
+  const float x = static_cast<float>(a);
+  return bfloat16(::tanhf(x));
+}
+#if EIGEN_HAS_CXX11_MATH
+// Inverse hyperbolic sine: evaluated with ::asinhf on the float value, then
+// converted back to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asinh(const bfloat16& a) {
+  const float x = static_cast<float>(a);
+  return bfloat16(::asinhf(x));
+}
+// Inverse hyperbolic cosine: evaluated with ::acoshf on the float value, then
+// converted back to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acosh(const bfloat16& a) {
+  const float x = static_cast<float>(a);
+  return bfloat16(::acoshf(x));
+}
+// Inverse hyperbolic tangent: evaluated with ::atanhf on the float value, then
+// converted back to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atanh(const bfloat16& a) {
+  const float x = static_cast<float>(a);
+  return bfloat16(::atanhf(x));
+}
+#endif
+// Floor: evaluated with ::floorf on the float value, then converted back to
+// bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 floor(const bfloat16& a) {
+  const float x = static_cast<float>(a);
+  return bfloat16(::floorf(x));
+}
+// Ceiling: evaluated with ::ceilf on the float value, then converted back to
+// bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 ceil(const bfloat16& a) {
+  const float x = static_cast<float>(a);
+  return bfloat16(::ceilf(x));
+}
+// Round to integral value via ::rintf on the float value, then converted back
+// to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 rint(const bfloat16& a) {
+  const float x = static_cast<float>(a);
+  return bfloat16(::rintf(x));
+}
+// Round to integral value via ::roundf on the float value, then converted back
+// to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 round(const bfloat16& a) {
+  const float x = static_cast<float>(a);
+  return bfloat16(::roundf(x));
+}
+// Floating-point remainder of a / b: evaluated with ::fmodf on the float
+// values, then converted back to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmod(const bfloat16& a, const bfloat16& b) {
+  const float dividend = static_cast<float>(a);
+  const float divisor = static_cast<float>(b);
+  return bfloat16(::fmodf(dividend, divisor));
+}
+// Returns `b` when b < a (compared as floats) and `a` otherwise, so `a` is
+// also returned whenever the comparison is false (e.g. equal operands).
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 (min)(const bfloat16& a, const bfloat16& b) {
+  const float lhs = static_cast<float>(a);
+  const float rhs = static_cast<float>(b);
+  if (rhs < lhs) {
+    return b;
+  }
+  return a;
+}
+// Returns `b` when a < b (compared as floats) and `a` otherwise, so `a` is
+// also returned whenever the comparison is false (e.g. equal operands).
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 (max)(const bfloat16& a, const bfloat16& b) {
+  const float lhs = static_cast<float>(a);
+  const float rhs = static_cast<float>(b);
+  if (lhs < rhs) {
+    return b;
+  }
+  return a;
+}
+// Minimum of two values as defined by ::fminf, applied to the float values
+// and converted back to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmin(const bfloat16& a, const bfloat16& b) {
+  return bfloat16(::fminf(static_cast<float>(a), static_cast<float>(b)));
+}
+// Maximum of two values as defined by ::fmaxf, applied to the float values
+// and converted back to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmax(const bfloat16& a, const bfloat16& b) {
+  return bfloat16(::fmaxf(static_cast<float>(a), static_cast<float>(b)));
+}
+#ifndef EIGEN_NO_IO
+// Stream insertion: the value is printed as its float equivalent.
+EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const bfloat16& v) {
+  const float printable = static_cast<float>(v);
+  os << printable;
+  return os;
+}
+#endif
+} // namespace bfloat16_impl
+namespace internal {
+// Random number generation for bfloat16, built on std::rand().
+template<>
+struct random_default_impl<bfloat16, false, false>
+{
+  // Returns x + (y - x) * u, where u = rand() / RAND_MAX is computed in float
+  // and converted to bfloat16 before the bfloat16 arithmetic.
+  static inline bfloat16 run(const bfloat16& x, const bfloat16& y)
+  {
+    const bfloat16 unit_fraction(float(std::rand()) / float(RAND_MAX));
+    return x + (y-x) * unit_fraction;
+  }
+  // Default range: endpoints -1 and 1.
+  static inline bfloat16 run()
+  {
+    const bfloat16 lower(-1.f);
+    const bfloat16 upper(1.f);
+    return run(lower, upper);
+  }
+};
+// Marks bfloat16 as an arithmetic type for Eigen's internal type traits.
+template<>
+struct is_arithmetic<bfloat16> {
+  enum { value = true };
+};
+} // namespace internal
+// Numeric traits of bfloat16. All constants are given as raw 16-bit patterns
+// (1 sign bit, 8 exponent bits, 7 fraction bits) so that they can be produced
+// in EIGEN_CONSTEXPR context without a float conversion.
+template<> struct NumTraits<Eigen::bfloat16>
+    : GenericNumTraits<Eigen::bfloat16>
+{
+  enum {
+    IsSigned = true,
+    IsInteger = false,
+    IsComplex = false,
+    RequireInitialization = false
+  };
+
+  // 0x3C00: exponent field 0x78, zero fraction, i.e. 2^-7.
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 epsilon() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0x3C00);
+  }
+  // 0x3D4D: bfloat16(5e-2f).
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 dummy_precision() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0x3D4D);
+  }
+  // 0x7F7F: largest finite value (exponent field 0xFE, all fraction bits set).
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 highest() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0x7F7F);
+  }
+  // 0xFF7F: same pattern as highest() with the sign bit set.
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 lowest() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0xFF7F);
+  }
+  // 0x7F80: all exponent bits set, zero fraction (+infinity).
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 infinity() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0x7F80);
+  }
+  // 0x7FC0: all exponent bits set plus the most significant fraction bit.
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 quiet_NaN() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0x7FC0);
+  }
+};
+} // namespace Eigen
+namespace Eigen {
+namespace numext {
+// numext::isnan specialization for bfloat16; forwards to bfloat16_impl::isnan.
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isnan)(const Eigen::bfloat16& h) {
+  const bool is_nan = (bfloat16_impl::isnan)(h);
+  return is_nan;
+}
+// numext::isinf specialization for bfloat16; forwards to bfloat16_impl::isinf.
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isinf)(const Eigen::bfloat16& h) {
+  const bool is_inf = (bfloat16_impl::isinf)(h);
+  return is_inf;
+}
+// numext::isfinite specialization for bfloat16; forwards to
+// bfloat16_impl::isfinite.
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isfinite)(const Eigen::bfloat16& h) {
+  const bool is_finite = (bfloat16_impl::isfinite)(h);
+  return is_finite;
+}
+// bit_cast from a 16-bit pattern to bfloat16: the bits are wrapped with
+// raw_uint16_to_bfloat16 and used to construct the bfloat16.
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bit_cast<Eigen::bfloat16, uint16_t>(const uint16_t& src) {
+  return Eigen::bfloat16(
+      Eigen::bfloat16_impl::raw_uint16_to_bfloat16(src));
+}
+// bit_cast from bfloat16 to its 16-bit pattern, obtained through
+// raw_bfloat16_as_uint16.
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast<uint16_t, Eigen::bfloat16>(const Eigen::bfloat16& src) {
+  const uint16_t pattern = Eigen::bfloat16_impl::raw_bfloat16_as_uint16(src);
+  return pattern;
+}
+}  // namespace numext
+}  // namespace Eigen
+#if EIGEN_HAS_STD_HASH
+namespace std {
+// std::hash support: the hash of a bfloat16 is its 16-bit pattern widened to
+// std::size_t.
+template <>
+struct hash<Eigen::bfloat16> {
+  EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::bfloat16& a) const {
+    const Eigen::numext::uint16_t pattern =
+        Eigen::numext::bit_cast<Eigen::numext::uint16_t>(a);
+    return static_cast<std::size_t>(pattern);
+  }
+};
+} // namespace std
+#endif
+#endif // EIGEN_BFLOAT16_H