@smake/eigen 1.0.2 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/eigen/Eigen/AccelerateSupport +52 -0
- package/eigen/Eigen/Cholesky +18 -21
- package/eigen/Eigen/CholmodSupport +28 -28
- package/eigen/Eigen/Core +235 -326
- package/eigen/Eigen/Eigenvalues +16 -14
- package/eigen/Eigen/Geometry +21 -24
- package/eigen/Eigen/Householder +9 -8
- package/eigen/Eigen/IterativeLinearSolvers +8 -4
- package/eigen/Eigen/Jacobi +14 -14
- package/eigen/Eigen/KLUSupport +43 -0
- package/eigen/Eigen/LU +16 -20
- package/eigen/Eigen/MetisSupport +12 -12
- package/eigen/Eigen/OrderingMethods +54 -54
- package/eigen/Eigen/PaStiXSupport +23 -20
- package/eigen/Eigen/PardisoSupport +17 -14
- package/eigen/Eigen/QR +18 -21
- package/eigen/Eigen/QtAlignedMalloc +5 -13
- package/eigen/Eigen/SPQRSupport +21 -14
- package/eigen/Eigen/SVD +23 -18
- package/eigen/Eigen/Sparse +1 -4
- package/eigen/Eigen/SparseCholesky +18 -23
- package/eigen/Eigen/SparseCore +18 -17
- package/eigen/Eigen/SparseLU +12 -8
- package/eigen/Eigen/SparseQR +16 -14
- package/eigen/Eigen/StdDeque +5 -2
- package/eigen/Eigen/StdList +5 -2
- package/eigen/Eigen/StdVector +5 -2
- package/eigen/Eigen/SuperLUSupport +30 -24
- package/eigen/Eigen/ThreadPool +80 -0
- package/eigen/Eigen/UmfPackSupport +19 -17
- package/eigen/Eigen/Version +14 -0
- package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
- package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/LDLT.h +377 -401
- package/eigen/Eigen/src/Cholesky/LLT.h +332 -360
- package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
- package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +620 -521
- package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/ArithmeticSequence.h +239 -0
- package/eigen/Eigen/src/Core/Array.h +341 -294
- package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
- package/eigen/Eigen/src/Core/ArrayWrapper.h +127 -171
- package/eigen/Eigen/src/Core/Assign.h +30 -40
- package/eigen/Eigen/src/Core/AssignEvaluator.h +711 -589
- package/eigen/Eigen/src/Core/Assign_MKL.h +130 -125
- package/eigen/Eigen/src/Core/BandMatrix.h +268 -283
- package/eigen/Eigen/src/Core/Block.h +375 -398
- package/eigen/Eigen/src/Core/CommaInitializer.h +86 -97
- package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
- package/eigen/Eigen/src/Core/CoreEvaluators.h +1356 -1026
- package/eigen/Eigen/src/Core/CoreIterators.h +73 -59
- package/eigen/Eigen/src/Core/CwiseBinaryOp.h +114 -132
- package/eigen/Eigen/src/Core/CwiseNullaryOp.h +726 -617
- package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
- package/eigen/Eigen/src/Core/CwiseUnaryOp.h +56 -68
- package/eigen/Eigen/src/Core/CwiseUnaryView.h +132 -95
- package/eigen/Eigen/src/Core/DenseBase.h +632 -571
- package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -624
- package/eigen/Eigen/src/Core/DenseStorage.h +512 -509
- package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
- package/eigen/Eigen/src/Core/Diagonal.h +169 -210
- package/eigen/Eigen/src/Core/DiagonalMatrix.h +351 -274
- package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
- package/eigen/Eigen/src/Core/Dot.h +172 -222
- package/eigen/Eigen/src/Core/EigenBase.h +75 -85
- package/eigen/Eigen/src/Core/Fill.h +138 -0
- package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
- package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -109
- package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
- package/eigen/Eigen/src/Core/GeneralProduct.h +327 -263
- package/eigen/Eigen/src/Core/GenericPacketMath.h +1472 -360
- package/eigen/Eigen/src/Core/GlobalFunctions.h +194 -151
- package/eigen/Eigen/src/Core/IO.h +147 -139
- package/eigen/Eigen/src/Core/IndexedView.h +321 -0
- package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
- package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/Inverse.h +56 -66
- package/eigen/Eigen/src/Core/Map.h +124 -142
- package/eigen/Eigen/src/Core/MapBase.h +256 -281
- package/eigen/Eigen/src/Core/MathFunctions.h +1620 -938
- package/eigen/Eigen/src/Core/MathFunctionsImpl.h +233 -71
- package/eigen/Eigen/src/Core/Matrix.h +491 -416
- package/eigen/Eigen/src/Core/MatrixBase.h +468 -453
- package/eigen/Eigen/src/Core/NestByValue.h +66 -85
- package/eigen/Eigen/src/Core/NoAlias.h +79 -85
- package/eigen/Eigen/src/Core/NumTraits.h +235 -148
- package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +253 -0
- package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
- package/eigen/Eigen/src/Core/PlainObjectBase.h +871 -894
- package/eigen/Eigen/src/Core/Product.h +260 -139
- package/eigen/Eigen/src/Core/ProductEvaluators.h +863 -714
- package/eigen/Eigen/src/Core/Random.h +161 -136
- package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
- package/eigen/Eigen/src/Core/RealView.h +250 -0
- package/eigen/Eigen/src/Core/Redux.h +366 -336
- package/eigen/Eigen/src/Core/Ref.h +308 -209
- package/eigen/Eigen/src/Core/Replicate.h +94 -106
- package/eigen/Eigen/src/Core/Reshaped.h +398 -0
- package/eigen/Eigen/src/Core/ReturnByValue.h +49 -55
- package/eigen/Eigen/src/Core/Reverse.h +136 -145
- package/eigen/Eigen/src/Core/Select.h +70 -140
- package/eigen/Eigen/src/Core/SelfAdjointView.h +262 -285
- package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
- package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
- package/eigen/Eigen/src/Core/Solve.h +97 -111
- package/eigen/Eigen/src/Core/SolveTriangular.h +131 -129
- package/eigen/Eigen/src/Core/SolverBase.h +138 -101
- package/eigen/Eigen/src/Core/StableNorm.h +156 -160
- package/eigen/Eigen/src/Core/StlIterators.h +619 -0
- package/eigen/Eigen/src/Core/Stride.h +91 -88
- package/eigen/Eigen/src/Core/Swap.h +70 -38
- package/eigen/Eigen/src/Core/Transpose.h +295 -273
- package/eigen/Eigen/src/Core/Transpositions.h +272 -317
- package/eigen/Eigen/src/Core/TriangularMatrix.h +670 -755
- package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
- package/eigen/Eigen/src/Core/VectorwiseOp.h +668 -630
- package/eigen/Eigen/src/Core/Visitor.h +480 -216
- package/eigen/Eigen/src/Core/arch/AVX/Complex.h +407 -293
- package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +79 -388
- package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2935 -491
- package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
- package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +279 -22
- package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +472 -0
- package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +85 -333
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +2490 -649
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
- package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +277 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +521 -298
- package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +39 -280
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +3686 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +205 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +901 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +3391 -723
- package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
- package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +866 -0
- package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +113 -14
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +2634 -0
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +227 -0
- package/eigen/Eigen/src/Core/arch/Default/Half.h +1091 -0
- package/eigen/Eigen/src/Core/arch/Default/Settings.h +11 -13
- package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
- package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +104 -0
- package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1712 -0
- package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
- package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +77 -0
- package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
- package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
- package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
- package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
- package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
- package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
- package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
- package/eigen/Eigen/src/Core/arch/MSA/Complex.h +620 -0
- package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +379 -0
- package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1237 -0
- package/eigen/Eigen/src/Core/arch/NEON/Complex.h +531 -289
- package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +243 -0
- package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +50 -73
- package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +5915 -579
- package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1642 -0
- package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
- package/eigen/Eigen/src/Core/arch/SSE/Complex.h +366 -334
- package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +40 -514
- package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +2164 -675
- package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
- package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +188 -35
- package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +48 -0
- package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +674 -0
- package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +52 -0
- package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +227 -0
- package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +303 -0
- package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +576 -0
- package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +83 -0
- package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +434 -261
- package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +160 -53
- package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +1073 -605
- package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +123 -117
- package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +594 -322
- package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +204 -118
- package/eigen/Eigen/src/Core/functors/StlFunctors.h +110 -97
- package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
- package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1158 -530
- package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2329 -1333
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +328 -364
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +191 -178
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +85 -82
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +396 -542
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
- package/eigen/Eigen/src/Core/products/Parallelizer.h +208 -92
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +331 -375
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +139 -146
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
- package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
- package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -46
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -275
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
- package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +70 -93
- package/eigen/Eigen/src/Core/util/Assert.h +158 -0
- package/eigen/Eigen/src/Core/util/BlasUtil.h +413 -290
- package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +543 -0
- package/eigen/Eigen/src/Core/util/Constants.h +314 -263
- package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -78
- package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
- package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +450 -224
- package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
- package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
- package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +487 -0
- package/eigen/Eigen/src/Core/util/IntegralConstant.h +279 -0
- package/eigen/Eigen/src/Core/util/MKL_support.h +39 -30
- package/eigen/Eigen/src/Core/util/Macros.h +939 -646
- package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
- package/eigen/Eigen/src/Core/util/Memory.h +1042 -650
- package/eigen/Eigen/src/Core/util/Meta.h +618 -426
- package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
- package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
- package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
- package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
- package/eigen/Eigen/src/Core/util/StaticAssert.h +51 -164
- package/eigen/Eigen/src/Core/util/SymbolicIndex.h +445 -0
- package/eigen/Eigen/src/Core/util/XprHelper.h +793 -538
- package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
- package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
- package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
- package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
- package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
- package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +91 -107
- package/eigen/Eigen/src/Eigenvalues/RealQZ.h +539 -606
- package/eigen/Eigen/src/Eigenvalues/RealSchur.h +348 -382
- package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +579 -600
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
- package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +434 -461
- package/eigen/Eigen/src/Geometry/AlignedBox.h +307 -214
- package/eigen/Eigen/src/Geometry/AngleAxis.h +135 -137
- package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
- package/eigen/Eigen/src/Geometry/Homogeneous.h +289 -333
- package/eigen/Eigen/src/Geometry/Hyperplane.h +152 -161
- package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -145
- package/eigen/Eigen/src/Geometry/ParametrizedLine.h +141 -104
- package/eigen/Eigen/src/Geometry/Quaternion.h +595 -497
- package/eigen/Eigen/src/Geometry/Rotation2D.h +110 -108
- package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
- package/eigen/Eigen/src/Geometry/Scaling.h +115 -90
- package/eigen/Eigen/src/Geometry/Transform.h +896 -953
- package/eigen/Eigen/src/Geometry/Translation.h +100 -98
- package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
- package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +154 -0
- package/eigen/Eigen/src/Householder/BlockHouseholder.h +54 -42
- package/eigen/Eigen/src/Householder/Householder.h +104 -122
- package/eigen/Eigen/src/Householder/HouseholderSequence.h +416 -382
- package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +153 -166
- package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +127 -138
- package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +95 -124
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +269 -267
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +246 -259
- package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +218 -217
- package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +80 -103
- package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +59 -63
- package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Jacobi/Jacobi.h +256 -291
- package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/KLUSupport/KLUSupport.h +339 -0
- package/eigen/Eigen/src/LU/Determinant.h +60 -63
- package/eigen/Eigen/src/LU/FullPivLU.h +561 -626
- package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/LU/InverseImpl.h +213 -275
- package/eigen/Eigen/src/LU/PartialPivLU.h +407 -435
- package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
- package/eigen/Eigen/src/LU/arch/InverseSize4.h +353 -0
- package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
- package/eigen/Eigen/src/OrderingMethods/Amd.h +250 -282
- package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +950 -1103
- package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/OrderingMethods/Ordering.h +111 -122
- package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
- package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -429
- package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +494 -473
- package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
- package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +223 -137
- package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +517 -460
- package/eigen/Eigen/src/QR/HouseholderQR.h +412 -278
- package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
- package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +263 -261
- package/eigen/Eigen/src/SVD/BDCSVD.h +872 -679
- package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
- package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SVD/JacobiSVD.h +585 -543
- package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
- package/eigen/Eigen/src/SVD/SVDBase.h +281 -160
- package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +202 -237
- package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +769 -590
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +318 -129
- package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
- package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -236
- package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +140 -184
- package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCore/SparseAssign.h +174 -111
- package/eigen/Eigen/src/SparseCore/SparseBlock.h +408 -477
- package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
- package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +531 -280
- package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +559 -347
- package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
- package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +185 -191
- package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
- package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
- package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
- package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
- package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1614 -1142
- package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -357
- package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
- package/eigen/Eigen/src/SparseCore/SparseProduct.h +100 -91
- package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
- package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
- package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +371 -414
- package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
- package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
- package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
- package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
- package/eigen/Eigen/src/SparseCore/SparseUtil.h +146 -115
- package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
- package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
- package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
- package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseLU/SparseLU.h +814 -618
- package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
- package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
- package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
- package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +273 -255
- package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
- package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
- package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +90 -101
- package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
- package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
- package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +125 -133
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
- package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
- package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
- package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
- package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseQR/SparseQR.h +451 -490
- package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -105
- package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
- package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
- package/eigen/Eigen/src/StlSupport/details.h +48 -50
- package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -732
- package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
- package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
- package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
- package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
- package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
- package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
- package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
- package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
- package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
- package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
- package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
- package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
- package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +480 -380
- package/eigen/Eigen/src/misc/Image.h +41 -43
- package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/misc/Kernel.h +39 -41
- package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
- package/eigen/Eigen/src/misc/blas.h +83 -426
- package/eigen/Eigen/src/misc/lapacke.h +9976 -16182
- package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
- package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
- package/eigen/Eigen/src/plugins/BlockMethods.inc +1370 -0
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
- package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.inc +167 -0
- package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
- package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
- package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
- package/lib/LibEigen.d.ts +4 -0
- package/lib/LibEigen.js +14 -0
- package/lib/index.d.ts +1 -1
- package/lib/index.js +7 -3
- package/package.json +2 -10
- package/eigen/Eigen/CMakeLists.txt +0 -19
- package/eigen/Eigen/src/Core/BooleanRedux.h +0 -164
- package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -103
- package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
- package/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +0 -91
- package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
- package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
- package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
- package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
- package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
- package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
- package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
- package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
- package/eigen/Eigen/src/misc/lapack.h +0 -152
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -332
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -552
- package/eigen/Eigen/src/plugins/BlockMethods.h +0 -1058
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
- package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -163
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -85
- package/lib/eigen.d.ts +0 -2
- package/lib/eigen.js +0 -15
|
@@ -10,378 +10,130 @@
|
|
|
10
10
|
#ifndef THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
|
|
11
11
|
#define THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
|
|
12
12
|
|
|
13
|
+
// IWYU pragma: private
|
|
14
|
+
#include "../../InternalHeaderCheck.h"
|
|
15
|
+
|
|
13
16
|
namespace Eigen {
|
|
14
17
|
|
|
15
18
|
namespace internal {
|
|
19
|
+
EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet16f)
|
|
20
|
+
EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet8d)
|
|
16
21
|
|
|
17
|
-
// Disable the code for older versions of gcc that don't support many of the required avx512 instrinsics.
|
|
18
|
-
#if EIGEN_GNUC_AT_LEAST(5, 3)
|
|
19
|
-
|
|
20
|
-
#define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \
|
|
21
|
-
const Packet16f p16f_##NAME = pset1<Packet16f>(X)
|
|
22
|
-
|
|
23
|
-
#define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \
|
|
24
|
-
const Packet16f p16f_##NAME = (__m512)pset1<Packet16i>(X)
|
|
25
|
-
|
|
26
|
-
#define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \
|
|
27
|
-
const Packet8d p8d_##NAME = pset1<Packet8d>(X)
|
|
28
|
-
|
|
29
|
-
#define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \
|
|
30
|
-
const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X))
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
// Natural logarithm
|
|
34
|
-
// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
|
|
35
|
-
// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
|
|
36
|
-
// be easily approximated by a polynomial centered on m=1 for stability.
|
|
37
|
-
#if defined(EIGEN_VECTORIZE_AVX512DQ)
|
|
38
22
|
template <>
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
_EIGEN_DECLARE_CONST_Packet16f(126f, 126.0f);
|
|
45
|
-
|
|
46
|
-
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inv_mant_mask, ~0x7f800000);
|
|
47
|
-
|
|
48
|
-
// The smallest non denormalized float number.
|
|
49
|
-
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
|
|
50
|
-
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
|
|
51
|
-
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000);
|
|
52
|
-
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
|
|
53
|
-
|
|
54
|
-
// Polynomial coefficients.
|
|
55
|
-
_EIGEN_DECLARE_CONST_Packet16f(cephes_SQRTHF, 0.707106781186547524f);
|
|
56
|
-
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p0, 7.0376836292E-2f);
|
|
57
|
-
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p1, -1.1514610310E-1f);
|
|
58
|
-
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p2, 1.1676998740E-1f);
|
|
59
|
-
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p3, -1.2420140846E-1f);
|
|
60
|
-
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p4, +1.4249322787E-1f);
|
|
61
|
-
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p5, -1.6668057665E-1f);
|
|
62
|
-
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p6, +2.0000714765E-1f);
|
|
63
|
-
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p7, -2.4999993993E-1f);
|
|
64
|
-
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p8, +3.3333331174E-1f);
|
|
65
|
-
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_q1, -2.12194440e-4f);
|
|
66
|
-
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f);
|
|
67
|
-
|
|
68
|
-
// invalid_mask is set to true when x is NaN
|
|
69
|
-
__mmask16 invalid_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
|
|
70
|
-
__mmask16 iszero_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_OQ);
|
|
71
|
-
|
|
72
|
-
// Truncate input values to the minimum positive normal.
|
|
73
|
-
x = pmax(x, p16f_min_norm_pos);
|
|
74
|
-
|
|
75
|
-
// Extract the shifted exponents.
|
|
76
|
-
Packet16f emm0 = _mm512_cvtepi32_ps(_mm512_srli_epi32((__m512i)x, 23));
|
|
77
|
-
Packet16f e = _mm512_sub_ps(emm0, p16f_126f);
|
|
78
|
-
|
|
79
|
-
// Set the exponents to -1, i.e. x are in the range [0.5,1).
|
|
80
|
-
x = _mm512_and_ps(x, p16f_inv_mant_mask);
|
|
81
|
-
x = _mm512_or_ps(x, p16f_half);
|
|
82
|
-
|
|
83
|
-
// part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
|
|
84
|
-
// and shift by -1. The values are then centered around 0, which improves
|
|
85
|
-
// the stability of the polynomial evaluation.
|
|
86
|
-
// if( x < SQRTHF ) {
|
|
87
|
-
// e -= 1;
|
|
88
|
-
// x = x + x - 1.0;
|
|
89
|
-
// } else { x = x - 1.0; }
|
|
90
|
-
__mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ);
|
|
91
|
-
Packet16f tmp = _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), x);
|
|
92
|
-
x = psub(x, p16f_1);
|
|
93
|
-
e = psub(e, _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), p16f_1));
|
|
94
|
-
x = padd(x, tmp);
|
|
95
|
-
|
|
96
|
-
Packet16f x2 = pmul(x, x);
|
|
97
|
-
Packet16f x3 = pmul(x2, x);
|
|
98
|
-
|
|
99
|
-
// Evaluate the polynomial approximant of degree 8 in three parts, probably
|
|
100
|
-
// to improve instruction-level parallelism.
|
|
101
|
-
Packet16f y, y1, y2;
|
|
102
|
-
y = pmadd(p16f_cephes_log_p0, x, p16f_cephes_log_p1);
|
|
103
|
-
y1 = pmadd(p16f_cephes_log_p3, x, p16f_cephes_log_p4);
|
|
104
|
-
y2 = pmadd(p16f_cephes_log_p6, x, p16f_cephes_log_p7);
|
|
105
|
-
y = pmadd(y, x, p16f_cephes_log_p2);
|
|
106
|
-
y1 = pmadd(y1, x, p16f_cephes_log_p5);
|
|
107
|
-
y2 = pmadd(y2, x, p16f_cephes_log_p8);
|
|
108
|
-
y = pmadd(y, x3, y1);
|
|
109
|
-
y = pmadd(y, x3, y2);
|
|
110
|
-
y = pmul(y, x3);
|
|
111
|
-
|
|
112
|
-
// Add the logarithm of the exponent back to the result of the interpolation.
|
|
113
|
-
y1 = pmul(e, p16f_cephes_log_q1);
|
|
114
|
-
tmp = pmul(x2, p16f_half);
|
|
115
|
-
y = padd(y, y1);
|
|
116
|
-
x = psub(x, tmp);
|
|
117
|
-
y2 = pmul(e, p16f_cephes_log_q2);
|
|
118
|
-
x = padd(x, y);
|
|
119
|
-
x = padd(x, y2);
|
|
120
|
-
|
|
121
|
-
__mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ);
|
|
122
|
-
// Filter out invalid inputs, i.e.:
|
|
123
|
-
// - negative arg will be NAN,
|
|
124
|
-
// - 0 will be -INF.
|
|
125
|
-
// - +INF will be +INF
|
|
126
|
-
return _mm512_mask_blend_ps(iszero_mask,
|
|
127
|
-
_mm512_mask_blend_ps(invalid_mask,
|
|
128
|
-
_mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf),
|
|
129
|
-
p16f_nan),
|
|
130
|
-
p16f_minus_inf);
|
|
23
|
+
EIGEN_STRONG_INLINE Packet16h pfrexp(const Packet16h& a, Packet16h& exponent) {
|
|
24
|
+
Packet16f fexponent;
|
|
25
|
+
const Packet16h out = float2half(pfrexp<Packet16f>(half2float(a), fexponent));
|
|
26
|
+
exponent = float2half(fexponent);
|
|
27
|
+
return out;
|
|
131
28
|
}
|
|
132
29
|
|
|
133
|
-
#endif
|
|
134
|
-
|
|
135
|
-
// Exponential function. Works by writing "x = m*log(2) + r" where
|
|
136
|
-
// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
|
|
137
|
-
// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
|
|
138
30
|
template <>
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
_EIGEN_DECLARE_CONST_Packet16f(1, 1.0f);
|
|
142
|
-
_EIGEN_DECLARE_CONST_Packet16f(half, 0.5f);
|
|
143
|
-
_EIGEN_DECLARE_CONST_Packet16f(127, 127.0f);
|
|
144
|
-
|
|
145
|
-
_EIGEN_DECLARE_CONST_Packet16f(exp_hi, 88.3762626647950f);
|
|
146
|
-
_EIGEN_DECLARE_CONST_Packet16f(exp_lo, -88.3762626647949f);
|
|
147
|
-
|
|
148
|
-
_EIGEN_DECLARE_CONST_Packet16f(cephes_LOG2EF, 1.44269504088896341f);
|
|
149
|
-
|
|
150
|
-
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p0, 1.9875691500E-4f);
|
|
151
|
-
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p1, 1.3981999507E-3f);
|
|
152
|
-
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p2, 8.3334519073E-3f);
|
|
153
|
-
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p3, 4.1665795894E-2f);
|
|
154
|
-
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p4, 1.6666665459E-1f);
|
|
155
|
-
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p5, 5.0000001201E-1f);
|
|
156
|
-
|
|
157
|
-
// Clamp x.
|
|
158
|
-
Packet16f x = pmax(pmin(_x, p16f_exp_hi), p16f_exp_lo);
|
|
159
|
-
|
|
160
|
-
// Express exp(x) as exp(m*ln(2) + r), start by extracting
|
|
161
|
-
// m = floor(x/ln(2) + 0.5).
|
|
162
|
-
Packet16f m = _mm512_floor_ps(pmadd(x, p16f_cephes_LOG2EF, p16f_half));
|
|
163
|
-
|
|
164
|
-
// Get r = x - m*ln(2). Note that we can do this without losing more than one
|
|
165
|
-
// ulp precision due to the FMA instruction.
|
|
166
|
-
_EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453f);
|
|
167
|
-
Packet16f r = _mm512_fmadd_ps(m, p16f_nln2, x);
|
|
168
|
-
Packet16f r2 = pmul(r, r);
|
|
169
|
-
|
|
170
|
-
// TODO(gonnet): Split into odd/even polynomials and try to exploit
|
|
171
|
-
// instruction-level parallelism.
|
|
172
|
-
Packet16f y = p16f_cephes_exp_p0;
|
|
173
|
-
y = pmadd(y, r, p16f_cephes_exp_p1);
|
|
174
|
-
y = pmadd(y, r, p16f_cephes_exp_p2);
|
|
175
|
-
y = pmadd(y, r, p16f_cephes_exp_p3);
|
|
176
|
-
y = pmadd(y, r, p16f_cephes_exp_p4);
|
|
177
|
-
y = pmadd(y, r, p16f_cephes_exp_p5);
|
|
178
|
-
y = pmadd(y, r2, r);
|
|
179
|
-
y = padd(y, p16f_1);
|
|
180
|
-
|
|
181
|
-
// Build emm0 = 2^m.
|
|
182
|
-
Packet16i emm0 = _mm512_cvttps_epi32(padd(m, p16f_127));
|
|
183
|
-
emm0 = _mm512_slli_epi32(emm0, 23);
|
|
184
|
-
|
|
185
|
-
// Return 2^m * exp(r).
|
|
186
|
-
return pmax(pmul(y, _mm512_castsi512_ps(emm0)), _x);
|
|
31
|
+
EIGEN_STRONG_INLINE Packet16h pldexp(const Packet16h& a, const Packet16h& exponent) {
|
|
32
|
+
return float2half(pldexp<Packet16f>(half2float(a), half2float(exponent)));
|
|
187
33
|
}
|
|
188
34
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
_EIGEN_DECLARE_CONST_Packet8d(exp_hi, 709.437);
|
|
198
|
-
_EIGEN_DECLARE_CONST_Packet8d(exp_lo, -709.436139303);
|
|
199
|
-
|
|
200
|
-
_EIGEN_DECLARE_CONST_Packet8d(cephes_LOG2EF, 1.4426950408889634073599);
|
|
201
|
-
|
|
202
|
-
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p0, 1.26177193074810590878e-4);
|
|
203
|
-
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p1, 3.02994407707441961300e-2);
|
|
204
|
-
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p2, 9.99999999999999999910e-1);
|
|
205
|
-
|
|
206
|
-
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q0, 3.00198505138664455042e-6);
|
|
207
|
-
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q1, 2.52448340349684104192e-3);
|
|
208
|
-
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q2, 2.27265548208155028766e-1);
|
|
209
|
-
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q3, 2.00000000000000000009e0);
|
|
210
|
-
|
|
211
|
-
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C1, 0.693145751953125);
|
|
212
|
-
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C2, 1.42860682030941723212e-6);
|
|
213
|
-
|
|
214
|
-
// clamp x
|
|
215
|
-
x = pmax(pmin(x, p8d_exp_hi), p8d_exp_lo);
|
|
216
|
-
|
|
217
|
-
// Express exp(x) as exp(g + n*log(2)).
|
|
218
|
-
const Packet8d n =
|
|
219
|
-
_mm512_mul_round_pd(p8d_cephes_LOG2EF, x, _MM_FROUND_TO_NEAREST_INT);
|
|
220
|
-
|
|
221
|
-
// Get the remainder modulo log(2), i.e. the "g" described above. Subtract
|
|
222
|
-
// n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
|
|
223
|
-
// digits right.
|
|
224
|
-
const Packet8d nC1 = pmul(n, p8d_cephes_exp_C1);
|
|
225
|
-
const Packet8d nC2 = pmul(n, p8d_cephes_exp_C2);
|
|
226
|
-
x = psub(x, nC1);
|
|
227
|
-
x = psub(x, nC2);
|
|
228
|
-
|
|
229
|
-
const Packet8d x2 = pmul(x, x);
|
|
230
|
-
|
|
231
|
-
// Evaluate the numerator polynomial of the rational interpolant.
|
|
232
|
-
Packet8d px = p8d_cephes_exp_p0;
|
|
233
|
-
px = pmadd(px, x2, p8d_cephes_exp_p1);
|
|
234
|
-
px = pmadd(px, x2, p8d_cephes_exp_p2);
|
|
235
|
-
px = pmul(px, x);
|
|
236
|
-
|
|
237
|
-
// Evaluate the denominator polynomial of the rational interpolant.
|
|
238
|
-
Packet8d qx = p8d_cephes_exp_q0;
|
|
239
|
-
qx = pmadd(qx, x2, p8d_cephes_exp_q1);
|
|
240
|
-
qx = pmadd(qx, x2, p8d_cephes_exp_q2);
|
|
241
|
-
qx = pmadd(qx, x2, p8d_cephes_exp_q3);
|
|
242
|
-
|
|
243
|
-
// I don't really get this bit, copied from the SSE2 routines, so...
|
|
244
|
-
// TODO(gonnet): Figure out what is going on here, perhaps find a better
|
|
245
|
-
// rational interpolant?
|
|
246
|
-
x = _mm512_div_pd(px, psub(qx, px));
|
|
247
|
-
x = pmadd(p8d_2, x, p8d_1);
|
|
248
|
-
|
|
249
|
-
// Build e=2^n.
|
|
250
|
-
const Packet8d e = _mm512_castsi512_pd(_mm512_slli_epi64(
|
|
251
|
-
_mm512_add_epi64(_mm512_cvtpd_epi64(n), _mm512_set1_epi64(1023)), 52));
|
|
35
|
+
template <>
|
|
36
|
+
EIGEN_STRONG_INLINE Packet16bf pfrexp(const Packet16bf& a, Packet16bf& exponent) {
|
|
37
|
+
Packet16f fexponent;
|
|
38
|
+
const Packet16bf out = F32ToBf16(pfrexp<Packet16f>(Bf16ToF32(a), fexponent));
|
|
39
|
+
exponent = F32ToBf16(fexponent);
|
|
40
|
+
return out;
|
|
41
|
+
}
|
|
252
42
|
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
return
|
|
256
|
-
|
|
43
|
+
template <>
|
|
44
|
+
EIGEN_STRONG_INLINE Packet16bf pldexp(const Packet16bf& a, const Packet16bf& exponent) {
|
|
45
|
+
return F32ToBf16(pldexp<Packet16f>(Bf16ToF32(a), Bf16ToF32(exponent)));
|
|
46
|
+
}
|
|
257
47
|
|
|
258
|
-
// Functions for sqrt.
|
|
259
|
-
// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
|
|
260
|
-
// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
|
|
261
|
-
// exact solution. The main advantage of this approach is not just speed, but
|
|
262
|
-
// also the fact that it can be inlined and pipelined with other computations,
|
|
263
|
-
// further reducing its effective latency.
|
|
264
48
|
#if EIGEN_FAST_MATH
|
|
265
49
|
template <>
|
|
266
|
-
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
|
267
|
-
|
|
268
|
-
Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f));
|
|
269
|
-
__mmask16 denormal_mask = _mm512_kand(
|
|
270
|
-
_mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
|
|
271
|
-
_CMP_LT_OQ),
|
|
272
|
-
_mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
|
|
273
|
-
|
|
274
|
-
Packet16f x = _mm512_rsqrt14_ps(_x);
|
|
275
|
-
|
|
276
|
-
// Do a single step of Newton's iteration.
|
|
277
|
-
x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f)));
|
|
278
|
-
|
|
279
|
-
// Flush results for denormals to zero.
|
|
280
|
-
return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
|
|
50
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f psqrt<Packet16f>(const Packet16f& x) {
|
|
51
|
+
return generic_sqrt_newton_step<Packet16f>::run(x, _mm512_rsqrt14_ps(x));
|
|
281
52
|
}
|
|
282
53
|
|
|
283
54
|
template <>
|
|
284
|
-
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
|
285
|
-
|
|
286
|
-
Packet8d
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
_mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
|
|
291
|
-
|
|
292
|
-
Packet8d x = _mm512_rsqrt14_pd(_x);
|
|
293
|
-
|
|
294
|
-
// Do a single step of Newton's iteration.
|
|
295
|
-
x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
|
|
296
|
-
|
|
297
|
-
// Do a second step of Newton's iteration.
|
|
298
|
-
x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
|
|
299
|
-
|
|
300
|
-
return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
|
|
55
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d psqrt<Packet8d>(const Packet8d& x) {
|
|
56
|
+
#ifdef EIGEN_VECTORIZE_AVX512ER
|
|
57
|
+
return generic_sqrt_newton_step<Packet8d, /*Steps=*/1>::run(x, _mm512_rsqrt28_pd(x));
|
|
58
|
+
#else
|
|
59
|
+
return generic_sqrt_newton_step<Packet8d, /*Steps=*/2>::run(x, _mm512_rsqrt14_pd(x));
|
|
60
|
+
#endif
|
|
301
61
|
}
|
|
302
62
|
#else
|
|
303
63
|
template <>
|
|
304
64
|
EIGEN_STRONG_INLINE Packet16f psqrt<Packet16f>(const Packet16f& x) {
|
|
305
65
|
return _mm512_sqrt_ps(x);
|
|
306
66
|
}
|
|
67
|
+
|
|
307
68
|
template <>
|
|
308
69
|
EIGEN_STRONG_INLINE Packet8d psqrt<Packet8d>(const Packet8d& x) {
|
|
309
70
|
return _mm512_sqrt_pd(x);
|
|
310
71
|
}
|
|
311
72
|
#endif
|
|
312
73
|
|
|
313
|
-
//
|
|
314
|
-
|
|
315
|
-
// and fill in NaN/Inf where needed. Note that this function only exists as an
|
|
316
|
-
// iterative version for doubles since there is no instruction for diretly
|
|
317
|
-
// computing the reciprocal square root in AVX-512.
|
|
318
|
-
#ifdef EIGEN_FAST_MATH
|
|
74
|
+
// prsqrt for float.
|
|
75
|
+
#if defined(EIGEN_VECTORIZE_AVX512ER)
|
|
319
76
|
template <>
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000);
|
|
323
|
-
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
|
|
324
|
-
_EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
|
|
325
|
-
_EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
|
|
326
|
-
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
|
|
327
|
-
|
|
328
|
-
Packet16f neg_half = pmul(_x, p16f_minus_half);
|
|
329
|
-
|
|
330
|
-
// select only the inverse sqrt of positive normal inputs (denormals are
|
|
331
|
-
// flushed to zero and cause infs as well).
|
|
332
|
-
__mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
|
|
333
|
-
Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps());
|
|
334
|
-
|
|
335
|
-
// Fill in NaNs and Infs for the negative/zero entries.
|
|
336
|
-
__mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
|
|
337
|
-
Packet16f infs_and_nans = _mm512_mask_blend_ps(
|
|
338
|
-
neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan);
|
|
339
|
-
|
|
340
|
-
// Do a single step of Newton's iteration.
|
|
341
|
-
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
|
|
342
|
-
|
|
343
|
-
// Insert NaNs and Infs in all the right places.
|
|
344
|
-
return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans);
|
|
77
|
+
EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
|
|
78
|
+
return _mm512_rsqrt28_ps(x);
|
|
345
79
|
}
|
|
80
|
+
#elif EIGEN_FAST_MATH
|
|
346
81
|
|
|
347
82
|
template <>
|
|
348
|
-
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
|
349
|
-
|
|
350
|
-
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
|
|
351
|
-
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(nan, 0x7ff1000000000000LL);
|
|
352
|
-
_EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
|
|
353
|
-
_EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
|
|
354
|
-
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
|
|
355
|
-
|
|
356
|
-
Packet8d neg_half = pmul(_x, p8d_minus_half);
|
|
357
|
-
|
|
358
|
-
// select only the inverse sqrt of positive normal inputs (denormals are
|
|
359
|
-
// flushed to zero and cause infs as well).
|
|
360
|
-
__mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
|
|
361
|
-
Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd());
|
|
362
|
-
|
|
363
|
-
// Fill in NaNs and Infs for the negative/zero entries.
|
|
364
|
-
__mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
|
|
365
|
-
Packet8d infs_and_nans = _mm512_mask_blend_pd(
|
|
366
|
-
neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan);
|
|
367
|
-
|
|
368
|
-
// Do a first step of Newton's iteration.
|
|
369
|
-
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
|
|
370
|
-
|
|
371
|
-
// Do a second step of Newton's iteration.
|
|
372
|
-
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
|
|
373
|
-
|
|
374
|
-
// Insert NaNs and Infs in all the right places.
|
|
375
|
-
return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans);
|
|
83
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f prsqrt<Packet16f>(const Packet16f& x) {
|
|
84
|
+
return generic_rsqrt_newton_step<Packet16f, /*Steps=*/1>::run(x, _mm512_rsqrt14_ps(x));
|
|
376
85
|
}
|
|
377
|
-
#
|
|
86
|
+
#endif
|
|
87
|
+
|
|
88
|
+
// prsqrt for double.
|
|
89
|
+
#if EIGEN_FAST_MATH
|
|
378
90
|
template <>
|
|
379
|
-
|
|
380
|
-
|
|
91
|
+
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d prsqrt<Packet8d>(const Packet8d& x) {
|
|
92
|
+
#ifdef EIGEN_VECTORIZE_AVX512ER
|
|
93
|
+
return generic_rsqrt_newton_step<Packet8d, /*Steps=*/1>::run(x, _mm512_rsqrt28_pd(x));
|
|
94
|
+
#else
|
|
95
|
+
return generic_rsqrt_newton_step<Packet8d, /*Steps=*/2>::run(x, _mm512_rsqrt14_pd(x));
|
|
96
|
+
#endif
|
|
381
97
|
}
|
|
98
|
+
|
|
99
|
+
template <>
|
|
100
|
+
EIGEN_STRONG_INLINE Packet16f preciprocal<Packet16f>(const Packet16f& a) {
|
|
101
|
+
#ifdef EIGEN_VECTORIZE_AVX512ER
|
|
102
|
+
return _mm512_rcp28_ps(a);
|
|
103
|
+
#else
|
|
104
|
+
return generic_reciprocal_newton_step<Packet16f, /*Steps=*/1>::run(a, _mm512_rcp14_ps(a));
|
|
382
105
|
#endif
|
|
106
|
+
}
|
|
383
107
|
#endif
|
|
384
108
|
|
|
109
|
+
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pcos)
|
|
110
|
+
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexp)
|
|
111
|
+
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexp2)
|
|
112
|
+
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexpm1)
|
|
113
|
+
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog)
|
|
114
|
+
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog1p)
|
|
115
|
+
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog2)
|
|
116
|
+
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, preciprocal)
|
|
117
|
+
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, prsqrt)
|
|
118
|
+
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psin)
|
|
119
|
+
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psqrt)
|
|
120
|
+
BF16_PACKET_FUNCTION(Packet16f, Packet16bf, ptanh)
|
|
121
|
+
|
|
122
|
+
#ifndef EIGEN_VECTORIZE_AVX512FP16
|
|
123
|
+
F16_PACKET_FUNCTION(Packet16f, Packet16h, pcos)
|
|
124
|
+
F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp)
|
|
125
|
+
F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp2)
|
|
126
|
+
F16_PACKET_FUNCTION(Packet16f, Packet16h, pexpm1)
|
|
127
|
+
F16_PACKET_FUNCTION(Packet16f, Packet16h, plog)
|
|
128
|
+
F16_PACKET_FUNCTION(Packet16f, Packet16h, plog1p)
|
|
129
|
+
F16_PACKET_FUNCTION(Packet16f, Packet16h, plog2)
|
|
130
|
+
F16_PACKET_FUNCTION(Packet16f, Packet16h, preciprocal)
|
|
131
|
+
F16_PACKET_FUNCTION(Packet16f, Packet16h, prsqrt)
|
|
132
|
+
F16_PACKET_FUNCTION(Packet16f, Packet16h, psin)
|
|
133
|
+
F16_PACKET_FUNCTION(Packet16f, Packet16h, psqrt)
|
|
134
|
+
F16_PACKET_FUNCTION(Packet16f, Packet16h, ptanh)
|
|
135
|
+
#endif // EIGEN_VECTORIZE_AVX512FP16
|
|
136
|
+
|
|
385
137
|
} // end namespace internal
|
|
386
138
|
|
|
387
139
|
} // end namespace Eigen
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
// This file is part of Eigen, a lightweight C++ template library
|
|
2
|
+
// for linear algebra.
|
|
3
|
+
//
|
|
4
|
+
// Copyright (C) 2025 The Eigen Authors.
|
|
5
|
+
//
|
|
6
|
+
// This Source Code Form is subject to the terms of the Mozilla
|
|
7
|
+
// Public License v. 2.0. If a copy of the MPL was not distributed
|
|
8
|
+
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
9
|
+
|
|
10
|
+
#ifndef EIGEN_MATH_FUNCTIONS_FP16_AVX512_H
|
|
11
|
+
#define EIGEN_MATH_FUNCTIONS_FP16_AVX512_H
|
|
12
|
+
|
|
13
|
+
// IWYU pragma: private
|
|
14
|
+
#include "../../InternalHeaderCheck.h"
|
|
15
|
+
|
|
16
|
+
namespace Eigen {
|
|
17
|
+
namespace internal {
|
|
18
|
+
|
|
19
|
+
EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) {
|
|
20
|
+
__m512i result = _mm512_castsi256_si512(_mm256_castph_si256(a));
|
|
21
|
+
result = _mm512_inserti64x4(result, _mm256_castph_si256(b), 1);
|
|
22
|
+
return _mm512_castsi512_ph(result);
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) {
|
|
26
|
+
a = _mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_castph_si512(x)));
|
|
27
|
+
b = _mm256_castsi256_ph(_mm512_extracti64x4_epi64(_mm512_castph_si512(x), 1));
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
#define _EIGEN_GENERATE_FP16_MATH_FUNCTION(func) \
|
|
31
|
+
template <> \
|
|
32
|
+
EIGEN_STRONG_INLINE Packet8h func<Packet8h>(const Packet8h& a) { \
|
|
33
|
+
return float2half(func(half2float(a))); \
|
|
34
|
+
} \
|
|
35
|
+
\
|
|
36
|
+
template <> \
|
|
37
|
+
EIGEN_STRONG_INLINE Packet16h func<Packet16h>(const Packet16h& a) { \
|
|
38
|
+
return float2half(func(half2float(a))); \
|
|
39
|
+
} \
|
|
40
|
+
\
|
|
41
|
+
template <> \
|
|
42
|
+
EIGEN_STRONG_INLINE Packet32h func<Packet32h>(const Packet32h& a) { \
|
|
43
|
+
Packet16h low; \
|
|
44
|
+
Packet16h high; \
|
|
45
|
+
extract2Packet16h(a, low, high); \
|
|
46
|
+
return combine2Packet16h(func(low), func(high)); \
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
_EIGEN_GENERATE_FP16_MATH_FUNCTION(psin)
|
|
50
|
+
_EIGEN_GENERATE_FP16_MATH_FUNCTION(pcos)
|
|
51
|
+
_EIGEN_GENERATE_FP16_MATH_FUNCTION(plog)
|
|
52
|
+
_EIGEN_GENERATE_FP16_MATH_FUNCTION(plog2)
|
|
53
|
+
_EIGEN_GENERATE_FP16_MATH_FUNCTION(plog1p)
|
|
54
|
+
_EIGEN_GENERATE_FP16_MATH_FUNCTION(pexp)
|
|
55
|
+
_EIGEN_GENERATE_FP16_MATH_FUNCTION(pexpm1)
|
|
56
|
+
_EIGEN_GENERATE_FP16_MATH_FUNCTION(pexp2)
|
|
57
|
+
_EIGEN_GENERATE_FP16_MATH_FUNCTION(ptanh)
|
|
58
|
+
#undef _EIGEN_GENERATE_FP16_MATH_FUNCTION
|
|
59
|
+
|
|
60
|
+
// pfrexp
|
|
61
|
+
template <>
|
|
62
|
+
EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) {
|
|
63
|
+
return pfrexp_generic(a, exponent);
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
// pldexp
|
|
67
|
+
template <>
|
|
68
|
+
EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) {
|
|
69
|
+
return pldexp_generic(a, exponent);
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
} // end namespace internal
|
|
73
|
+
} // end namespace Eigen
|
|
74
|
+
|
|
75
|
+
#endif // EIGEN_MATH_FUNCTIONS_FP16_AVX512_H
|