@smake/eigen 1.0.2 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/eigen/Eigen/AccelerateSupport +52 -0
- package/eigen/Eigen/Cholesky +18 -21
- package/eigen/Eigen/CholmodSupport +28 -28
- package/eigen/Eigen/Core +235 -326
- package/eigen/Eigen/Eigenvalues +16 -14
- package/eigen/Eigen/Geometry +21 -24
- package/eigen/Eigen/Householder +9 -8
- package/eigen/Eigen/IterativeLinearSolvers +8 -4
- package/eigen/Eigen/Jacobi +14 -14
- package/eigen/Eigen/KLUSupport +43 -0
- package/eigen/Eigen/LU +16 -20
- package/eigen/Eigen/MetisSupport +12 -12
- package/eigen/Eigen/OrderingMethods +54 -54
- package/eigen/Eigen/PaStiXSupport +23 -20
- package/eigen/Eigen/PardisoSupport +17 -14
- package/eigen/Eigen/QR +18 -21
- package/eigen/Eigen/QtAlignedMalloc +5 -13
- package/eigen/Eigen/SPQRSupport +21 -14
- package/eigen/Eigen/SVD +23 -18
- package/eigen/Eigen/Sparse +1 -4
- package/eigen/Eigen/SparseCholesky +18 -23
- package/eigen/Eigen/SparseCore +18 -17
- package/eigen/Eigen/SparseLU +12 -8
- package/eigen/Eigen/SparseQR +16 -14
- package/eigen/Eigen/StdDeque +5 -2
- package/eigen/Eigen/StdList +5 -2
- package/eigen/Eigen/StdVector +5 -2
- package/eigen/Eigen/SuperLUSupport +30 -24
- package/eigen/Eigen/ThreadPool +80 -0
- package/eigen/Eigen/UmfPackSupport +19 -17
- package/eigen/Eigen/Version +14 -0
- package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
- package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/LDLT.h +377 -401
- package/eigen/Eigen/src/Cholesky/LLT.h +332 -360
- package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
- package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +620 -521
- package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/ArithmeticSequence.h +239 -0
- package/eigen/Eigen/src/Core/Array.h +341 -294
- package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
- package/eigen/Eigen/src/Core/ArrayWrapper.h +127 -171
- package/eigen/Eigen/src/Core/Assign.h +30 -40
- package/eigen/Eigen/src/Core/AssignEvaluator.h +711 -589
- package/eigen/Eigen/src/Core/Assign_MKL.h +130 -125
- package/eigen/Eigen/src/Core/BandMatrix.h +268 -283
- package/eigen/Eigen/src/Core/Block.h +375 -398
- package/eigen/Eigen/src/Core/CommaInitializer.h +86 -97
- package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
- package/eigen/Eigen/src/Core/CoreEvaluators.h +1356 -1026
- package/eigen/Eigen/src/Core/CoreIterators.h +73 -59
- package/eigen/Eigen/src/Core/CwiseBinaryOp.h +114 -132
- package/eigen/Eigen/src/Core/CwiseNullaryOp.h +726 -617
- package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
- package/eigen/Eigen/src/Core/CwiseUnaryOp.h +56 -68
- package/eigen/Eigen/src/Core/CwiseUnaryView.h +132 -95
- package/eigen/Eigen/src/Core/DenseBase.h +632 -571
- package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -624
- package/eigen/Eigen/src/Core/DenseStorage.h +512 -509
- package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
- package/eigen/Eigen/src/Core/Diagonal.h +169 -210
- package/eigen/Eigen/src/Core/DiagonalMatrix.h +351 -274
- package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
- package/eigen/Eigen/src/Core/Dot.h +172 -222
- package/eigen/Eigen/src/Core/EigenBase.h +75 -85
- package/eigen/Eigen/src/Core/Fill.h +138 -0
- package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
- package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -109
- package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
- package/eigen/Eigen/src/Core/GeneralProduct.h +327 -263
- package/eigen/Eigen/src/Core/GenericPacketMath.h +1472 -360
- package/eigen/Eigen/src/Core/GlobalFunctions.h +194 -151
- package/eigen/Eigen/src/Core/IO.h +147 -139
- package/eigen/Eigen/src/Core/IndexedView.h +321 -0
- package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
- package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/Inverse.h +56 -66
- package/eigen/Eigen/src/Core/Map.h +124 -142
- package/eigen/Eigen/src/Core/MapBase.h +256 -281
- package/eigen/Eigen/src/Core/MathFunctions.h +1620 -938
- package/eigen/Eigen/src/Core/MathFunctionsImpl.h +233 -71
- package/eigen/Eigen/src/Core/Matrix.h +491 -416
- package/eigen/Eigen/src/Core/MatrixBase.h +468 -453
- package/eigen/Eigen/src/Core/NestByValue.h +66 -85
- package/eigen/Eigen/src/Core/NoAlias.h +79 -85
- package/eigen/Eigen/src/Core/NumTraits.h +235 -148
- package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +253 -0
- package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
- package/eigen/Eigen/src/Core/PlainObjectBase.h +871 -894
- package/eigen/Eigen/src/Core/Product.h +260 -139
- package/eigen/Eigen/src/Core/ProductEvaluators.h +863 -714
- package/eigen/Eigen/src/Core/Random.h +161 -136
- package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
- package/eigen/Eigen/src/Core/RealView.h +250 -0
- package/eigen/Eigen/src/Core/Redux.h +366 -336
- package/eigen/Eigen/src/Core/Ref.h +308 -209
- package/eigen/Eigen/src/Core/Replicate.h +94 -106
- package/eigen/Eigen/src/Core/Reshaped.h +398 -0
- package/eigen/Eigen/src/Core/ReturnByValue.h +49 -55
- package/eigen/Eigen/src/Core/Reverse.h +136 -145
- package/eigen/Eigen/src/Core/Select.h +70 -140
- package/eigen/Eigen/src/Core/SelfAdjointView.h +262 -285
- package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
- package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
- package/eigen/Eigen/src/Core/Solve.h +97 -111
- package/eigen/Eigen/src/Core/SolveTriangular.h +131 -129
- package/eigen/Eigen/src/Core/SolverBase.h +138 -101
- package/eigen/Eigen/src/Core/StableNorm.h +156 -160
- package/eigen/Eigen/src/Core/StlIterators.h +619 -0
- package/eigen/Eigen/src/Core/Stride.h +91 -88
- package/eigen/Eigen/src/Core/Swap.h +70 -38
- package/eigen/Eigen/src/Core/Transpose.h +295 -273
- package/eigen/Eigen/src/Core/Transpositions.h +272 -317
- package/eigen/Eigen/src/Core/TriangularMatrix.h +670 -755
- package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
- package/eigen/Eigen/src/Core/VectorwiseOp.h +668 -630
- package/eigen/Eigen/src/Core/Visitor.h +480 -216
- package/eigen/Eigen/src/Core/arch/AVX/Complex.h +407 -293
- package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +79 -388
- package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2935 -491
- package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
- package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +279 -22
- package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +472 -0
- package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +85 -333
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +2490 -649
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
- package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +277 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +521 -298
- package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +39 -280
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +3686 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +205 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +901 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +3391 -723
- package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
- package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +866 -0
- package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +113 -14
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +2634 -0
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +227 -0
- package/eigen/Eigen/src/Core/arch/Default/Half.h +1091 -0
- package/eigen/Eigen/src/Core/arch/Default/Settings.h +11 -13
- package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
- package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +104 -0
- package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1712 -0
- package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
- package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +77 -0
- package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
- package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
- package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
- package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
- package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
- package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
- package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
- package/eigen/Eigen/src/Core/arch/MSA/Complex.h +620 -0
- package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +379 -0
- package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1237 -0
- package/eigen/Eigen/src/Core/arch/NEON/Complex.h +531 -289
- package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +243 -0
- package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +50 -73
- package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +5915 -579
- package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1642 -0
- package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
- package/eigen/Eigen/src/Core/arch/SSE/Complex.h +366 -334
- package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +40 -514
- package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +2164 -675
- package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
- package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +188 -35
- package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +48 -0
- package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +674 -0
- package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +52 -0
- package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +227 -0
- package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +303 -0
- package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +576 -0
- package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +83 -0
- package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +434 -261
- package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +160 -53
- package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +1073 -605
- package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +123 -117
- package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +594 -322
- package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +204 -118
- package/eigen/Eigen/src/Core/functors/StlFunctors.h +110 -97
- package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
- package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1158 -530
- package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2329 -1333
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +328 -364
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +191 -178
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +85 -82
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +396 -542
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
- package/eigen/Eigen/src/Core/products/Parallelizer.h +208 -92
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +331 -375
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +139 -146
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
- package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
- package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -46
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -275
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
- package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +70 -93
- package/eigen/Eigen/src/Core/util/Assert.h +158 -0
- package/eigen/Eigen/src/Core/util/BlasUtil.h +413 -290
- package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +543 -0
- package/eigen/Eigen/src/Core/util/Constants.h +314 -263
- package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -78
- package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
- package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +450 -224
- package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
- package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
- package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +487 -0
- package/eigen/Eigen/src/Core/util/IntegralConstant.h +279 -0
- package/eigen/Eigen/src/Core/util/MKL_support.h +39 -30
- package/eigen/Eigen/src/Core/util/Macros.h +939 -646
- package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
- package/eigen/Eigen/src/Core/util/Memory.h +1042 -650
- package/eigen/Eigen/src/Core/util/Meta.h +618 -426
- package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
- package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
- package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
- package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
- package/eigen/Eigen/src/Core/util/StaticAssert.h +51 -164
- package/eigen/Eigen/src/Core/util/SymbolicIndex.h +445 -0
- package/eigen/Eigen/src/Core/util/XprHelper.h +793 -538
- package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
- package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
- package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
- package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
- package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
- package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +91 -107
- package/eigen/Eigen/src/Eigenvalues/RealQZ.h +539 -606
- package/eigen/Eigen/src/Eigenvalues/RealSchur.h +348 -382
- package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +579 -600
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
- package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +434 -461
- package/eigen/Eigen/src/Geometry/AlignedBox.h +307 -214
- package/eigen/Eigen/src/Geometry/AngleAxis.h +135 -137
- package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
- package/eigen/Eigen/src/Geometry/Homogeneous.h +289 -333
- package/eigen/Eigen/src/Geometry/Hyperplane.h +152 -161
- package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -145
- package/eigen/Eigen/src/Geometry/ParametrizedLine.h +141 -104
- package/eigen/Eigen/src/Geometry/Quaternion.h +595 -497
- package/eigen/Eigen/src/Geometry/Rotation2D.h +110 -108
- package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
- package/eigen/Eigen/src/Geometry/Scaling.h +115 -90
- package/eigen/Eigen/src/Geometry/Transform.h +896 -953
- package/eigen/Eigen/src/Geometry/Translation.h +100 -98
- package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
- package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +154 -0
- package/eigen/Eigen/src/Householder/BlockHouseholder.h +54 -42
- package/eigen/Eigen/src/Householder/Householder.h +104 -122
- package/eigen/Eigen/src/Householder/HouseholderSequence.h +416 -382
- package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +153 -166
- package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +127 -138
- package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +95 -124
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +269 -267
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +246 -259
- package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +218 -217
- package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +80 -103
- package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +59 -63
- package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Jacobi/Jacobi.h +256 -291
- package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/KLUSupport/KLUSupport.h +339 -0
- package/eigen/Eigen/src/LU/Determinant.h +60 -63
- package/eigen/Eigen/src/LU/FullPivLU.h +561 -626
- package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/LU/InverseImpl.h +213 -275
- package/eigen/Eigen/src/LU/PartialPivLU.h +407 -435
- package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
- package/eigen/Eigen/src/LU/arch/InverseSize4.h +353 -0
- package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
- package/eigen/Eigen/src/OrderingMethods/Amd.h +250 -282
- package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +950 -1103
- package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/OrderingMethods/Ordering.h +111 -122
- package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
- package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -429
- package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +494 -473
- package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
- package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +223 -137
- package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +517 -460
- package/eigen/Eigen/src/QR/HouseholderQR.h +412 -278
- package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
- package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +263 -261
- package/eigen/Eigen/src/SVD/BDCSVD.h +872 -679
- package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
- package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SVD/JacobiSVD.h +585 -543
- package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
- package/eigen/Eigen/src/SVD/SVDBase.h +281 -160
- package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +202 -237
- package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +769 -590
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +318 -129
- package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
- package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -236
- package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +140 -184
- package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCore/SparseAssign.h +174 -111
- package/eigen/Eigen/src/SparseCore/SparseBlock.h +408 -477
- package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
- package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +531 -280
- package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +559 -347
- package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
- package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +185 -191
- package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
- package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
- package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
- package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
- package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1614 -1142
- package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -357
- package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
- package/eigen/Eigen/src/SparseCore/SparseProduct.h +100 -91
- package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
- package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
- package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +371 -414
- package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
- package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
- package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
- package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
- package/eigen/Eigen/src/SparseCore/SparseUtil.h +146 -115
- package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
- package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
- package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
- package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseLU/SparseLU.h +814 -618
- package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
- package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
- package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
- package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +273 -255
- package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
- package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
- package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +90 -101
- package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
- package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
- package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +125 -133
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
- package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
- package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
- package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
- package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseQR/SparseQR.h +451 -490
- package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -105
- package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
- package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
- package/eigen/Eigen/src/StlSupport/details.h +48 -50
- package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -732
- package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
- package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
- package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
- package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
- package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
- package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
- package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
- package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
- package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
- package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
- package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
- package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
- package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +480 -380
- package/eigen/Eigen/src/misc/Image.h +41 -43
- package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/misc/Kernel.h +39 -41
- package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
- package/eigen/Eigen/src/misc/blas.h +83 -426
- package/eigen/Eigen/src/misc/lapacke.h +9976 -16182
- package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
- package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
- package/eigen/Eigen/src/plugins/BlockMethods.inc +1370 -0
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
- package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.inc +167 -0
- package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
- package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
- package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
- package/lib/LibEigen.d.ts +4 -0
- package/lib/LibEigen.js +14 -0
- package/lib/index.d.ts +1 -1
- package/lib/index.js +7 -3
- package/package.json +2 -10
- package/eigen/Eigen/CMakeLists.txt +0 -19
- package/eigen/Eigen/src/Core/BooleanRedux.h +0 -164
- package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -103
- package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
- package/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +0 -91
- package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
- package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
- package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
- package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
- package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
- package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
- package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
- package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
- package/eigen/Eigen/src/misc/lapack.h +0 -152
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -332
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -552
- package/eigen/Eigen/src/plugins/BlockMethods.h +0 -1058
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
- package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -163
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -85
- package/lib/eigen.d.ts +0 -2
- package/lib/eigen.js +0 -15
|
@@ -0,0 +1,1413 @@
|
|
|
1
|
+
// This file is part of Eigen, a lightweight C++ template library
|
|
2
|
+
// for linear algebra.
|
|
3
|
+
//
|
|
4
|
+
// Copyright (C) 2025 The Eigen Authors.
|
|
5
|
+
//
|
|
6
|
+
// This Source Code Form is subject to the terms of the Mozilla
|
|
7
|
+
// Public License v. 2.0. If a copy of the MPL was not distributed
|
|
8
|
+
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
9
|
+
|
|
10
|
+
#ifndef EIGEN_PACKET_MATH_FP16_AVX512_H
|
|
11
|
+
#define EIGEN_PACKET_MATH_FP16_AVX512_H
|
|
12
|
+
|
|
13
|
+
// IWYU pragma: private
|
|
14
|
+
#include "../../InternalHeaderCheck.h"
|
|
15
|
+
|
|
16
|
+
namespace Eigen {
|
|
17
|
+
|
|
18
|
+
namespace internal {
|
|
19
|
+
|
|
20
|
+
typedef __m512h Packet32h;
|
|
21
|
+
typedef __m256h Packet16h;
|
|
22
|
+
typedef __m128h Packet8h;
|
|
23
|
+
|
|
24
|
+
template <>
|
|
25
|
+
struct is_arithmetic<Packet8h> {
|
|
26
|
+
enum { value = true };
|
|
27
|
+
};
|
|
28
|
+
|
|
29
|
+
template <>
|
|
30
|
+
struct packet_traits<half> : default_packet_traits {
|
|
31
|
+
typedef Packet32h type;
|
|
32
|
+
typedef Packet16h half;
|
|
33
|
+
enum {
|
|
34
|
+
Vectorizable = 1,
|
|
35
|
+
AlignedOnScalar = 1,
|
|
36
|
+
size = 32,
|
|
37
|
+
|
|
38
|
+
HasCmp = 1,
|
|
39
|
+
HasAdd = 1,
|
|
40
|
+
HasSub = 1,
|
|
41
|
+
HasMul = 1,
|
|
42
|
+
HasDiv = 1,
|
|
43
|
+
HasNegate = 1,
|
|
44
|
+
HasAbs = 1,
|
|
45
|
+
HasAbs2 = 0,
|
|
46
|
+
HasMin = 1,
|
|
47
|
+
HasMax = 1,
|
|
48
|
+
HasConj = 1,
|
|
49
|
+
HasSetLinear = 0,
|
|
50
|
+
HasLog = 1,
|
|
51
|
+
HasLog1p = 1,
|
|
52
|
+
HasExp = 1,
|
|
53
|
+
HasExpm1 = 1,
|
|
54
|
+
HasSqrt = 1,
|
|
55
|
+
HasRsqrt = 1,
|
|
56
|
+
// These ones should be implemented in future
|
|
57
|
+
HasBessel = 0,
|
|
58
|
+
HasNdtri = 0,
|
|
59
|
+
HasSin = EIGEN_FAST_MATH,
|
|
60
|
+
HasCos = EIGEN_FAST_MATH,
|
|
61
|
+
HasTanh = EIGEN_FAST_MATH,
|
|
62
|
+
HasErf = 0, // EIGEN_FAST_MATH,
|
|
63
|
+
HasBlend = 0
|
|
64
|
+
};
|
|
65
|
+
};
|
|
66
|
+
|
|
67
|
+
template <>
|
|
68
|
+
struct unpacket_traits<Packet32h> {
|
|
69
|
+
typedef Eigen::half type;
|
|
70
|
+
typedef Packet16h half;
|
|
71
|
+
typedef Packet32s integer_packet;
|
|
72
|
+
enum {
|
|
73
|
+
size = 32,
|
|
74
|
+
alignment = Aligned64,
|
|
75
|
+
vectorizable = true,
|
|
76
|
+
masked_load_available = false,
|
|
77
|
+
masked_store_available = false
|
|
78
|
+
};
|
|
79
|
+
};
|
|
80
|
+
|
|
81
|
+
template <>
|
|
82
|
+
struct unpacket_traits<Packet16h> {
|
|
83
|
+
typedef Eigen::half type;
|
|
84
|
+
typedef Packet8h half;
|
|
85
|
+
typedef Packet16s integer_packet;
|
|
86
|
+
enum {
|
|
87
|
+
size = 16,
|
|
88
|
+
alignment = Aligned32,
|
|
89
|
+
vectorizable = true,
|
|
90
|
+
masked_load_available = false,
|
|
91
|
+
masked_store_available = false
|
|
92
|
+
};
|
|
93
|
+
};
|
|
94
|
+
|
|
95
|
+
template <>
|
|
96
|
+
struct unpacket_traits<Packet8h> {
|
|
97
|
+
typedef Eigen::half type;
|
|
98
|
+
typedef Packet8h half;
|
|
99
|
+
typedef Packet8s integer_packet;
|
|
100
|
+
enum {
|
|
101
|
+
size = 8,
|
|
102
|
+
alignment = Aligned16,
|
|
103
|
+
vectorizable = true,
|
|
104
|
+
masked_load_available = false,
|
|
105
|
+
masked_store_available = false
|
|
106
|
+
};
|
|
107
|
+
};
|
|
108
|
+
|
|
109
|
+
// Conversions
|
|
110
|
+
|
|
111
|
+
// Converts a Packet16h to a Packet16f with _mm512_cvtxph_ps.
EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {
  const Packet16f widened = _mm512_cvtxph_ps(a);
  return widened;
}
|
|
112
|
+
|
|
113
|
+
// Converts a Packet8h to a Packet8f with _mm256_cvtxph_ps.
EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
  const Packet8f widened = _mm256_cvtxph_ps(a);
  return widened;
}
|
|
114
|
+
|
|
115
|
+
// Converts a Packet16f to a Packet16h with _mm512_cvtxps_ph.
EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
  const Packet16h narrowed = _mm512_cvtxps_ph(a);
  return narrowed;
}
|
|
116
|
+
|
|
117
|
+
// Converts a Packet8f to a Packet8h with _mm256_cvtxps_ph.
EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
  const Packet8h narrowed = _mm256_cvtxps_ph(a);
  return narrowed;
}
|
|
118
|
+
|
|
119
|
+
// Memory functions
|
|
120
|
+
|
|
121
|
+
// pset1
|
|
122
|
+
|
|
123
|
+
template <>
|
|
124
|
+
EIGEN_STRONG_INLINE Packet32h pset1<Packet32h>(const Eigen::half& from) {
|
|
125
|
+
return _mm512_set1_ph(from.x);
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
template <>
|
|
129
|
+
EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
|
|
130
|
+
return _mm256_set1_ph(from.x);
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
template <>
|
|
134
|
+
EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
|
|
135
|
+
return _mm_set1_ph(from.x);
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
template <>
|
|
139
|
+
EIGEN_STRONG_INLINE Packet32h pzero(const Packet32h& /*a*/) {
|
|
140
|
+
return _mm512_setzero_ph();
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
template <>
|
|
144
|
+
EIGEN_STRONG_INLINE Packet16h pzero(const Packet16h& /*a*/) {
|
|
145
|
+
return _mm256_setzero_ph();
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
template <>
|
|
149
|
+
EIGEN_STRONG_INLINE Packet8h pzero(const Packet8h& /*a*/) {
|
|
150
|
+
return _mm_setzero_ph();
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
// pset1frombits
|
|
154
|
+
template <>
|
|
155
|
+
EIGEN_STRONG_INLINE Packet32h pset1frombits<Packet32h>(unsigned short from) {
|
|
156
|
+
return _mm512_castsi512_ph(_mm512_set1_epi16(from));
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
template <>
|
|
160
|
+
EIGEN_STRONG_INLINE Packet16h pset1frombits<Packet16h>(unsigned short from) {
|
|
161
|
+
return _mm256_castsi256_ph(_mm256_set1_epi16(from));
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
template <>
|
|
165
|
+
EIGEN_STRONG_INLINE Packet8h pset1frombits<Packet8h>(unsigned short from) {
|
|
166
|
+
return _mm_castsi128_ph(_mm_set1_epi16(from));
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
// pfirst
|
|
170
|
+
|
|
171
|
+
template <>
|
|
172
|
+
EIGEN_STRONG_INLINE Eigen::half pfirst<Packet32h>(const Packet32h& from) {
|
|
173
|
+
return Eigen::half(_mm512_cvtsh_h(from));
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
template <>
|
|
177
|
+
EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
|
|
178
|
+
return Eigen::half(_mm256_cvtsh_h(from));
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
template <>
|
|
182
|
+
EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
|
|
183
|
+
return Eigen::half(_mm_cvtsh_h(from));
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
// pload
|
|
187
|
+
|
|
188
|
+
template <>
|
|
189
|
+
EIGEN_STRONG_INLINE Packet32h pload<Packet32h>(const Eigen::half* from) {
|
|
190
|
+
EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from);
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
template <>
|
|
194
|
+
EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
|
|
195
|
+
EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ph(from);
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
template <>
|
|
199
|
+
EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
|
|
200
|
+
EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ph(from);
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
// ploadu
|
|
204
|
+
|
|
205
|
+
template <>
|
|
206
|
+
EIGEN_STRONG_INLINE Packet32h ploadu<Packet32h>(const Eigen::half* from) {
|
|
207
|
+
EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from);
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
template <>
|
|
211
|
+
EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
|
|
212
|
+
EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ph(from);
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
template <>
|
|
216
|
+
EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
|
|
217
|
+
EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ph(from);
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
// pstore
|
|
221
|
+
|
|
222
|
+
template <>
|
|
223
|
+
EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet32h& from) {
|
|
224
|
+
EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from);
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
template <>
|
|
228
|
+
EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
|
|
229
|
+
EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ph(to, from);
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
template <>
|
|
233
|
+
EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet8h& from) {
|
|
234
|
+
EIGEN_DEBUG_ALIGNED_STORE _mm_store_ph(to, from);
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
// pstoreu
|
|
238
|
+
|
|
239
|
+
template <>
|
|
240
|
+
EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet32h& from) {
|
|
241
|
+
EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from);
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
template <>
|
|
245
|
+
EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
|
|
246
|
+
EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ph(to, from);
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
template <>
|
|
250
|
+
EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet8h& from) {
|
|
251
|
+
EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ph(to, from);
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
// ploaddup
|
|
255
|
+
template <>
|
|
256
|
+
EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) {
|
|
257
|
+
__m512h a = _mm512_castph256_ph512(_mm256_loadu_ph(from));
|
|
258
|
+
return _mm512_permutexvar_ph(_mm512_set_epi16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6,
|
|
259
|
+
5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0),
|
|
260
|
+
a);
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
template <>
|
|
264
|
+
EIGEN_STRONG_INLINE Packet16h ploaddup<Packet16h>(const Eigen::half* from) {
|
|
265
|
+
__m256h a = _mm256_castph128_ph256(_mm_loadu_ph(from));
|
|
266
|
+
return _mm256_permutexvar_ph(_mm256_set_epi16(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), a);
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
template <>
|
|
270
|
+
EIGEN_STRONG_INLINE Packet8h ploaddup<Packet8h>(const Eigen::half* from) {
|
|
271
|
+
return _mm_set_ph(from[3].x, from[3].x, from[2].x, from[2].x, from[1].x, from[1].x, from[0].x, from[0].x);
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
// ploadquad
|
|
275
|
+
template <>
|
|
276
|
+
EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* from) {
|
|
277
|
+
__m512h a = _mm512_castph128_ph512(_mm_loadu_ph(from));
|
|
278
|
+
return _mm512_permutexvar_ph(
|
|
279
|
+
_mm512_set_epi16(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0),
|
|
280
|
+
a);
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
template <>
|
|
284
|
+
EIGEN_STRONG_INLINE Packet16h ploadquad<Packet16h>(const Eigen::half* from) {
|
|
285
|
+
return _mm256_set_ph(from[3].x, from[3].x, from[3].x, from[3].x, from[2].x, from[2].x, from[2].x, from[2].x,
|
|
286
|
+
from[1].x, from[1].x, from[1].x, from[1].x, from[0].x, from[0].x, from[0].x, from[0].x);
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
template <>
|
|
290
|
+
EIGEN_STRONG_INLINE Packet8h ploadquad<Packet8h>(const Eigen::half* from) {
|
|
291
|
+
return _mm_set_ph(from[1].x, from[1].x, from[1].x, from[1].x, from[0].x, from[0].x, from[0].x, from[0].x);
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
// pabs
|
|
295
|
+
|
|
296
|
+
template <>
|
|
297
|
+
EIGEN_STRONG_INLINE Packet32h pabs<Packet32h>(const Packet32h& a) {
|
|
298
|
+
return _mm512_abs_ph(a);
|
|
299
|
+
}
|
|
300
|
+
|
|
301
|
+
template <>
|
|
302
|
+
EIGEN_STRONG_INLINE Packet16h pabs<Packet16h>(const Packet16h& a) {
|
|
303
|
+
return _mm256_abs_ph(a);
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
template <>
|
|
307
|
+
EIGEN_STRONG_INLINE Packet8h pabs<Packet8h>(const Packet8h& a) {
|
|
308
|
+
return _mm_abs_ph(a);
|
|
309
|
+
}
|
|
310
|
+
|
|
311
|
+
// psignbit
|
|
312
|
+
|
|
313
|
+
template <>
|
|
314
|
+
EIGEN_STRONG_INLINE Packet32h psignbit<Packet32h>(const Packet32h& a) {
|
|
315
|
+
return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15));
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
template <>
|
|
319
|
+
EIGEN_STRONG_INLINE Packet16h psignbit<Packet16h>(const Packet16h& a) {
|
|
320
|
+
return _mm256_castsi256_ph(_mm256_srai_epi16(_mm256_castph_si256(a), 15));
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
template <>
|
|
324
|
+
EIGEN_STRONG_INLINE Packet8h psignbit<Packet8h>(const Packet8h& a) {
|
|
325
|
+
return _mm_castsi128_ph(_mm_srai_epi16(_mm_castph_si128(a), 15));
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
// pmin
|
|
329
|
+
|
|
330
|
+
template <>
|
|
331
|
+
EIGEN_STRONG_INLINE Packet32h pmin<Packet32h>(const Packet32h& a, const Packet32h& b) {
|
|
332
|
+
return _mm512_min_ph(a, b);
|
|
333
|
+
}
|
|
334
|
+
|
|
335
|
+
template <>
|
|
336
|
+
EIGEN_STRONG_INLINE Packet16h pmin<Packet16h>(const Packet16h& a, const Packet16h& b) {
|
|
337
|
+
return _mm256_min_ph(a, b);
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
template <>
|
|
341
|
+
EIGEN_STRONG_INLINE Packet8h pmin<Packet8h>(const Packet8h& a, const Packet8h& b) {
|
|
342
|
+
return _mm_min_ph(a, b);
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
// pmax
|
|
346
|
+
|
|
347
|
+
template <>
|
|
348
|
+
EIGEN_STRONG_INLINE Packet32h pmax<Packet32h>(const Packet32h& a, const Packet32h& b) {
|
|
349
|
+
return _mm512_max_ph(a, b);
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
template <>
|
|
353
|
+
EIGEN_STRONG_INLINE Packet16h pmax<Packet16h>(const Packet16h& a, const Packet16h& b) {
|
|
354
|
+
return _mm256_max_ph(a, b);
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
template <>
|
|
358
|
+
EIGEN_STRONG_INLINE Packet8h pmax<Packet8h>(const Packet8h& a, const Packet8h& b) {
|
|
359
|
+
return _mm_max_ph(a, b);
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
// plset
|
|
363
|
+
template <>
|
|
364
|
+
EIGEN_STRONG_INLINE Packet32h plset<Packet32h>(const half& a) {
|
|
365
|
+
return _mm512_add_ph(pset1<Packet32h>(a), _mm512_set_ph(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
|
|
366
|
+
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
template <>
|
|
370
|
+
EIGEN_STRONG_INLINE Packet16h plset<Packet16h>(const half& a) {
|
|
371
|
+
return _mm256_add_ph(pset1<Packet16h>(a), _mm256_set_ph(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
template <>
|
|
375
|
+
EIGEN_STRONG_INLINE Packet8h plset<Packet8h>(const half& a) {
|
|
376
|
+
return _mm_add_ph(pset1<Packet8h>(a), _mm_set_ph(7, 6, 5, 4, 3, 2, 1, 0));
|
|
377
|
+
}
|
|
378
|
+
|
|
379
|
+
// por
|
|
380
|
+
|
|
381
|
+
template <>
|
|
382
|
+
EIGEN_STRONG_INLINE Packet32h por(const Packet32h& a, const Packet32h& b) {
|
|
383
|
+
return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
template <>
|
|
387
|
+
EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a, const Packet16h& b) {
|
|
388
|
+
return _mm256_castsi256_ph(_mm256_or_si256(_mm256_castph_si256(a), _mm256_castph_si256(b)));
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
template <>
|
|
392
|
+
EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a, const Packet8h& b) {
|
|
393
|
+
return _mm_castsi128_ph(_mm_or_si128(_mm_castph_si128(a), _mm_castph_si128(b)));
|
|
394
|
+
}
|
|
395
|
+
|
|
396
|
+
// pxor
|
|
397
|
+
|
|
398
|
+
template <>
|
|
399
|
+
EIGEN_STRONG_INLINE Packet32h pxor(const Packet32h& a, const Packet32h& b) {
|
|
400
|
+
return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
|
|
401
|
+
}
|
|
402
|
+
|
|
403
|
+
template <>
|
|
404
|
+
EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a, const Packet16h& b) {
|
|
405
|
+
return _mm256_castsi256_ph(_mm256_xor_si256(_mm256_castph_si256(a), _mm256_castph_si256(b)));
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
template <>
|
|
409
|
+
EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a, const Packet8h& b) {
|
|
410
|
+
return _mm_castsi128_ph(_mm_xor_si128(_mm_castph_si128(a), _mm_castph_si128(b)));
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
// pand
|
|
414
|
+
|
|
415
|
+
template <>
|
|
416
|
+
EIGEN_STRONG_INLINE Packet32h pand(const Packet32h& a, const Packet32h& b) {
|
|
417
|
+
return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
|
|
418
|
+
}
|
|
419
|
+
|
|
420
|
+
template <>
|
|
421
|
+
EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a, const Packet16h& b) {
|
|
422
|
+
return _mm256_castsi256_ph(_mm256_and_si256(_mm256_castph_si256(a), _mm256_castph_si256(b)));
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
template <>
|
|
426
|
+
EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a, const Packet8h& b) {
|
|
427
|
+
return _mm_castsi128_ph(_mm_and_si128(_mm_castph_si128(a), _mm_castph_si128(b)));
|
|
428
|
+
}
|
|
429
|
+
|
|
430
|
+
// pandnot
|
|
431
|
+
|
|
432
|
+
template <>
|
|
433
|
+
EIGEN_STRONG_INLINE Packet32h pandnot(const Packet32h& a, const Packet32h& b) {
|
|
434
|
+
return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a)));
|
|
435
|
+
}
|
|
436
|
+
|
|
437
|
+
template <>
|
|
438
|
+
EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a, const Packet16h& b) {
|
|
439
|
+
return _mm256_castsi256_ph(_mm256_andnot_si256(_mm256_castph_si256(b), _mm256_castph_si256(a)));
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
template <>
|
|
443
|
+
EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a, const Packet8h& b) {
|
|
444
|
+
return _mm_castsi128_ph(_mm_andnot_si128(_mm_castph_si128(b), _mm_castph_si128(a)));
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
// pselect
|
|
448
|
+
|
|
449
|
+
template <>
|
|
450
|
+
EIGEN_DEVICE_FUNC inline Packet32h pselect(const Packet32h& mask, const Packet32h& a, const Packet32h& b) {
|
|
451
|
+
__mmask32 mask32 = _mm512_cmp_epi16_mask(_mm512_castph_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
|
|
452
|
+
return _mm512_mask_blend_ph(mask32, a, b);
|
|
453
|
+
}
|
|
454
|
+
|
|
455
|
+
template <>
|
|
456
|
+
EIGEN_DEVICE_FUNC inline Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) {
|
|
457
|
+
__mmask16 mask16 = _mm256_cmp_epi16_mask(_mm256_castph_si256(mask), _mm256_setzero_si256(), _MM_CMPINT_EQ);
|
|
458
|
+
return _mm256_mask_blend_ph(mask16, a, b);
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
template <>
|
|
462
|
+
EIGEN_DEVICE_FUNC inline Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) {
|
|
463
|
+
__mmask8 mask8 = _mm_cmp_epi16_mask(_mm_castph_si128(mask), _mm_setzero_si128(), _MM_CMPINT_EQ);
|
|
464
|
+
return _mm_mask_blend_ph(mask8, a, b);
|
|
465
|
+
}
|
|
466
|
+
|
|
467
|
+
// pcmp_eq
|
|
468
|
+
|
|
469
|
+
template <>
|
|
470
|
+
EIGEN_STRONG_INLINE Packet32h pcmp_eq(const Packet32h& a, const Packet32h& b) {
|
|
471
|
+
__mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ);
|
|
472
|
+
return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu)));
|
|
473
|
+
}
|
|
474
|
+
|
|
475
|
+
template <>
|
|
476
|
+
EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a, const Packet16h& b) {
|
|
477
|
+
__mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_EQ_OQ);
|
|
478
|
+
return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
|
|
479
|
+
}
|
|
480
|
+
|
|
481
|
+
template <>
|
|
482
|
+
EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a, const Packet8h& b) {
|
|
483
|
+
__mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_EQ_OQ);
|
|
484
|
+
return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
|
|
485
|
+
}
|
|
486
|
+
|
|
487
|
+
// pcmp_le
|
|
488
|
+
|
|
489
|
+
template <>
|
|
490
|
+
EIGEN_STRONG_INLINE Packet32h pcmp_le(const Packet32h& a, const Packet32h& b) {
|
|
491
|
+
__mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ);
|
|
492
|
+
return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu)));
|
|
493
|
+
}
|
|
494
|
+
|
|
495
|
+
template <>
|
|
496
|
+
EIGEN_STRONG_INLINE Packet16h pcmp_le(const Packet16h& a, const Packet16h& b) {
|
|
497
|
+
__mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_LE_OQ);
|
|
498
|
+
return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
|
|
499
|
+
}
|
|
500
|
+
|
|
501
|
+
template <>
|
|
502
|
+
EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a, const Packet8h& b) {
|
|
503
|
+
__mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_LE_OQ);
|
|
504
|
+
return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
|
|
505
|
+
}
|
|
506
|
+
|
|
507
|
+
// pcmp_lt
|
|
508
|
+
|
|
509
|
+
template <>
|
|
510
|
+
EIGEN_STRONG_INLINE Packet32h pcmp_lt(const Packet32h& a, const Packet32h& b) {
|
|
511
|
+
__mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ);
|
|
512
|
+
return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu)));
|
|
513
|
+
}
|
|
514
|
+
|
|
515
|
+
template <>
|
|
516
|
+
EIGEN_STRONG_INLINE Packet16h pcmp_lt(const Packet16h& a, const Packet16h& b) {
|
|
517
|
+
__mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_LT_OQ);
|
|
518
|
+
return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
|
|
519
|
+
}
|
|
520
|
+
|
|
521
|
+
template <>
|
|
522
|
+
EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a, const Packet8h& b) {
|
|
523
|
+
__mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_LT_OQ);
|
|
524
|
+
return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
|
|
525
|
+
}
|
|
526
|
+
|
|
527
|
+
// pcmp_lt_or_nan
|
|
528
|
+
|
|
529
|
+
template <>
|
|
530
|
+
EIGEN_STRONG_INLINE Packet32h pcmp_lt_or_nan(const Packet32h& a, const Packet32h& b) {
|
|
531
|
+
__mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ);
|
|
532
|
+
return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, static_cast<short>(0xffffu)));
|
|
533
|
+
}
|
|
534
|
+
|
|
535
|
+
template <>
|
|
536
|
+
EIGEN_STRONG_INLINE Packet16h pcmp_lt_or_nan(const Packet16h& a, const Packet16h& b) {
|
|
537
|
+
__mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_NGE_UQ);
|
|
538
|
+
return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
|
|
539
|
+
}
|
|
540
|
+
|
|
541
|
+
template <>
|
|
542
|
+
EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a, const Packet8h& b) {
|
|
543
|
+
__mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_NGE_UQ);
|
|
544
|
+
return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
|
|
545
|
+
}
|
|
546
|
+
|
|
547
|
+
// padd
|
|
548
|
+
|
|
549
|
+
template <>
|
|
550
|
+
EIGEN_STRONG_INLINE Packet32h padd<Packet32h>(const Packet32h& a, const Packet32h& b) {
|
|
551
|
+
return _mm512_add_ph(a, b);
|
|
552
|
+
}
|
|
553
|
+
|
|
554
|
+
template <>
|
|
555
|
+
EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
|
|
556
|
+
return _mm256_add_ph(a, b);
|
|
557
|
+
}
|
|
558
|
+
|
|
559
|
+
template <>
|
|
560
|
+
EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
|
|
561
|
+
return _mm_add_ph(a, b);
|
|
562
|
+
}
|
|
563
|
+
|
|
564
|
+
// psub
|
|
565
|
+
|
|
566
|
+
template <>
|
|
567
|
+
EIGEN_STRONG_INLINE Packet32h psub<Packet32h>(const Packet32h& a, const Packet32h& b) {
|
|
568
|
+
return _mm512_sub_ph(a, b);
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
template <>
|
|
572
|
+
EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
|
|
573
|
+
return _mm256_sub_ph(a, b);
|
|
574
|
+
}
|
|
575
|
+
|
|
576
|
+
template <>
|
|
577
|
+
EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
|
|
578
|
+
return _mm_sub_ph(a, b);
|
|
579
|
+
}
|
|
580
|
+
|
|
581
|
+
// pmul
|
|
582
|
+
|
|
583
|
+
template <>
|
|
584
|
+
EIGEN_STRONG_INLINE Packet32h pmul<Packet32h>(const Packet32h& a, const Packet32h& b) {
|
|
585
|
+
return _mm512_mul_ph(a, b);
|
|
586
|
+
}
|
|
587
|
+
|
|
588
|
+
template <>
|
|
589
|
+
EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
|
|
590
|
+
return _mm256_mul_ph(a, b);
|
|
591
|
+
}
|
|
592
|
+
|
|
593
|
+
template <>
|
|
594
|
+
EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
|
|
595
|
+
return _mm_mul_ph(a, b);
|
|
596
|
+
}
|
|
597
|
+
|
|
598
|
+
// pdiv
|
|
599
|
+
|
|
600
|
+
template <>
|
|
601
|
+
EIGEN_STRONG_INLINE Packet32h pdiv<Packet32h>(const Packet32h& a, const Packet32h& b) {
|
|
602
|
+
return _mm512_div_ph(a, b);
|
|
603
|
+
}
|
|
604
|
+
|
|
605
|
+
template <>
|
|
606
|
+
EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
|
|
607
|
+
return _mm256_div_ph(a, b);
|
|
608
|
+
}
|
|
609
|
+
|
|
610
|
+
template <>
|
|
611
|
+
EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
|
|
612
|
+
return _mm_div_ph(a, b);
|
|
613
|
+
;
|
|
614
|
+
}
|
|
615
|
+
|
|
616
|
+
// pround
|
|
617
|
+
|
|
618
|
+
template <>
|
|
619
|
+
EIGEN_STRONG_INLINE Packet32h pround<Packet32h>(const Packet32h& a) {
|
|
620
|
+
// Work-around for default std::round rounding mode.
|
|
621
|
+
|
|
622
|
+
// Mask for the sign bit.
|
|
623
|
+
const Packet32h signMask =
|
|
624
|
+
pset1frombits<Packet32h>(static_cast<numext::uint16_t>(static_cast<std::uint16_t>(0x8000u)));
|
|
625
|
+
// The largest half-precision float less than 0.5.
|
|
626
|
+
const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu));
|
|
627
|
+
|
|
628
|
+
return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
|
|
629
|
+
}
|
|
630
|
+
|
|
631
|
+
template <>
|
|
632
|
+
EIGEN_STRONG_INLINE Packet16h pround<Packet16h>(const Packet16h& a) {
|
|
633
|
+
// Work-around for default std::round rounding mode.
|
|
634
|
+
|
|
635
|
+
// Mask for the sign bit.
|
|
636
|
+
const Packet16h signMask =
|
|
637
|
+
pset1frombits<Packet16h>(static_cast<numext::uint16_t>(static_cast<std::uint16_t>(0x8000u)));
|
|
638
|
+
// The largest half-precision float less than 0.5.
|
|
639
|
+
const Packet16h prev0dot5 = pset1frombits<Packet16h>(static_cast<numext::uint16_t>(0x37FFu));
|
|
640
|
+
|
|
641
|
+
return _mm256_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
|
|
642
|
+
}
|
|
643
|
+
|
|
644
|
+
template <>
|
|
645
|
+
EIGEN_STRONG_INLINE Packet8h pround<Packet8h>(const Packet8h& a) {
|
|
646
|
+
// Work-around for default std::round rounding mode.
|
|
647
|
+
|
|
648
|
+
// Mask for the sign bit.
|
|
649
|
+
const Packet8h signMask = pset1frombits<Packet8h>(static_cast<numext::uint16_t>(static_cast<std::uint16_t>(0x8000u)));
|
|
650
|
+
// The largest half-precision float less than 0.5.
|
|
651
|
+
const Packet8h prev0dot5 = pset1frombits<Packet8h>(static_cast<numext::uint16_t>(0x37FFu));
|
|
652
|
+
|
|
653
|
+
return _mm_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
|
|
654
|
+
}
|
|
655
|
+
|
|
656
|
+
// print
|
|
657
|
+
|
|
658
|
+
template <>
|
|
659
|
+
EIGEN_STRONG_INLINE Packet32h print<Packet32h>(const Packet32h& a) {
|
|
660
|
+
return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
|
|
661
|
+
}
|
|
662
|
+
|
|
663
|
+
template <>
|
|
664
|
+
EIGEN_STRONG_INLINE Packet16h print<Packet16h>(const Packet16h& a) {
|
|
665
|
+
return _mm256_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
|
|
666
|
+
}
|
|
667
|
+
|
|
668
|
+
template <>
|
|
669
|
+
EIGEN_STRONG_INLINE Packet8h print<Packet8h>(const Packet8h& a) {
|
|
670
|
+
return _mm_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
|
|
671
|
+
}
|
|
672
|
+
|
|
673
|
+
// pceil
|
|
674
|
+
|
|
675
|
+
template <>
|
|
676
|
+
EIGEN_STRONG_INLINE Packet32h pceil<Packet32h>(const Packet32h& a) {
|
|
677
|
+
return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
|
|
678
|
+
}
|
|
679
|
+
|
|
680
|
+
template <>
|
|
681
|
+
EIGEN_STRONG_INLINE Packet16h pceil<Packet16h>(const Packet16h& a) {
|
|
682
|
+
return _mm256_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
|
|
683
|
+
}
|
|
684
|
+
|
|
685
|
+
template <>
|
|
686
|
+
EIGEN_STRONG_INLINE Packet8h pceil<Packet8h>(const Packet8h& a) {
|
|
687
|
+
return _mm_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
|
|
688
|
+
}
|
|
689
|
+
|
|
690
|
+
// pfloor
|
|
691
|
+
|
|
692
|
+
template <>
|
|
693
|
+
EIGEN_STRONG_INLINE Packet32h pfloor<Packet32h>(const Packet32h& a) {
|
|
694
|
+
return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
|
|
695
|
+
}
|
|
696
|
+
|
|
697
|
+
template <>
|
|
698
|
+
EIGEN_STRONG_INLINE Packet16h pfloor<Packet16h>(const Packet16h& a) {
|
|
699
|
+
return _mm256_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
|
|
700
|
+
}
|
|
701
|
+
|
|
702
|
+
template <>
|
|
703
|
+
EIGEN_STRONG_INLINE Packet8h pfloor<Packet8h>(const Packet8h& a) {
|
|
704
|
+
return _mm_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
|
|
705
|
+
}
|
|
706
|
+
|
|
707
|
+
// ptrunc
|
|
708
|
+
|
|
709
|
+
template <>
|
|
710
|
+
EIGEN_STRONG_INLINE Packet32h ptrunc<Packet32h>(const Packet32h& a) {
|
|
711
|
+
return _mm512_roundscale_ph(a, _MM_FROUND_TO_ZERO);
|
|
712
|
+
}
|
|
713
|
+
|
|
714
|
+
template <>
|
|
715
|
+
EIGEN_STRONG_INLINE Packet16h ptrunc<Packet16h>(const Packet16h& a) {
|
|
716
|
+
return _mm256_roundscale_ph(a, _MM_FROUND_TO_ZERO);
|
|
717
|
+
}
|
|
718
|
+
|
|
719
|
+
template <>
|
|
720
|
+
EIGEN_STRONG_INLINE Packet8h ptrunc<Packet8h>(const Packet8h& a) {
|
|
721
|
+
return _mm_roundscale_ph(a, _MM_FROUND_TO_ZERO);
|
|
722
|
+
}
|
|
723
|
+
|
|
724
|
+
// predux
|
|
725
|
+
template <>
|
|
726
|
+
EIGEN_STRONG_INLINE half predux<Packet32h>(const Packet32h& a) {
|
|
727
|
+
return half(_mm512_reduce_add_ph(a));
|
|
728
|
+
}
|
|
729
|
+
|
|
730
|
+
template <>
|
|
731
|
+
EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& a) {
|
|
732
|
+
return half(_mm256_reduce_add_ph(a));
|
|
733
|
+
}
|
|
734
|
+
|
|
735
|
+
template <>
|
|
736
|
+
EIGEN_STRONG_INLINE half predux<Packet8h>(const Packet8h& a) {
|
|
737
|
+
return half(_mm_reduce_add_ph(a));
|
|
738
|
+
}
|
|
739
|
+
|
|
740
|
+
// predux_half_dowto4
|
|
741
|
+
template <>
|
|
742
|
+
EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) {
|
|
743
|
+
const __m512i bits = _mm512_castph_si512(a);
|
|
744
|
+
Packet16h lo = _mm256_castsi256_ph(_mm512_castsi512_si256(bits));
|
|
745
|
+
Packet16h hi = _mm256_castsi256_ph(_mm512_extracti64x4_epi64(bits, 1));
|
|
746
|
+
return padd(lo, hi);
|
|
747
|
+
}
|
|
748
|
+
|
|
749
|
+
template <>
|
|
750
|
+
EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
|
|
751
|
+
Packet8h lo = _mm_castsi128_ph(_mm256_castsi256_si128(_mm256_castph_si256(a)));
|
|
752
|
+
Packet8h hi = _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(a), 1));
|
|
753
|
+
return padd(lo, hi);
|
|
754
|
+
}
|
|
755
|
+
|
|
756
|
+
// predux_max
|
|
757
|
+
|
|
758
|
+
template <>
|
|
759
|
+
EIGEN_STRONG_INLINE half predux_max<Packet32h>(const Packet32h& a) {
|
|
760
|
+
return half(_mm512_reduce_max_ph(a));
|
|
761
|
+
}
|
|
762
|
+
|
|
763
|
+
template <>
|
|
764
|
+
EIGEN_STRONG_INLINE half predux_max<Packet16h>(const Packet16h& a) {
|
|
765
|
+
return half(_mm256_reduce_max_ph(a));
|
|
766
|
+
}
|
|
767
|
+
|
|
768
|
+
template <>
|
|
769
|
+
EIGEN_STRONG_INLINE half predux_max<Packet8h>(const Packet8h& a) {
|
|
770
|
+
return half(_mm_reduce_max_ph(a));
|
|
771
|
+
}
|
|
772
|
+
|
|
773
|
+
// predux_min
|
|
774
|
+
|
|
775
|
+
template <>
|
|
776
|
+
EIGEN_STRONG_INLINE half predux_min<Packet32h>(const Packet32h& a) {
|
|
777
|
+
return half(_mm512_reduce_min_ph(a));
|
|
778
|
+
}
|
|
779
|
+
|
|
780
|
+
template <>
|
|
781
|
+
EIGEN_STRONG_INLINE half predux_min<Packet16h>(const Packet16h& a) {
|
|
782
|
+
return half(_mm256_reduce_min_ph(a));
|
|
783
|
+
}
|
|
784
|
+
|
|
785
|
+
template <>
|
|
786
|
+
EIGEN_STRONG_INLINE half predux_min<Packet8h>(const Packet8h& a) {
|
|
787
|
+
return half(_mm_reduce_min_ph(a));
|
|
788
|
+
}
|
|
789
|
+
|
|
790
|
+
// predux_mul
|
|
791
|
+
|
|
792
|
+
template <>
|
|
793
|
+
EIGEN_STRONG_INLINE half predux_mul<Packet32h>(const Packet32h& a) {
|
|
794
|
+
return half(_mm512_reduce_mul_ph(a));
|
|
795
|
+
}
|
|
796
|
+
|
|
797
|
+
template <>
|
|
798
|
+
EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& a) {
|
|
799
|
+
return half(_mm256_reduce_mul_ph(a));
|
|
800
|
+
}
|
|
801
|
+
|
|
802
|
+
template <>
|
|
803
|
+
EIGEN_STRONG_INLINE half predux_mul<Packet8h>(const Packet8h& a) {
|
|
804
|
+
return half(_mm_reduce_mul_ph(a));
|
|
805
|
+
}
|
|
806
|
+
|
|
807
|
+
#ifdef EIGEN_VECTORIZE_FMA
|
|
808
|
+
|
|
809
|
+
// pmadd
|
|
810
|
+
|
|
811
|
+
template <>
|
|
812
|
+
EIGEN_STRONG_INLINE Packet32h pmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
|
|
813
|
+
return _mm512_fmadd_ph(a, b, c);
|
|
814
|
+
}
|
|
815
|
+
|
|
816
|
+
template <>
|
|
817
|
+
EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
|
|
818
|
+
return _mm256_fmadd_ph(a, b, c);
|
|
819
|
+
}
|
|
820
|
+
|
|
821
|
+
template <>
|
|
822
|
+
EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
|
|
823
|
+
return _mm_fmadd_ph(a, b, c);
|
|
824
|
+
}
|
|
825
|
+
|
|
826
|
+
// pmsub
|
|
827
|
+
|
|
828
|
+
template <>
|
|
829
|
+
EIGEN_STRONG_INLINE Packet32h pmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
|
|
830
|
+
return _mm512_fmsub_ph(a, b, c);
|
|
831
|
+
}
|
|
832
|
+
|
|
833
|
+
template <>
|
|
834
|
+
EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
|
|
835
|
+
return _mm256_fmsub_ph(a, b, c);
|
|
836
|
+
}
|
|
837
|
+
|
|
838
|
+
template <>
|
|
839
|
+
EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
|
|
840
|
+
return _mm_fmsub_ph(a, b, c);
|
|
841
|
+
}
|
|
842
|
+
|
|
843
|
+
// pnmadd
|
|
844
|
+
|
|
845
|
+
template <>
|
|
846
|
+
EIGEN_STRONG_INLINE Packet32h pnmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
|
|
847
|
+
return _mm512_fnmadd_ph(a, b, c);
|
|
848
|
+
}
|
|
849
|
+
|
|
850
|
+
template <>
|
|
851
|
+
EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
|
|
852
|
+
return _mm256_fnmadd_ph(a, b, c);
|
|
853
|
+
}
|
|
854
|
+
|
|
855
|
+
template <>
|
|
856
|
+
EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
|
|
857
|
+
return _mm_fnmadd_ph(a, b, c);
|
|
858
|
+
}
|
|
859
|
+
|
|
860
|
+
// pnmsub
|
|
861
|
+
|
|
862
|
+
template <>
|
|
863
|
+
EIGEN_STRONG_INLINE Packet32h pnmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
|
|
864
|
+
return _mm512_fnmsub_ph(a, b, c);
|
|
865
|
+
}
|
|
866
|
+
|
|
867
|
+
template <>
|
|
868
|
+
EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
|
|
869
|
+
return _mm256_fnmsub_ph(a, b, c);
|
|
870
|
+
}
|
|
871
|
+
|
|
872
|
+
template <>
|
|
873
|
+
EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
|
|
874
|
+
return _mm_fnmsub_ph(a, b, c);
|
|
875
|
+
}
|
|
876
|
+
|
|
877
|
+
#endif
|
|
878
|
+
|
|
879
|
+
// pnegate
|
|
880
|
+
|
|
881
|
+
template <>
|
|
882
|
+
EIGEN_STRONG_INLINE Packet32h pnegate<Packet32h>(const Packet32h& a) {
|
|
883
|
+
return _mm512_castsi512_ph(
|
|
884
|
+
_mm512_xor_si512(_mm512_castph_si512(a), _mm512_set1_epi16(static_cast<std::uint16_t>(0x8000u))));
|
|
885
|
+
}
|
|
886
|
+
|
|
887
|
+
template <>
|
|
888
|
+
EIGEN_STRONG_INLINE Packet16h pnegate<Packet16h>(const Packet16h& a) {
|
|
889
|
+
return _mm256_castsi256_ph(
|
|
890
|
+
_mm256_xor_si256(_mm256_castph_si256(a), _mm256_set1_epi16(static_cast<std::uint16_t>(0x8000u))));
|
|
891
|
+
}
|
|
892
|
+
|
|
893
|
+
template <>
|
|
894
|
+
EIGEN_STRONG_INLINE Packet8h pnegate<Packet8h>(const Packet8h& a) {
|
|
895
|
+
return _mm_castsi128_ph(_mm_xor_si128(_mm_castph_si128(a), _mm_set1_epi16(static_cast<std::uint16_t>(0x8000u))));
|
|
896
|
+
}
|
|
897
|
+
|
|
898
|
+
// pconj
|
|
899
|
+
|
|
900
|
+
// Nothing, packets are real.
|
|
901
|
+
|
|
902
|
+
// psqrt
|
|
903
|
+
|
|
904
|
+
template <>
|
|
905
|
+
EIGEN_STRONG_INLINE Packet32h psqrt<Packet32h>(const Packet32h& a) {
|
|
906
|
+
return generic_sqrt_newton_step<Packet32h>::run(a, _mm512_rsqrt_ph(a));
|
|
907
|
+
}
|
|
908
|
+
|
|
909
|
+
template <>
|
|
910
|
+
EIGEN_STRONG_INLINE Packet16h psqrt<Packet16h>(const Packet16h& a) {
|
|
911
|
+
return generic_sqrt_newton_step<Packet16h>::run(a, _mm256_rsqrt_ph(a));
|
|
912
|
+
}
|
|
913
|
+
|
|
914
|
+
template <>
|
|
915
|
+
EIGEN_STRONG_INLINE Packet8h psqrt<Packet8h>(const Packet8h& a) {
|
|
916
|
+
return generic_sqrt_newton_step<Packet8h>::run(a, _mm_rsqrt_ph(a));
|
|
917
|
+
}
|
|
918
|
+
|
|
919
|
+
// prsqrt
|
|
920
|
+
|
|
921
|
+
template <>
|
|
922
|
+
EIGEN_STRONG_INLINE Packet32h prsqrt<Packet32h>(const Packet32h& a) {
|
|
923
|
+
return generic_rsqrt_newton_step<Packet32h, /*Steps=*/1>::run(a, _mm512_rsqrt_ph(a));
|
|
924
|
+
}
|
|
925
|
+
|
|
926
|
+
template <>
|
|
927
|
+
EIGEN_STRONG_INLINE Packet16h prsqrt<Packet16h>(const Packet16h& a) {
|
|
928
|
+
return generic_rsqrt_newton_step<Packet16h, /*Steps=*/1>::run(a, _mm256_rsqrt_ph(a));
|
|
929
|
+
}
|
|
930
|
+
|
|
931
|
+
template <>
|
|
932
|
+
EIGEN_STRONG_INLINE Packet8h prsqrt<Packet8h>(const Packet8h& a) {
|
|
933
|
+
return generic_rsqrt_newton_step<Packet8h, /*Steps=*/1>::run(a, _mm_rsqrt_ph(a));
|
|
934
|
+
}
|
|
935
|
+
|
|
936
|
+
// preciprocal
|
|
937
|
+
|
|
938
|
+
template <>
|
|
939
|
+
EIGEN_STRONG_INLINE Packet32h preciprocal<Packet32h>(const Packet32h& a) {
|
|
940
|
+
return generic_reciprocal_newton_step<Packet32h, /*Steps=*/1>::run(a, _mm512_rcp_ph(a));
|
|
941
|
+
}
|
|
942
|
+
|
|
943
|
+
template <>
|
|
944
|
+
EIGEN_STRONG_INLINE Packet16h preciprocal<Packet16h>(const Packet16h& a) {
|
|
945
|
+
return generic_reciprocal_newton_step<Packet16h, /*Steps=*/1>::run(a, _mm256_rcp_ph(a));
|
|
946
|
+
}
|
|
947
|
+
|
|
948
|
+
template <>
|
|
949
|
+
EIGEN_STRONG_INLINE Packet8h preciprocal<Packet8h>(const Packet8h& a) {
|
|
950
|
+
return generic_reciprocal_newton_step<Packet8h, /*Steps=*/1>::run(a, _mm_rcp_ph(a));
|
|
951
|
+
}
|
|
952
|
+
|
|
953
|
+
// ptranspose
|
|
954
|
+
|
|
955
|
+
// Transposes, in place, the 32x32 block of half-precision values stored as 32 Packet32h rows in `a`.
//
// The unpack instructions operate independently inside each 128-bit lane, so the work is split in two:
//   * three interleave stages (16-, 32- and 64-bit) after which q[8 * X + k] holds, in lane Y,
//     column 8 * Y + k of rows 8 * X .. 8 * X + 7;
//   * a lane exchange that places that 128-bit chunk into lane X of f[8 * Y + k], i.e. output row 8 * Y + k.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 32>& a) {
  // Stage 1: interleave the 16-bit elements of each pair of adjacent rows.
  __m512i t[32];

  EIGEN_UNROLL_LOOP
  for (int i = 0; i < 16; i++) {
    t[2 * i] = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
    t[2 * i + 1] =
        _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
  }

  // Stage 2: interleave 32-bit pairs, combining groups of four rows.
  __m512i p[32];

  EIGEN_UNROLL_LOOP
  for (int i = 0; i < 8; i++) {
    p[4 * i] = _mm512_unpacklo_epi32(t[4 * i], t[4 * i + 2]);
    p[4 * i + 1] = _mm512_unpackhi_epi32(t[4 * i], t[4 * i + 2]);
    p[4 * i + 2] = _mm512_unpacklo_epi32(t[4 * i + 1], t[4 * i + 3]);
    p[4 * i + 3] = _mm512_unpackhi_epi32(t[4 * i + 1], t[4 * i + 3]);
  }

  // Stage 3: interleave 64-bit quads, combining groups of eight rows.
  __m512i q[32];

  EIGEN_UNROLL_LOOP
  for (int i = 0; i < 4; i++) {
    q[8 * i] = _mm512_unpacklo_epi64(p[8 * i], p[8 * i + 4]);
    q[8 * i + 1] = _mm512_unpackhi_epi64(p[8 * i], p[8 * i + 4]);
    q[8 * i + 2] = _mm512_unpacklo_epi64(p[8 * i + 1], p[8 * i + 5]);
    q[8 * i + 3] = _mm512_unpackhi_epi64(p[8 * i + 1], p[8 * i + 5]);
    q[8 * i + 4] = _mm512_unpacklo_epi64(p[8 * i + 2], p[8 * i + 6]);
    q[8 * i + 5] = _mm512_unpackhi_epi64(p[8 * i + 2], p[8 * i + 6]);
    q[8 * i + 6] = _mm512_unpacklo_epi64(p[8 * i + 3], p[8 * i + 7]);
    q[8 * i + 7] = _mm512_unpackhi_epi64(p[8 * i + 3], p[8 * i + 7]);
  }

  // Stage 4: exchange 128-bit lanes. Each output starts as a copy of the q register with the same index:
  // its diagonal lane (X == Y) is then already correct and the three remaining lanes are overwritten by the
  // helper calls below. Seeding from q also guarantees f is never read before it has been written.
  __m512i f[32];

  EIGEN_UNROLL_LOOP
  for (int i = 0; i < 32; i++) {
    f[i] = q[i];
  }

// Copies lane Y of q[8 * X + k] into lane X of f[8 * Y + k] for k = 0..7. X and Y must be integer literals
// because the insert/extract intrinsics take immediate lane indices.
#define PACKET32H_TRANSPOSE_HELPER(X, Y)                                                            \
  do {                                                                                              \
    f[Y * 8] = _mm512_inserti32x4(f[Y * 8], _mm512_extracti32x4_epi32(q[X * 8], Y), X);             \
    f[Y * 8 + 1] = _mm512_inserti32x4(f[Y * 8 + 1], _mm512_extracti32x4_epi32(q[X * 8 + 1], Y), X); \
    f[Y * 8 + 2] = _mm512_inserti32x4(f[Y * 8 + 2], _mm512_extracti32x4_epi32(q[X * 8 + 2], Y), X); \
    f[Y * 8 + 3] = _mm512_inserti32x4(f[Y * 8 + 3], _mm512_extracti32x4_epi32(q[X * 8 + 3], Y), X); \
    f[Y * 8 + 4] = _mm512_inserti32x4(f[Y * 8 + 4], _mm512_extracti32x4_epi32(q[X * 8 + 4], Y), X); \
    f[Y * 8 + 5] = _mm512_inserti32x4(f[Y * 8 + 5], _mm512_extracti32x4_epi32(q[X * 8 + 5], Y), X); \
    f[Y * 8 + 6] = _mm512_inserti32x4(f[Y * 8 + 6], _mm512_extracti32x4_epi32(q[X * 8 + 6], Y), X); \
    f[Y * 8 + 7] = _mm512_inserti32x4(f[Y * 8 + 7], _mm512_extracti32x4_epi32(q[X * 8 + 7], Y), X); \
  } while (false)

  // Lanes below the diagonal.
  PACKET32H_TRANSPOSE_HELPER(1, 0);
  PACKET32H_TRANSPOSE_HELPER(2, 0);
  PACKET32H_TRANSPOSE_HELPER(3, 0);
  PACKET32H_TRANSPOSE_HELPER(2, 1);
  PACKET32H_TRANSPOSE_HELPER(3, 1);
  PACKET32H_TRANSPOSE_HELPER(3, 2);

  // Lanes above the diagonal.
  PACKET32H_TRANSPOSE_HELPER(0, 1);
  PACKET32H_TRANSPOSE_HELPER(0, 2);
  PACKET32H_TRANSPOSE_HELPER(0, 3);
  PACKET32H_TRANSPOSE_HELPER(1, 2);
  PACKET32H_TRANSPOSE_HELPER(1, 3);
  PACKET32H_TRANSPOSE_HELPER(2, 3);

#undef PACKET32H_TRANSPOSE_HELPER

  EIGEN_UNROLL_LOOP
  for (int i = 0; i < 32; i++) {
    a.packet[i] = _mm512_castsi512_ph(f[i]);
  }
}
|
|
1029
|
+
|
|
1030
|
+
// Transposes, in place, the four Packet32h rows of `a`.
//
// After the 16-bit and 32-bit interleaves, lane L of col_pair[j] holds columns 8 * L + 2 * j and
// 8 * L + 2 * j + 1 of the four rows. Output packet i is then assembled from lane i of the four
// col_pair registers, placed in lanes 0..3 in that order.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 4>& a) {
  const __m512i row0 = _mm512_castph_si512(a.packet[0]);
  const __m512i row1 = _mm512_castph_si512(a.packet[1]);
  const __m512i row2 = _mm512_castph_si512(a.packet[2]);
  const __m512i row3 = _mm512_castph_si512(a.packet[3]);

  // 16-bit interleave of rows (0, 1) and rows (2, 3).
  const __m512i lo01 = _mm512_unpacklo_epi16(row0, row1);
  const __m512i hi01 = _mm512_unpackhi_epi16(row0, row1);
  const __m512i lo23 = _mm512_unpacklo_epi16(row2, row3);
  const __m512i hi23 = _mm512_unpackhi_epi16(row2, row3);

  // 32-bit interleave: every 64-bit group now carries one column of all four rows.
  const __m512i col_pair0 = _mm512_unpacklo_epi32(lo01, lo23);
  const __m512i col_pair1 = _mm512_unpackhi_epi32(lo01, lo23);
  const __m512i col_pair2 = _mm512_unpacklo_epi32(hi01, hi23);
  const __m512i col_pair3 = _mm512_unpackhi_epi32(hi01, hi23);

  // Output i starts from col_pair<i> (lane i already in place) and receives lane i of the other three.
  __m512i out0 = col_pair0;
  out0 = _mm512_inserti32x4(out0, _mm512_extracti32x4_epi32(col_pair1, 0), 1);
  out0 = _mm512_inserti32x4(out0, _mm512_extracti32x4_epi32(col_pair2, 0), 2);
  out0 = _mm512_inserti32x4(out0, _mm512_extracti32x4_epi32(col_pair3, 0), 3);

  __m512i out1 = col_pair1;
  out1 = _mm512_inserti32x4(out1, _mm512_extracti32x4_epi32(col_pair0, 1), 0);
  out1 = _mm512_inserti32x4(out1, _mm512_extracti32x4_epi32(col_pair2, 1), 2);
  out1 = _mm512_inserti32x4(out1, _mm512_extracti32x4_epi32(col_pair3, 1), 3);

  __m512i out2 = col_pair2;
  out2 = _mm512_inserti32x4(out2, _mm512_extracti32x4_epi32(col_pair0, 2), 0);
  out2 = _mm512_inserti32x4(out2, _mm512_extracti32x4_epi32(col_pair1, 2), 1);
  out2 = _mm512_inserti32x4(out2, _mm512_extracti32x4_epi32(col_pair3, 2), 3);

  __m512i out3 = col_pair3;
  out3 = _mm512_inserti32x4(out3, _mm512_extracti32x4_epi32(col_pair0, 3), 0);
  out3 = _mm512_inserti32x4(out3, _mm512_extracti32x4_epi32(col_pair1, 3), 1);
  out3 = _mm512_inserti32x4(out3, _mm512_extracti32x4_epi32(col_pair2, 3), 2);

  a.packet[0] = _mm512_castsi512_ph(out0);
  a.packet[1] = _mm512_castsi512_ph(out1);
  a.packet[2] = _mm512_castsi512_ph(out2);
  a.packet[3] = _mm512_castsi512_ph(out3);
}
|
|
1070
|
+
|
|
1071
|
+
// Transposes, in place, the 16x16 block of half-precision values stored as 16 Packet16h rows in `kernel`.
//
// Three unpack stages (16-, 32- and 64-bit, each acting within a 128-bit lane) are followed by a
// 128-bit lane permute that joins the halves coming from rows 0..7 and rows 8..15.
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 16>& kernel) {
  __m256i row[16];
  EIGEN_UNROLL_LOOP
  for (int r = 0; r < 16; ++r) {
    row[r] = _mm256_castph_si256(kernel.packet[r]);
  }

  // Stage 1: for each pair of adjacent rows, even slot = low interleave, odd slot = high interleave.
  __m256i w16[16];
  EIGEN_UNROLL_LOOP
  for (int pair = 0; pair < 8; ++pair) {
    w16[2 * pair] = _mm256_unpacklo_epi16(row[2 * pair], row[2 * pair + 1]);
    w16[2 * pair + 1] = _mm256_unpackhi_epi16(row[2 * pair], row[2 * pair + 1]);
  }

  // Stage 2: combine two row pairs into a group of four rows.
  __m256i w32[16];
  EIGEN_UNROLL_LOOP
  for (int quad = 0; quad < 4; ++quad) {
    w32[4 * quad] = _mm256_unpacklo_epi32(w16[4 * quad], w16[4 * quad + 2]);
    w32[4 * quad + 1] = _mm256_unpackhi_epi32(w16[4 * quad], w16[4 * quad + 2]);
    w32[4 * quad + 2] = _mm256_unpacklo_epi32(w16[4 * quad + 1], w16[4 * quad + 3]);
    w32[4 * quad + 3] = _mm256_unpackhi_epi32(w16[4 * quad + 1], w16[4 * quad + 3]);
  }

  // Stage 3: combine two groups of four rows into a group of eight rows.
  // w64[8 * half_idx + k] carries, per 128-bit lane, one column of rows 8 * half_idx .. 8 * half_idx + 7.
  __m256i w64[16];
  EIGEN_UNROLL_LOOP
  for (int half_idx = 0; half_idx < 2; ++half_idx) {
    EIGEN_UNROLL_LOOP
    for (int c = 0; c < 4; ++c) {
      w64[8 * half_idx + 2 * c] = _mm256_unpacklo_epi64(w32[8 * half_idx + c], w32[8 * half_idx + 4 + c]);
      w64[8 * half_idx + 2 * c + 1] = _mm256_unpackhi_epi64(w32[8 * half_idx + c], w32[8 * half_idx + 4 + c]);
    }
  }

  // Stage 4: there is no 128-bit unpack, so a lane permute is used instead. Selector 0x20 joins the low
  // lanes of both operands (output rows 0..7); selector 0x31 joins their high lanes (output rows 8..15).
  EIGEN_UNROLL_LOOP
  for (int k = 0; k < 8; ++k) {
    kernel.packet[k] = _mm256_castsi256_ph(_mm256_permute2x128_si256(w64[k], w64[8 + k], 0x20));
    kernel.packet[8 + k] = _mm256_castsi256_ph(_mm256_permute2x128_si256(w64[k], w64[8 + k], 0x31));
  }
}
|
|
1177
|
+
|
|
1178
|
+
// Transposes the eight Packet16h rows of `kernel` through scalar buffers.
// Output packet i holds element 2 * i of each of the eight input packets in its first eight lanes,
// followed by element 2 * i + 1 of each input packet in its last eight lanes.
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 8>& kernel) {
  EIGEN_ALIGN64 half in[8][16];
  for (int r = 0; r < 8; ++r) {
    pstore<half>(in[r], kernel.packet[r]);
  }

  EIGEN_ALIGN64 half out[8][16];

  for (int dst = 0; dst < 8; ++dst) {
    for (int src = 0; src < 8; ++src) {
      out[dst][src] = in[src][2 * dst];
      out[dst][src + 8] = in[src][2 * dst + 1];
    }
  }

  for (int r = 0; r < 8; ++r) {
    kernel.packet[r] = pload<Packet16h>(out[r]);
  }
}
|
|
1209
|
+
|
|
1210
|
+
// Transposes the four Packet16h rows of `kernel` through scalar buffers.
// Output packet i is made of four groups of four lanes; group c holds element 4 * i + c of each of the
// four input packets.
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 4>& kernel) {
  EIGEN_ALIGN64 half in[4][16];
  for (int r = 0; r < 4; ++r) {
    pstore<half>(in[r], kernel.packet[r]);
  }

  EIGEN_ALIGN64 half out[4][16];

  for (int dst = 0; dst < 4; ++dst) {
    for (int group = 0; group < 4; ++group) {
      for (int src = 0; src < 4; ++src) {
        out[dst][src + 4 * group] = in[src][4 * dst + group];
      }
    }
  }

  for (int r = 0; r < 4; ++r) {
    kernel.packet[r] = pload<Packet16h>(out[r]);
  }
}
|
|
1239
|
+
|
|
1240
|
+
// Transposes, in place, the 8x8 block of half-precision values stored as eight Packet8h rows in `kernel`,
// using three rounds of unpack (16-, 32- and 64-bit).
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 8>& kernel) {
  const __m128i r0 = _mm_castph_si128(kernel.packet[0]);
  const __m128i r1 = _mm_castph_si128(kernel.packet[1]);
  const __m128i r2 = _mm_castph_si128(kernel.packet[2]);
  const __m128i r3 = _mm_castph_si128(kernel.packet[3]);
  const __m128i r4 = _mm_castph_si128(kernel.packet[4]);
  const __m128i r5 = _mm_castph_si128(kernel.packet[5]);
  const __m128i r6 = _mm_castph_si128(kernel.packet[6]);
  const __m128i r7 = _mm_castph_si128(kernel.packet[7]);

  // Round 1: pair adjacent rows; "lo" covers columns 0..3, "hi" covers columns 4..7.
  const __m128i r01_lo = _mm_unpacklo_epi16(r0, r1);
  const __m128i r01_hi = _mm_unpackhi_epi16(r0, r1);
  const __m128i r23_lo = _mm_unpacklo_epi16(r2, r3);
  const __m128i r23_hi = _mm_unpackhi_epi16(r2, r3);
  const __m128i r45_lo = _mm_unpacklo_epi16(r4, r5);
  const __m128i r45_hi = _mm_unpackhi_epi16(r4, r5);
  const __m128i r67_lo = _mm_unpacklo_epi16(r6, r7);
  const __m128i r67_hi = _mm_unpackhi_epi16(r6, r7);

  // Round 2: merge row pairs into groups of four rows, two columns per register.
  const __m128i top_c01 = _mm_unpacklo_epi32(r01_lo, r23_lo);
  const __m128i top_c23 = _mm_unpackhi_epi32(r01_lo, r23_lo);
  const __m128i top_c45 = _mm_unpacklo_epi32(r01_hi, r23_hi);
  const __m128i top_c67 = _mm_unpackhi_epi32(r01_hi, r23_hi);
  const __m128i bot_c01 = _mm_unpacklo_epi32(r45_lo, r67_lo);
  const __m128i bot_c23 = _mm_unpackhi_epi32(r45_lo, r67_lo);
  const __m128i bot_c45 = _mm_unpacklo_epi32(r45_hi, r67_hi);
  const __m128i bot_c67 = _mm_unpackhi_epi32(r45_hi, r67_hi);

  // Round 3: merge the top and bottom four rows; each result is one full column of the input block.
  kernel.packet[0] = _mm_castsi128_ph(_mm_unpacklo_epi64(top_c01, bot_c01));
  kernel.packet[1] = _mm_castsi128_ph(_mm_unpackhi_epi64(top_c01, bot_c01));
  kernel.packet[2] = _mm_castsi128_ph(_mm_unpacklo_epi64(top_c23, bot_c23));
  kernel.packet[3] = _mm_castsi128_ph(_mm_unpackhi_epi64(top_c23, bot_c23));
  kernel.packet[4] = _mm_castsi128_ph(_mm_unpacklo_epi64(top_c45, bot_c45));
  kernel.packet[5] = _mm_castsi128_ph(_mm_unpackhi_epi64(top_c45, bot_c45));
  kernel.packet[6] = _mm_castsi128_ph(_mm_unpacklo_epi64(top_c67, bot_c67));
  kernel.packet[7] = _mm_castsi128_ph(_mm_unpackhi_epi64(top_c67, bot_c67));
}
|
|
1286
|
+
|
|
1287
|
+
// Transposes the four Packet8h rows of `kernel` through scalar buffers.
// Output packet i holds element 2 * i of each of the four input packets in lanes 0..3, followed by
// element 2 * i + 1 of each input packet in lanes 4..7.
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 4>& kernel) {
  EIGEN_ALIGN32 Eigen::half in[4][8];
  for (int r = 0; r < 4; ++r) {
    pstore<Eigen::half>(in[r], kernel.packet[r]);
  }

  EIGEN_ALIGN32 Eigen::half out[4][8];

  for (int dst = 0; dst < 4; ++dst) {
    for (int src = 0; src < 4; ++src) {
      out[dst][src] = in[src][2 * dst];
      out[dst][src + 4] = in[src][2 * dst + 1];
    }
  }

  for (int r = 0; r < 4; ++r) {
    kernel.packet[r] = pload<Packet8h>(out[r]);
  }
}
|
|
1310
|
+
|
|
1311
|
+
// preverse
|
|
1312
|
+
|
|
1313
|
+
template <>
|
|
1314
|
+
EIGEN_STRONG_INLINE Packet32h preverse(const Packet32h& a) {
|
|
1315
|
+
return _mm512_permutexvar_ph(_mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
|
|
1316
|
+
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),
|
|
1317
|
+
a);
|
|
1318
|
+
}
|
|
1319
|
+
|
|
1320
|
+
template <>
|
|
1321
|
+
EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) {
|
|
1322
|
+
__m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
|
|
1323
|
+
return _mm256_castsi256_ph(_mm256_insertf128_si256(
|
|
1324
|
+
_mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castph_si256(a), 1), m)),
|
|
1325
|
+
_mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castph_si256(a), 0), m), 1));
|
|
1326
|
+
}
|
|
1327
|
+
|
|
1328
|
+
template <>
|
|
1329
|
+
EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) {
|
|
1330
|
+
__m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
|
|
1331
|
+
return _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(a), m));
|
|
1332
|
+
}
|
|
1333
|
+
|
|
1334
|
+
// pscatter
|
|
1335
|
+
|
|
1336
|
+
template <>
|
|
1337
|
+
EIGEN_STRONG_INLINE void pscatter<half, Packet32h>(half* to, const Packet32h& from, Index stride) {
|
|
1338
|
+
EIGEN_ALIGN64 half aux[32];
|
|
1339
|
+
pstore(aux, from);
|
|
1340
|
+
|
|
1341
|
+
EIGEN_UNROLL_LOOP
|
|
1342
|
+
for (int i = 0; i < 32; i++) {
|
|
1343
|
+
to[stride * i] = aux[i];
|
|
1344
|
+
}
|
|
1345
|
+
}
|
|
1346
|
+
template <>
|
|
1347
|
+
EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride) {
|
|
1348
|
+
EIGEN_ALIGN64 half aux[16];
|
|
1349
|
+
pstore(aux, from);
|
|
1350
|
+
to[stride * 0] = aux[0];
|
|
1351
|
+
to[stride * 1] = aux[1];
|
|
1352
|
+
to[stride * 2] = aux[2];
|
|
1353
|
+
to[stride * 3] = aux[3];
|
|
1354
|
+
to[stride * 4] = aux[4];
|
|
1355
|
+
to[stride * 5] = aux[5];
|
|
1356
|
+
to[stride * 6] = aux[6];
|
|
1357
|
+
to[stride * 7] = aux[7];
|
|
1358
|
+
to[stride * 8] = aux[8];
|
|
1359
|
+
to[stride * 9] = aux[9];
|
|
1360
|
+
to[stride * 10] = aux[10];
|
|
1361
|
+
to[stride * 11] = aux[11];
|
|
1362
|
+
to[stride * 12] = aux[12];
|
|
1363
|
+
to[stride * 13] = aux[13];
|
|
1364
|
+
to[stride * 14] = aux[14];
|
|
1365
|
+
to[stride * 15] = aux[15];
|
|
1366
|
+
}
|
|
1367
|
+
|
|
1368
|
+
template <>
|
|
1369
|
+
EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride) {
|
|
1370
|
+
EIGEN_ALIGN32 Eigen::half aux[8];
|
|
1371
|
+
pstore(aux, from);
|
|
1372
|
+
to[stride * 0] = aux[0];
|
|
1373
|
+
to[stride * 1] = aux[1];
|
|
1374
|
+
to[stride * 2] = aux[2];
|
|
1375
|
+
to[stride * 3] = aux[3];
|
|
1376
|
+
to[stride * 4] = aux[4];
|
|
1377
|
+
to[stride * 5] = aux[5];
|
|
1378
|
+
to[stride * 6] = aux[6];
|
|
1379
|
+
to[stride * 7] = aux[7];
|
|
1380
|
+
}
|
|
1381
|
+
|
|
1382
|
+
// pgather
|
|
1383
|
+
|
|
1384
|
+
template <>
|
|
1385
|
+
EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) {
|
|
1386
|
+
return _mm512_set_ph(from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x,
|
|
1387
|
+
from[27 * stride].x, from[26 * stride].x, from[25 * stride].x, from[24 * stride].x,
|
|
1388
|
+
from[23 * stride].x, from[22 * stride].x, from[21 * stride].x, from[20 * stride].x,
|
|
1389
|
+
from[19 * stride].x, from[18 * stride].x, from[17 * stride].x, from[16 * stride].x,
|
|
1390
|
+
from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
|
|
1391
|
+
from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x,
|
|
1392
|
+
from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x,
|
|
1393
|
+
from[3 * stride].x, from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
|
|
1394
|
+
}
|
|
1395
|
+
|
|
1396
|
+
template <>
|
|
1397
|
+
EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride) {
|
|
1398
|
+
return _mm256_set_ph(from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
|
|
1399
|
+
from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x,
|
|
1400
|
+
from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x,
|
|
1401
|
+
from[3 * stride].x, from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
|
|
1402
|
+
}
|
|
1403
|
+
|
|
1404
|
+
template <>
|
|
1405
|
+
EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride) {
|
|
1406
|
+
return _mm_set_ph(from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x,
|
|
1407
|
+
from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
|
|
1408
|
+
}
|
|
1409
|
+
|
|
1410
|
+
} // end namespace internal
|
|
1411
|
+
} // end namespace Eigen
|
|
1412
|
+
|
|
1413
|
+
#endif // EIGEN_PACKET_MATH_FP16_AVX512_H
|