@smake/eigen 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/eigen/Eigen/AccelerateSupport +52 -0
- package/eigen/Eigen/Cholesky +18 -20
- package/eigen/Eigen/CholmodSupport +28 -28
- package/eigen/Eigen/Core +187 -120
- package/eigen/Eigen/Eigenvalues +16 -13
- package/eigen/Eigen/Geometry +18 -18
- package/eigen/Eigen/Householder +9 -7
- package/eigen/Eigen/IterativeLinearSolvers +8 -4
- package/eigen/Eigen/Jacobi +14 -13
- package/eigen/Eigen/KLUSupport +23 -21
- package/eigen/Eigen/LU +15 -16
- package/eigen/Eigen/MetisSupport +12 -12
- package/eigen/Eigen/OrderingMethods +54 -51
- package/eigen/Eigen/PaStiXSupport +23 -21
- package/eigen/Eigen/PardisoSupport +17 -14
- package/eigen/Eigen/QR +18 -20
- package/eigen/Eigen/QtAlignedMalloc +5 -12
- package/eigen/Eigen/SPQRSupport +21 -14
- package/eigen/Eigen/SVD +23 -17
- package/eigen/Eigen/Sparse +1 -2
- package/eigen/Eigen/SparseCholesky +18 -15
- package/eigen/Eigen/SparseCore +18 -17
- package/eigen/Eigen/SparseLU +9 -9
- package/eigen/Eigen/SparseQR +16 -14
- package/eigen/Eigen/StdDeque +5 -2
- package/eigen/Eigen/StdList +5 -2
- package/eigen/Eigen/StdVector +5 -2
- package/eigen/Eigen/SuperLUSupport +30 -24
- package/eigen/Eigen/ThreadPool +80 -0
- package/eigen/Eigen/UmfPackSupport +19 -17
- package/eigen/Eigen/Version +14 -0
- package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
- package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/LDLT.h +366 -405
- package/eigen/Eigen/src/Cholesky/LLT.h +323 -367
- package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
- package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +585 -529
- package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/ArithmeticSequence.h +143 -317
- package/eigen/Eigen/src/Core/Array.h +329 -370
- package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
- package/eigen/Eigen/src/Core/ArrayWrapper.h +126 -170
- package/eigen/Eigen/src/Core/Assign.h +30 -40
- package/eigen/Eigen/src/Core/AssignEvaluator.h +651 -604
- package/eigen/Eigen/src/Core/Assign_MKL.h +125 -120
- package/eigen/Eigen/src/Core/BandMatrix.h +267 -282
- package/eigen/Eigen/src/Core/Block.h +371 -390
- package/eigen/Eigen/src/Core/CommaInitializer.h +85 -100
- package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
- package/eigen/Eigen/src/Core/CoreEvaluators.h +1214 -937
- package/eigen/Eigen/src/Core/CoreIterators.h +72 -63
- package/eigen/Eigen/src/Core/CwiseBinaryOp.h +112 -129
- package/eigen/Eigen/src/Core/CwiseNullaryOp.h +676 -702
- package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
- package/eigen/Eigen/src/Core/CwiseUnaryOp.h +55 -67
- package/eigen/Eigen/src/Core/CwiseUnaryView.h +127 -92
- package/eigen/Eigen/src/Core/DenseBase.h +630 -658
- package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -628
- package/eigen/Eigen/src/Core/DenseStorage.h +511 -590
- package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
- package/eigen/Eigen/src/Core/Diagonal.h +168 -207
- package/eigen/Eigen/src/Core/DiagonalMatrix.h +346 -317
- package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
- package/eigen/Eigen/src/Core/Dot.h +167 -217
- package/eigen/Eigen/src/Core/EigenBase.h +74 -85
- package/eigen/Eigen/src/Core/Fill.h +138 -0
- package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
- package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -113
- package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
- package/eigen/Eigen/src/Core/GeneralProduct.h +315 -261
- package/eigen/Eigen/src/Core/GenericPacketMath.h +1182 -520
- package/eigen/Eigen/src/Core/GlobalFunctions.h +193 -157
- package/eigen/Eigen/src/Core/IO.h +131 -156
- package/eigen/Eigen/src/Core/IndexedView.h +209 -125
- package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
- package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/Inverse.h +50 -59
- package/eigen/Eigen/src/Core/Map.h +123 -141
- package/eigen/Eigen/src/Core/MapBase.h +255 -282
- package/eigen/Eigen/src/Core/MathFunctions.h +1247 -1201
- package/eigen/Eigen/src/Core/MathFunctionsImpl.h +162 -99
- package/eigen/Eigen/src/Core/Matrix.h +463 -494
- package/eigen/Eigen/src/Core/MatrixBase.h +468 -470
- package/eigen/Eigen/src/Core/NestByValue.h +58 -52
- package/eigen/Eigen/src/Core/NoAlias.h +79 -86
- package/eigen/Eigen/src/Core/NumTraits.h +206 -206
- package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +163 -142
- package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
- package/eigen/Eigen/src/Core/PlainObjectBase.h +858 -972
- package/eigen/Eigen/src/Core/Product.h +246 -130
- package/eigen/Eigen/src/Core/ProductEvaluators.h +779 -671
- package/eigen/Eigen/src/Core/Random.h +153 -164
- package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
- package/eigen/Eigen/src/Core/RealView.h +250 -0
- package/eigen/Eigen/src/Core/Redux.h +334 -314
- package/eigen/Eigen/src/Core/Ref.h +259 -257
- package/eigen/Eigen/src/Core/Replicate.h +92 -104
- package/eigen/Eigen/src/Core/Reshaped.h +215 -271
- package/eigen/Eigen/src/Core/ReturnByValue.h +47 -55
- package/eigen/Eigen/src/Core/Reverse.h +133 -148
- package/eigen/Eigen/src/Core/Select.h +68 -140
- package/eigen/Eigen/src/Core/SelfAdjointView.h +254 -290
- package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
- package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
- package/eigen/Eigen/src/Core/Solve.h +88 -102
- package/eigen/Eigen/src/Core/SolveTriangular.h +126 -124
- package/eigen/Eigen/src/Core/SolverBase.h +132 -133
- package/eigen/Eigen/src/Core/StableNorm.h +113 -147
- package/eigen/Eigen/src/Core/StlIterators.h +404 -248
- package/eigen/Eigen/src/Core/Stride.h +90 -92
- package/eigen/Eigen/src/Core/Swap.h +70 -39
- package/eigen/Eigen/src/Core/Transpose.h +258 -295
- package/eigen/Eigen/src/Core/Transpositions.h +270 -333
- package/eigen/Eigen/src/Core/TriangularMatrix.h +642 -743
- package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
- package/eigen/Eigen/src/Core/VectorwiseOp.h +653 -704
- package/eigen/Eigen/src/Core/Visitor.h +464 -308
- package/eigen/Eigen/src/Core/arch/AVX/Complex.h +380 -187
- package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +65 -163
- package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2145 -638
- package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
- package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +253 -60
- package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +278 -228
- package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +48 -269
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1597 -754
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
- package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +229 -41
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +420 -184
- package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +40 -49
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2962 -2213
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +196 -212
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +713 -441
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2380 -1362
- package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
- package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +390 -224
- package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +78 -67
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1784 -799
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +167 -50
- package/eigen/Eigen/src/Core/arch/Default/Half.h +528 -379
- package/eigen/Eigen/src/Core/arch/Default/Settings.h +10 -12
- package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
- package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +41 -40
- package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +550 -523
- package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
- package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +27 -30
- package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +8 -8
- package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
- package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
- package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
- package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
- package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
- package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
- package/eigen/Eigen/src/Core/arch/MSA/Complex.h +54 -82
- package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +84 -92
- package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +51 -47
- package/eigen/Eigen/src/Core/arch/NEON/Complex.h +454 -306
- package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +175 -115
- package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +23 -30
- package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4366 -2857
- package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +616 -393
- package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
- package/eigen/Eigen/src/Core/arch/SSE/Complex.h +350 -198
- package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +38 -149
- package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +1791 -912
- package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
- package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +128 -40
- package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +10 -6
- package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +156 -234
- package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +6 -3
- package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +27 -32
- package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +119 -117
- package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +325 -419
- package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +15 -17
- package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +325 -181
- package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +94 -83
- package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +811 -458
- package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +121 -124
- package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +576 -370
- package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +194 -109
- package/eigen/Eigen/src/Core/functors/StlFunctors.h +95 -112
- package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
- package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1038 -749
- package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1883 -1375
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +312 -370
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +189 -176
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +84 -81
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +292 -337
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
- package/eigen/Eigen/src/Core/products/Parallelizer.h +207 -105
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +327 -388
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +138 -147
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
- package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
- package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -47
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -277
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
- package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +68 -94
- package/eigen/Eigen/src/Core/util/Assert.h +158 -0
- package/eigen/Eigen/src/Core/util/BlasUtil.h +342 -303
- package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +348 -317
- package/eigen/Eigen/src/Core/util/Constants.h +297 -262
- package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -90
- package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
- package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +449 -247
- package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
- package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
- package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +417 -116
- package/eigen/Eigen/src/Core/util/IntegralConstant.h +211 -204
- package/eigen/Eigen/src/Core/util/MKL_support.h +39 -37
- package/eigen/Eigen/src/Core/util/Macros.h +655 -773
- package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
- package/eigen/Eigen/src/Core/util/Memory.h +970 -748
- package/eigen/Eigen/src/Core/util/Meta.h +581 -633
- package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
- package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
- package/eigen/Eigen/src/Core/util/ReshapedHelper.h +17 -17
- package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
- package/eigen/Eigen/src/Core/util/StaticAssert.h +50 -166
- package/eigen/Eigen/src/Core/util/SymbolicIndex.h +377 -225
- package/eigen/Eigen/src/Core/util/XprHelper.h +784 -547
- package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
- package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
- package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
- package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
- package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
- package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +89 -105
- package/eigen/Eigen/src/Eigenvalues/RealQZ.h +537 -607
- package/eigen/Eigen/src/Eigenvalues/RealSchur.h +342 -381
- package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +541 -595
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
- package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +430 -462
- package/eigen/Eigen/src/Geometry/AlignedBox.h +226 -227
- package/eigen/Eigen/src/Geometry/AngleAxis.h +131 -133
- package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
- package/eigen/Eigen/src/Geometry/Homogeneous.h +285 -333
- package/eigen/Eigen/src/Geometry/Hyperplane.h +151 -160
- package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -146
- package/eigen/Eigen/src/Geometry/ParametrizedLine.h +127 -127
- package/eigen/Eigen/src/Geometry/Quaternion.h +566 -506
- package/eigen/Eigen/src/Geometry/Rotation2D.h +107 -105
- package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
- package/eigen/Eigen/src/Geometry/Scaling.h +113 -106
- package/eigen/Eigen/src/Geometry/Transform.h +858 -936
- package/eigen/Eigen/src/Geometry/Translation.h +94 -92
- package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
- package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +90 -104
- package/eigen/Eigen/src/Householder/BlockHouseholder.h +51 -46
- package/eigen/Eigen/src/Householder/Householder.h +102 -124
- package/eigen/Eigen/src/Householder/HouseholderSequence.h +412 -453
- package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +149 -162
- package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +124 -119
- package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +92 -104
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +251 -243
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +224 -228
- package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +178 -227
- package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +79 -84
- package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +54 -60
- package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Jacobi/Jacobi.h +252 -308
- package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/KLUSupport/KLUSupport.h +208 -227
- package/eigen/Eigen/src/LU/Determinant.h +50 -69
- package/eigen/Eigen/src/LU/FullPivLU.h +545 -596
- package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/LU/InverseImpl.h +206 -285
- package/eigen/Eigen/src/LU/PartialPivLU.h +390 -428
- package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
- package/eigen/Eigen/src/LU/arch/InverseSize4.h +72 -70
- package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
- package/eigen/Eigen/src/OrderingMethods/Amd.h +243 -265
- package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +831 -1004
- package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/OrderingMethods/Ordering.h +112 -119
- package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
- package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -430
- package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +479 -479
- package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
- package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +166 -153
- package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +495 -475
- package/eigen/Eigen/src/QR/HouseholderQR.h +394 -285
- package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
- package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +244 -264
- package/eigen/Eigen/src/SVD/BDCSVD.h +817 -713
- package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
- package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SVD/JacobiSVD.h +577 -543
- package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
- package/eigen/Eigen/src/SVD/SVDBase.h +242 -182
- package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +200 -235
- package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +765 -594
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +308 -94
- package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
- package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -252
- package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +134 -178
- package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCore/SparseAssign.h +149 -140
- package/eigen/Eigen/src/SparseCore/SparseBlock.h +403 -440
- package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
- package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +525 -303
- package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +555 -339
- package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
- package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +169 -197
- package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
- package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
- package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
- package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
- package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1603 -1245
- package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -350
- package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
- package/eigen/Eigen/src/SparseCore/SparseProduct.h +94 -97
- package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
- package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
- package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +370 -416
- package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
- package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
- package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
- package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
- package/eigen/Eigen/src/SparseCore/SparseUtil.h +138 -115
- package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
- package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
- package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
- package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseLU/SparseLU.h +756 -710
- package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
- package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
- package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
- package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +245 -301
- package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
- package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
- package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +89 -100
- package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
- package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
- package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +124 -132
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
- package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
- package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
- package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
- package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseQR/SparseQR.h +450 -502
- package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -93
- package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
- package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
- package/eigen/Eigen/src/StlSupport/details.h +48 -50
- package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -730
- package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
- package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
- package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
- package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
- package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
- package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
- package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
- package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
- package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
- package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
- package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
- package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
- package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +428 -464
- package/eigen/Eigen/src/misc/Image.h +41 -43
- package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/misc/Kernel.h +39 -41
- package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
- package/eigen/Eigen/src/misc/blas.h +83 -426
- package/eigen/Eigen/src/misc/lapacke.h +9972 -16179
- package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
- package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
- package/eigen/Eigen/src/plugins/{BlockMethods.h → BlockMethods.inc} +434 -506
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
- package/eigen/Eigen/src/plugins/{CommonCwiseUnaryOps.h → CommonCwiseUnaryOps.inc} +58 -68
- package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
- package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
- package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
- package/package.json +1 -1
- package/eigen/COPYING.APACHE +0 -203
- package/eigen/COPYING.BSD +0 -26
- package/eigen/COPYING.GPL +0 -674
- package/eigen/COPYING.LGPL +0 -502
- package/eigen/COPYING.MINPACK +0 -51
- package/eigen/COPYING.MPL2 +0 -373
- package/eigen/COPYING.README +0 -18
- package/eigen/Eigen/src/Core/BooleanRedux.h +0 -162
- package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -258
- package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +0 -120
- package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +0 -694
- package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
- package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
- package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
- package/eigen/Eigen/src/misc/lapack.h +0 -152
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -358
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -696
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
- package/eigen/Eigen/src/plugins/IndexedViewMethods.h +0 -262
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -95
- package/eigen/Eigen/src/plugins/ReshapedMethods.h +0 -149
- package/eigen/README.md +0 -5
|
@@ -10,6 +10,9 @@
|
|
|
10
10
|
#ifndef EIGEN_PACKET_MATH_AVX512_H
|
|
11
11
|
#define EIGEN_PACKET_MATH_AVX512_H
|
|
12
12
|
|
|
13
|
+
// IWYU pragma: private
|
|
14
|
+
#include "../../InternalHeaderCheck.h"
|
|
15
|
+
|
|
13
16
|
namespace Eigen {
|
|
14
17
|
|
|
15
18
|
namespace internal {
|
|
@@ -31,9 +34,16 @@ namespace internal {
|
|
|
31
34
|
typedef __m512 Packet16f;
|
|
32
35
|
typedef __m512i Packet16i;
|
|
33
36
|
typedef __m512d Packet8d;
|
|
37
|
+
typedef eigen_packet_wrapper<__m512i, 1> Packet8l;
|
|
38
|
+
#ifndef EIGEN_VECTORIZE_AVX512FP16
|
|
34
39
|
typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
|
|
40
|
+
#endif
|
|
35
41
|
typedef eigen_packet_wrapper<__m256i, 2> Packet16bf;
|
|
36
42
|
|
|
43
|
+
typedef eigen_packet_wrapper<__m512i, 6> Packet32s;
|
|
44
|
+
typedef eigen_packet_wrapper<__m256i, 6> Packet16s;
|
|
45
|
+
typedef eigen_packet_wrapper<__m128i, 6> Packet8s;
|
|
46
|
+
|
|
37
47
|
template <>
|
|
38
48
|
struct is_arithmetic<__m512> {
|
|
39
49
|
enum { value = true };
|
|
@@ -46,8 +56,16 @@ template <>
|
|
|
46
56
|
struct is_arithmetic<__m512d> {
|
|
47
57
|
enum { value = true };
|
|
48
58
|
};
|
|
59
|
+
template <>
|
|
60
|
+
struct is_arithmetic<Packet8l> {
|
|
61
|
+
enum { value = true };
|
|
62
|
+
};
|
|
49
63
|
|
|
50
|
-
|
|
64
|
+
#ifndef EIGEN_VECTORIZE_AVX512FP16
|
|
65
|
+
template <>
|
|
66
|
+
struct is_arithmetic<Packet16h> {
|
|
67
|
+
enum { value = true };
|
|
68
|
+
};
|
|
51
69
|
|
|
52
70
|
template <>
|
|
53
71
|
struct packet_traits<half> : default_packet_traits {
|
|
@@ -58,112 +76,114 @@ struct packet_traits<half> : default_packet_traits {
|
|
|
58
76
|
Vectorizable = 1,
|
|
59
77
|
AlignedOnScalar = 1,
|
|
60
78
|
size = 16,
|
|
61
|
-
HasHalfPacket = 1,
|
|
62
79
|
|
|
63
|
-
HasCmp
|
|
64
|
-
HasAdd
|
|
65
|
-
HasSub
|
|
66
|
-
HasMul
|
|
67
|
-
HasDiv
|
|
80
|
+
HasCmp = 1,
|
|
81
|
+
HasAdd = 1,
|
|
82
|
+
HasSub = 1,
|
|
83
|
+
HasMul = 1,
|
|
84
|
+
HasDiv = 1,
|
|
68
85
|
HasNegate = 1,
|
|
69
|
-
HasAbs
|
|
70
|
-
HasAbs2
|
|
71
|
-
HasMin
|
|
72
|
-
HasMax
|
|
73
|
-
HasConj
|
|
86
|
+
HasAbs = 1,
|
|
87
|
+
HasAbs2 = 0,
|
|
88
|
+
HasMin = 1,
|
|
89
|
+
HasMax = 1,
|
|
90
|
+
HasConj = 1,
|
|
74
91
|
HasSetLinear = 0,
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
HasSin = EIGEN_FAST_MATH,
|
|
82
|
-
HasCos = EIGEN_FAST_MATH,
|
|
83
|
-
HasTanh = EIGEN_FAST_MATH,
|
|
84
|
-
HasErf = EIGEN_FAST_MATH,
|
|
85
|
-
HasBlend = 0,
|
|
86
|
-
HasRound = 1,
|
|
87
|
-
HasFloor = 1,
|
|
88
|
-
HasCeil = 1,
|
|
89
|
-
HasRint = 1,
|
|
92
|
+
HasSqrt = 1,
|
|
93
|
+
HasRsqrt = 1,
|
|
94
|
+
HasLog = 1,
|
|
95
|
+
HasLog1p = 1,
|
|
96
|
+
HasExp = 1,
|
|
97
|
+
HasExpm1 = 1,
|
|
90
98
|
HasBessel = 1,
|
|
91
|
-
HasNdtri
|
|
99
|
+
HasNdtri = 1,
|
|
100
|
+
HasSin = EIGEN_FAST_MATH,
|
|
101
|
+
HasCos = EIGEN_FAST_MATH,
|
|
102
|
+
HasTanh = EIGEN_FAST_MATH,
|
|
103
|
+
HasErf = EIGEN_FAST_MATH,
|
|
104
|
+
HasBlend = 0
|
|
92
105
|
};
|
|
93
106
|
};
|
|
107
|
+
#endif
|
|
94
108
|
|
|
95
|
-
template<>
|
|
96
|
-
{
|
|
109
|
+
template <>
|
|
110
|
+
struct packet_traits<float> : default_packet_traits {
|
|
97
111
|
typedef Packet16f type;
|
|
98
112
|
typedef Packet8f half;
|
|
99
113
|
enum {
|
|
100
114
|
Vectorizable = 1,
|
|
101
115
|
AlignedOnScalar = 1,
|
|
102
116
|
size = 16,
|
|
103
|
-
HasHalfPacket = 1,
|
|
104
117
|
|
|
105
118
|
HasAbs = 1,
|
|
106
|
-
HasMin
|
|
107
|
-
HasMax
|
|
108
|
-
HasConj
|
|
109
|
-
HasBlend =
|
|
119
|
+
HasMin = 1,
|
|
120
|
+
HasMax = 1,
|
|
121
|
+
HasConj = 1,
|
|
122
|
+
HasBlend = 1,
|
|
110
123
|
HasSin = EIGEN_FAST_MATH,
|
|
111
124
|
HasCos = EIGEN_FAST_MATH,
|
|
112
|
-
|
|
125
|
+
HasACos = 1,
|
|
126
|
+
HasASin = 1,
|
|
127
|
+
HasATan = 1,
|
|
128
|
+
HasATanh = 1,
|
|
129
|
+
HasSqrt = 1,
|
|
130
|
+
HasRsqrt = 1,
|
|
131
|
+
HasCbrt = 1,
|
|
113
132
|
HasLog = 1,
|
|
114
|
-
HasLog1p
|
|
115
|
-
HasExpm1
|
|
133
|
+
HasLog1p = 1,
|
|
134
|
+
HasExpm1 = 1,
|
|
116
135
|
HasNdtri = 1,
|
|
117
|
-
HasBessel
|
|
136
|
+
HasBessel = 1,
|
|
118
137
|
HasExp = 1,
|
|
119
|
-
|
|
120
|
-
|
|
138
|
+
HasPow = 1,
|
|
139
|
+
HasReciprocal = EIGEN_FAST_MATH,
|
|
121
140
|
HasTanh = EIGEN_FAST_MATH,
|
|
122
141
|
HasErf = EIGEN_FAST_MATH,
|
|
123
|
-
|
|
124
|
-
HasCmp
|
|
125
|
-
HasDiv = 1
|
|
126
|
-
HasRound = 1,
|
|
127
|
-
HasFloor = 1,
|
|
128
|
-
HasCeil = 1,
|
|
129
|
-
HasRint = 1
|
|
142
|
+
HasErfc = EIGEN_FAST_MATH,
|
|
143
|
+
HasCmp = 1,
|
|
144
|
+
HasDiv = 1
|
|
130
145
|
};
|
|
131
|
-
|
|
132
|
-
template<>
|
|
133
|
-
{
|
|
146
|
+
};
|
|
147
|
+
template <>
|
|
148
|
+
struct packet_traits<double> : default_packet_traits {
|
|
134
149
|
typedef Packet8d type;
|
|
135
150
|
typedef Packet4d half;
|
|
136
151
|
enum {
|
|
137
152
|
Vectorizable = 1,
|
|
138
153
|
AlignedOnScalar = 1,
|
|
139
154
|
size = 8,
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
155
|
+
HasBlend = 1,
|
|
156
|
+
HasSqrt = 1,
|
|
157
|
+
HasRsqrt = 1,
|
|
158
|
+
HasCbrt = 1,
|
|
159
|
+
HasSin = EIGEN_FAST_MATH,
|
|
160
|
+
HasCos = EIGEN_FAST_MATH,
|
|
161
|
+
HasLog = 1,
|
|
143
162
|
HasExp = 1,
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
HasRint = 1
|
|
163
|
+
HasPow = 1,
|
|
164
|
+
HasATan = 1,
|
|
165
|
+
HasTanh = EIGEN_FAST_MATH,
|
|
166
|
+
HasErf = EIGEN_FAST_MATH,
|
|
167
|
+
HasErfc = EIGEN_FAST_MATH,
|
|
168
|
+
HasATanh = 1,
|
|
169
|
+
HasCmp = 1,
|
|
170
|
+
HasDiv = 1
|
|
153
171
|
};
|
|
154
172
|
};
|
|
155
173
|
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
{
|
|
174
|
+
template <>
|
|
175
|
+
struct packet_traits<int> : default_packet_traits {
|
|
159
176
|
typedef Packet16i type;
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
177
|
+
typedef Packet8i half;
|
|
178
|
+
enum { Vectorizable = 1, AlignedOnScalar = 1, HasBlend = 0, HasCmp = 1, HasDiv = 1, size = 16 };
|
|
179
|
+
};
|
|
180
|
+
|
|
181
|
+
template <>
|
|
182
|
+
struct packet_traits<int64_t> : default_packet_traits {
|
|
183
|
+
typedef Packet8l type;
|
|
184
|
+
typedef Packet4l half;
|
|
185
|
+
enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, size = 8 };
|
|
165
186
|
};
|
|
166
|
-
*/
|
|
167
187
|
|
|
168
188
|
template <>
|
|
169
189
|
struct unpacket_traits<Packet16f> {
|
|
@@ -171,26 +191,102 @@ struct unpacket_traits<Packet16f> {
|
|
|
171
191
|
typedef Packet8f half;
|
|
172
192
|
typedef Packet16i integer_packet;
|
|
173
193
|
typedef uint16_t mask_t;
|
|
174
|
-
enum {
|
|
194
|
+
enum {
|
|
195
|
+
size = 16,
|
|
196
|
+
alignment = Aligned64,
|
|
197
|
+
vectorizable = true,
|
|
198
|
+
masked_load_available = true,
|
|
199
|
+
masked_store_available = true,
|
|
200
|
+
masked_fpops_available = true
|
|
201
|
+
};
|
|
175
202
|
};
|
|
176
203
|
template <>
|
|
177
204
|
struct unpacket_traits<Packet8d> {
|
|
178
205
|
typedef double type;
|
|
179
206
|
typedef Packet4d half;
|
|
180
|
-
|
|
207
|
+
typedef Packet8l integer_packet;
|
|
208
|
+
typedef uint8_t mask_t;
|
|
209
|
+
enum {
|
|
210
|
+
size = 8,
|
|
211
|
+
alignment = Aligned64,
|
|
212
|
+
vectorizable = true,
|
|
213
|
+
masked_load_available = true,
|
|
214
|
+
masked_store_available = true,
|
|
215
|
+
masked_fpops_available = true
|
|
216
|
+
};
|
|
181
217
|
};
|
|
182
218
|
template <>
|
|
183
219
|
struct unpacket_traits<Packet16i> {
|
|
184
220
|
typedef int type;
|
|
185
221
|
typedef Packet8i half;
|
|
186
|
-
enum {
|
|
222
|
+
enum {
|
|
223
|
+
size = 16,
|
|
224
|
+
alignment = Aligned64,
|
|
225
|
+
vectorizable = true,
|
|
226
|
+
masked_load_available = false,
|
|
227
|
+
masked_store_available = false
|
|
228
|
+
};
|
|
229
|
+
};
|
|
230
|
+
|
|
231
|
+
template <>
|
|
232
|
+
struct unpacket_traits<Packet8l> {
|
|
233
|
+
typedef int64_t type;
|
|
234
|
+
typedef Packet4l half;
|
|
235
|
+
enum {
|
|
236
|
+
size = 8,
|
|
237
|
+
alignment = Aligned64,
|
|
238
|
+
vectorizable = true,
|
|
239
|
+
masked_load_available = false,
|
|
240
|
+
masked_store_available = false
|
|
241
|
+
};
|
|
187
242
|
};
|
|
188
243
|
|
|
189
|
-
|
|
244
|
+
#ifndef EIGEN_VECTORIZE_AVX512FP16
|
|
245
|
+
template <>
|
|
190
246
|
struct unpacket_traits<Packet16h> {
|
|
191
247
|
typedef Eigen::half type;
|
|
192
248
|
typedef Packet8h half;
|
|
193
|
-
enum {
|
|
249
|
+
enum {
|
|
250
|
+
size = 16,
|
|
251
|
+
alignment = Aligned32,
|
|
252
|
+
vectorizable = true,
|
|
253
|
+
masked_load_available = false,
|
|
254
|
+
masked_store_available = false
|
|
255
|
+
};
|
|
256
|
+
};
|
|
257
|
+
#endif
|
|
258
|
+
|
|
259
|
+
template <>
|
|
260
|
+
struct unpacket_traits<Packet32s> {
|
|
261
|
+
typedef numext::int16_t type;
|
|
262
|
+
typedef Packet16s half;
|
|
263
|
+
enum {
|
|
264
|
+
size = 32,
|
|
265
|
+
alignment = Aligned64,
|
|
266
|
+
vectorizable = false,
|
|
267
|
+
};
|
|
268
|
+
};
|
|
269
|
+
|
|
270
|
+
template <>
|
|
271
|
+
struct unpacket_traits<Packet16s> {
|
|
272
|
+
typedef numext::int16_t type;
|
|
273
|
+
typedef Packet8s half;
|
|
274
|
+
enum {
|
|
275
|
+
size = 16,
|
|
276
|
+
alignment = Aligned32,
|
|
277
|
+
vectorizable = false,
|
|
278
|
+
};
|
|
279
|
+
};
|
|
280
|
+
|
|
281
|
+
template <>
|
|
282
|
+
struct unpacket_traits<Packet8s> {
|
|
283
|
+
typedef numext::int16_t type;
|
|
284
|
+
typedef Packet8s half;
|
|
285
|
+
enum {
|
|
286
|
+
size = 8,
|
|
287
|
+
alignment = Aligned16,
|
|
288
|
+
vectorizable = false,
|
|
289
|
+
};
|
|
194
290
|
};
|
|
195
291
|
|
|
196
292
|
template <>
|
|
@@ -205,6 +301,10 @@ template <>
|
|
|
205
301
|
EIGEN_STRONG_INLINE Packet16i pset1<Packet16i>(const int& from) {
|
|
206
302
|
return _mm512_set1_epi32(from);
|
|
207
303
|
}
|
|
304
|
+
template <>
|
|
305
|
+
EIGEN_STRONG_INLINE Packet8l pset1<Packet8l>(const int64_t& from) {
|
|
306
|
+
return _mm512_set1_epi64(from);
|
|
307
|
+
}
|
|
208
308
|
|
|
209
309
|
template <>
|
|
210
310
|
EIGEN_STRONG_INLINE Packet16f pset1frombits<Packet16f>(unsigned int from) {
|
|
@@ -216,84 +316,151 @@ EIGEN_STRONG_INLINE Packet8d pset1frombits<Packet8d>(const numext::uint64_t from
|
|
|
216
316
|
return _mm512_castsi512_pd(_mm512_set1_epi64(from));
|
|
217
317
|
}
|
|
218
318
|
|
|
219
|
-
template<>
|
|
220
|
-
|
|
221
|
-
|
|
319
|
+
template <>
|
|
320
|
+
EIGEN_STRONG_INLINE Packet16f pzero(const Packet16f& /*a*/) {
|
|
321
|
+
return _mm512_setzero_ps();
|
|
322
|
+
}
|
|
323
|
+
template <>
|
|
324
|
+
EIGEN_STRONG_INLINE Packet8d pzero(const Packet8d& /*a*/) {
|
|
325
|
+
return _mm512_setzero_pd();
|
|
326
|
+
}
|
|
327
|
+
template <>
|
|
328
|
+
EIGEN_STRONG_INLINE Packet16i pzero(const Packet16i& /*a*/) {
|
|
329
|
+
return _mm512_setzero_si512();
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
template <>
|
|
333
|
+
EIGEN_STRONG_INLINE Packet8l pzero(const Packet8l& /*a*/) {
|
|
334
|
+
return _mm512_setzero_si512();
|
|
335
|
+
}
|
|
222
336
|
|
|
223
|
-
template<>
|
|
224
|
-
|
|
225
|
-
|
|
337
|
+
template <>
|
|
338
|
+
EIGEN_STRONG_INLINE Packet16f peven_mask(const Packet16f& /*a*/) {
|
|
339
|
+
return _mm512_castsi512_ps(_mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1));
|
|
226
340
|
}
|
|
227
|
-
template<>
|
|
228
|
-
|
|
229
|
-
|
|
341
|
+
template <>
|
|
342
|
+
EIGEN_STRONG_INLINE Packet16i peven_mask(const Packet16i& /*a*/) {
|
|
343
|
+
return _mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
|
|
230
344
|
}
|
|
231
|
-
template<>
|
|
232
|
-
|
|
233
|
-
|
|
345
|
+
template <>
|
|
346
|
+
EIGEN_STRONG_INLINE Packet8d peven_mask(const Packet8d& /*a*/) {
|
|
347
|
+
return _mm512_castsi512_pd(_mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1));
|
|
348
|
+
}
|
|
349
|
+
template <>
|
|
350
|
+
EIGEN_STRONG_INLINE Packet8l peven_mask(const Packet8l& /*a*/) {
|
|
351
|
+
return _mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1);
|
|
234
352
|
}
|
|
235
353
|
|
|
236
354
|
template <>
|
|
237
355
|
EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
|
|
356
|
+
#if (EIGEN_COMP_GNUC != 0) || (EIGEN_COMP_CLANG != 0)
|
|
357
|
+
// Inline asm here helps reduce some register spilling in TRSM kernels.
|
|
358
|
+
// See note in unrolls::gemm::microKernel in TrsmKernel.h
|
|
359
|
+
Packet16f ret;
|
|
360
|
+
__asm__("vbroadcastss %[mem], %[dst]" : [dst] "=v"(ret) : [mem] "m"(*from));
|
|
361
|
+
return ret;
|
|
362
|
+
#else
|
|
238
363
|
return _mm512_broadcastss_ps(_mm_load_ps1(from));
|
|
364
|
+
#endif
|
|
239
365
|
}
|
|
240
366
|
template <>
|
|
241
367
|
EIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(const double* from) {
|
|
368
|
+
#if (EIGEN_COMP_GNUC != 0) || (EIGEN_COMP_CLANG != 0)
|
|
369
|
+
Packet8d ret;
|
|
370
|
+
__asm__("vbroadcastsd %[mem], %[dst]" : [dst] "=v"(ret) : [mem] "m"(*from));
|
|
371
|
+
return ret;
|
|
372
|
+
#else
|
|
242
373
|
return _mm512_set1_pd(*from);
|
|
374
|
+
#endif
|
|
243
375
|
}
|
|
244
376
|
|
|
245
377
|
template <>
|
|
246
378
|
EIGEN_STRONG_INLINE Packet16f plset<Packet16f>(const float& a) {
|
|
247
|
-
return _mm512_add_ps(
|
|
248
|
-
|
|
249
|
-
_mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f,
|
|
250
|
-
4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
|
|
379
|
+
return _mm512_add_ps(_mm512_set1_ps(a), _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f,
|
|
380
|
+
6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
|
|
251
381
|
}
|
|
252
382
|
template <>
|
|
253
383
|
EIGEN_STRONG_INLINE Packet8d plset<Packet8d>(const double& a) {
|
|
254
|
-
return _mm512_add_pd(_mm512_set1_pd(a),
|
|
255
|
-
|
|
384
|
+
return _mm512_add_pd(_mm512_set1_pd(a), _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
|
|
385
|
+
}
|
|
386
|
+
template <>
|
|
387
|
+
EIGEN_STRONG_INLINE Packet16i plset<Packet16i>(const int& a) {
|
|
388
|
+
return _mm512_add_epi32(_mm512_set1_epi32(a), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
|
|
389
|
+
}
|
|
390
|
+
template <>
|
|
391
|
+
EIGEN_STRONG_INLINE Packet8l plset<Packet8l>(const int64_t& a) {
|
|
392
|
+
return _mm512_add_epi64(_mm512_set1_epi64(a), _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0));
|
|
256
393
|
}
|
|
257
394
|
|
|
258
395
|
template <>
|
|
259
|
-
EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a,
|
|
260
|
-
const Packet16f& b) {
|
|
396
|
+
EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b) {
|
|
261
397
|
return _mm512_add_ps(a, b);
|
|
262
398
|
}
|
|
263
399
|
template <>
|
|
264
|
-
EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a,
|
|
265
|
-
const Packet8d& b) {
|
|
400
|
+
EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a, const Packet8d& b) {
|
|
266
401
|
return _mm512_add_pd(a, b);
|
|
267
402
|
}
|
|
268
403
|
template <>
|
|
269
|
-
EIGEN_STRONG_INLINE Packet16i padd<Packet16i>(const Packet16i& a,
|
|
270
|
-
const Packet16i& b) {
|
|
404
|
+
EIGEN_STRONG_INLINE Packet16i padd<Packet16i>(const Packet16i& a, const Packet16i& b) {
|
|
271
405
|
return _mm512_add_epi32(a, b);
|
|
272
406
|
}
|
|
407
|
+
template <>
|
|
408
|
+
EIGEN_STRONG_INLINE Packet8l padd<Packet8l>(const Packet8l& a, const Packet8l& b) {
|
|
409
|
+
return _mm512_add_epi64(a, b);
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
template <>
|
|
413
|
+
EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b, uint16_t umask) {
|
|
414
|
+
__mmask16 mask = static_cast<__mmask16>(umask);
|
|
415
|
+
return _mm512_maskz_add_ps(mask, a, b);
|
|
416
|
+
}
|
|
417
|
+
template <>
|
|
418
|
+
EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a, const Packet8d& b, uint8_t umask) {
|
|
419
|
+
__mmask8 mask = static_cast<__mmask8>(umask);
|
|
420
|
+
return _mm512_maskz_add_pd(mask, a, b);
|
|
421
|
+
}
|
|
273
422
|
|
|
274
423
|
template <>
|
|
275
|
-
EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a,
|
|
276
|
-
const Packet16f& b) {
|
|
424
|
+
EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a, const Packet16f& b) {
|
|
277
425
|
return _mm512_sub_ps(a, b);
|
|
278
426
|
}
|
|
279
427
|
template <>
|
|
280
|
-
EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a,
|
|
281
|
-
const Packet8d& b) {
|
|
428
|
+
EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a, const Packet8d& b) {
|
|
282
429
|
return _mm512_sub_pd(a, b);
|
|
283
430
|
}
|
|
284
431
|
template <>
|
|
285
|
-
EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(const Packet16i& a,
|
|
286
|
-
const Packet16i& b) {
|
|
432
|
+
EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(const Packet16i& a, const Packet16i& b) {
|
|
287
433
|
return _mm512_sub_epi32(a, b);
|
|
288
434
|
}
|
|
435
|
+
template <>
|
|
436
|
+
EIGEN_STRONG_INLINE Packet8l psub<Packet8l>(const Packet8l& a, const Packet8l& b) {
|
|
437
|
+
return _mm512_sub_epi64(a, b);
|
|
438
|
+
}
|
|
289
439
|
|
|
290
440
|
template <>
|
|
291
441
|
EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
|
|
292
|
-
|
|
442
|
+
// NOTE: MSVC seems to struggle with _mm512_set1_epi32, leading to random results.
|
|
443
|
+
// The intel docs give it a relatively high latency as well, so we're probably
|
|
444
|
+
// better off with using _mm512_set_epi32 directly anyways.
|
|
445
|
+
const __m512i mask =
|
|
446
|
+
_mm512_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000,
|
|
447
|
+
0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
|
|
448
|
+
return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), mask));
|
|
293
449
|
}
|
|
294
450
|
template <>
|
|
295
451
|
EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) {
|
|
296
|
-
|
|
452
|
+
const __m512i mask =
|
|
453
|
+
_mm512_set_epi64(0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL,
|
|
454
|
+
0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL);
|
|
455
|
+
return _mm512_castsi512_pd(_mm512_xor_epi64(_mm512_castpd_si512(a), mask));
|
|
456
|
+
}
|
|
457
|
+
template <>
|
|
458
|
+
EIGEN_STRONG_INLINE Packet16i pnegate(const Packet16i& a) {
|
|
459
|
+
return _mm512_sub_epi32(_mm512_setzero_si512(), a);
|
|
460
|
+
}
|
|
461
|
+
template <>
|
|
462
|
+
EIGEN_STRONG_INLINE Packet8l pnegate(const Packet8l& a) {
|
|
463
|
+
return _mm512_sub_epi64(_mm512_setzero_si512(), a);
|
|
297
464
|
}
|
|
298
465
|
|
|
299
466
|
template <>
|
|
@@ -308,144 +475,217 @@ template <>
|
|
|
308
475
|
EIGEN_STRONG_INLINE Packet16i pconj(const Packet16i& a) {
|
|
309
476
|
return a;
|
|
310
477
|
}
|
|
478
|
+
template <>
|
|
479
|
+
EIGEN_STRONG_INLINE Packet8l pconj(const Packet8l& a) {
|
|
480
|
+
return a;
|
|
481
|
+
}
|
|
311
482
|
|
|
312
483
|
template <>
|
|
313
|
-
EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a,
|
|
314
|
-
const Packet16f& b) {
|
|
484
|
+
EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a, const Packet16f& b) {
|
|
315
485
|
return _mm512_mul_ps(a, b);
|
|
316
486
|
}
|
|
317
487
|
template <>
|
|
318
|
-
EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a,
|
|
319
|
-
const Packet8d& b) {
|
|
488
|
+
EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a, const Packet8d& b) {
|
|
320
489
|
return _mm512_mul_pd(a, b);
|
|
321
490
|
}
|
|
322
491
|
template <>
|
|
323
|
-
EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a,
|
|
324
|
-
const Packet16i& b) {
|
|
492
|
+
EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a, const Packet16i& b) {
|
|
325
493
|
return _mm512_mullo_epi32(a, b);
|
|
326
494
|
}
|
|
495
|
+
template <>
|
|
496
|
+
EIGEN_STRONG_INLINE Packet8l pmul<Packet8l>(const Packet8l& a, const Packet8l& b) {
|
|
497
|
+
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
|
498
|
+
return _mm512_mullo_epi64(a, b);
|
|
499
|
+
#else
|
|
500
|
+
return _mm512_mullox_epi64(a, b);
|
|
501
|
+
#endif
|
|
502
|
+
}
|
|
327
503
|
|
|
328
504
|
template <>
|
|
329
|
-
EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a,
|
|
330
|
-
const Packet16f& b) {
|
|
505
|
+
EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a, const Packet16f& b) {
|
|
331
506
|
return _mm512_div_ps(a, b);
|
|
332
507
|
}
|
|
508
|
+
|
|
333
509
|
template <>
|
|
334
|
-
EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a,
|
|
335
|
-
const Packet8d& b) {
|
|
510
|
+
EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a, const Packet8d& b) {
|
|
336
511
|
return _mm512_div_pd(a, b);
|
|
337
512
|
}
|
|
338
513
|
|
|
514
|
+
template <>
|
|
515
|
+
EIGEN_STRONG_INLINE Packet16i pdiv<Packet16i>(const Packet16i& a, const Packet16i& b) {
|
|
516
|
+
Packet8i q_lo = pdiv<Packet8i>(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b, 0));
|
|
517
|
+
Packet8i q_hi = pdiv<Packet8i>(_mm512_extracti64x4_epi64(a, 1), _mm512_extracti64x4_epi64(b, 1));
|
|
518
|
+
return _mm512_inserti64x4(_mm512_castsi256_si512(q_lo), q_hi, 1);
|
|
519
|
+
}
|
|
520
|
+
|
|
339
521
|
#ifdef EIGEN_VECTORIZE_FMA
|
|
340
522
|
template <>
|
|
341
|
-
EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b,
|
|
342
|
-
const Packet16f& c) {
|
|
523
|
+
EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b, const Packet16f& c) {
|
|
343
524
|
return _mm512_fmadd_ps(a, b, c);
|
|
344
525
|
}
|
|
345
526
|
template <>
|
|
346
|
-
EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b,
|
|
347
|
-
const Packet8d& c) {
|
|
527
|
+
EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b, const Packet8d& c) {
|
|
348
528
|
return _mm512_fmadd_pd(a, b, c);
|
|
349
529
|
}
|
|
530
|
+
|
|
531
|
+
template <>
|
|
532
|
+
EIGEN_STRONG_INLINE Packet16f pmsub(const Packet16f& a, const Packet16f& b, const Packet16f& c) {
|
|
533
|
+
return _mm512_fmsub_ps(a, b, c);
|
|
534
|
+
}
|
|
535
|
+
template <>
|
|
536
|
+
EIGEN_STRONG_INLINE Packet8d pmsub(const Packet8d& a, const Packet8d& b, const Packet8d& c) {
|
|
537
|
+
return _mm512_fmsub_pd(a, b, c);
|
|
538
|
+
}
|
|
539
|
+
|
|
540
|
+
template <>
|
|
541
|
+
EIGEN_STRONG_INLINE Packet16f pnmadd(const Packet16f& a, const Packet16f& b, const Packet16f& c) {
|
|
542
|
+
return _mm512_fnmadd_ps(a, b, c);
|
|
543
|
+
}
|
|
544
|
+
template <>
|
|
545
|
+
EIGEN_STRONG_INLINE Packet8d pnmadd(const Packet8d& a, const Packet8d& b, const Packet8d& c) {
|
|
546
|
+
return _mm512_fnmadd_pd(a, b, c);
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
template <>
|
|
550
|
+
EIGEN_STRONG_INLINE Packet16f pnmsub(const Packet16f& a, const Packet16f& b, const Packet16f& c) {
|
|
551
|
+
return _mm512_fnmsub_ps(a, b, c);
|
|
552
|
+
}
|
|
553
|
+
template <>
|
|
554
|
+
EIGEN_STRONG_INLINE Packet8d pnmsub(const Packet8d& a, const Packet8d& b, const Packet8d& c) {
|
|
555
|
+
return _mm512_fnmsub_pd(a, b, c);
|
|
556
|
+
}
|
|
350
557
|
#endif
|
|
351
558
|
|
|
352
559
|
template <>
|
|
353
|
-
EIGEN_DEVICE_FUNC inline Packet16f pselect(const Packet16f& mask,
|
|
354
|
-
|
|
355
|
-
const Packet16f& b) {
|
|
356
|
-
__mmask16 mask16 = _mm512_cmp_epi32_mask(
|
|
357
|
-
_mm512_castps_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
|
|
560
|
+
EIGEN_DEVICE_FUNC inline Packet16f pselect(const Packet16f& mask, const Packet16f& a, const Packet16f& b) {
|
|
561
|
+
__mmask16 mask16 = _mm512_cmpeq_epi32_mask(_mm512_castps_si512(mask), _mm512_setzero_epi32());
|
|
358
562
|
return _mm512_mask_blend_ps(mask16, a, b);
|
|
359
563
|
}
|
|
360
564
|
|
|
361
565
|
template <>
|
|
362
|
-
EIGEN_DEVICE_FUNC inline
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
566
|
+
EIGEN_DEVICE_FUNC inline Packet16i pselect(const Packet16i& mask, const Packet16i& a, const Packet16i& b) {
|
|
567
|
+
__mmask16 mask16 = _mm512_cmpeq_epi32_mask(mask, _mm512_setzero_epi32());
|
|
568
|
+
return _mm512_mask_blend_epi32(mask16, a, b);
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
template <>
|
|
572
|
+
EIGEN_DEVICE_FUNC inline Packet8l pselect(const Packet8l& mask, const Packet8l& a, const Packet8l& b) {
|
|
573
|
+
__mmask8 mask8 = _mm512_cmpeq_epi64_mask(mask, _mm512_setzero_si512());
|
|
574
|
+
return _mm512_mask_blend_epi64(mask8, a, b);
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
template <>
|
|
578
|
+
EIGEN_DEVICE_FUNC inline Packet8d pselect(const Packet8d& mask, const Packet8d& a, const Packet8d& b) {
|
|
579
|
+
__mmask8 mask8 = _mm512_cmp_epi64_mask(_mm512_castpd_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
|
|
367
580
|
return _mm512_mask_blend_pd(mask8, a, b);
|
|
368
581
|
}
|
|
369
582
|
|
|
370
583
|
template <>
|
|
371
|
-
EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a,
|
|
372
|
-
const Packet16f& b) {
|
|
584
|
+
EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a, const Packet16f& b) {
|
|
373
585
|
// Arguments are reversed to match NaN propagation behavior of std::min.
|
|
374
586
|
return _mm512_min_ps(b, a);
|
|
375
587
|
}
|
|
376
588
|
template <>
|
|
377
|
-
EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a,
|
|
378
|
-
const Packet8d& b) {
|
|
589
|
+
EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a, const Packet8d& b) {
|
|
379
590
|
// Arguments are reversed to match NaN propagation behavior of std::min.
|
|
380
591
|
return _mm512_min_pd(b, a);
|
|
381
592
|
}
|
|
593
|
+
template <>
|
|
594
|
+
EIGEN_STRONG_INLINE Packet16i pmin<Packet16i>(const Packet16i& a, const Packet16i& b) {
|
|
595
|
+
return _mm512_min_epi32(b, a);
|
|
596
|
+
}
|
|
597
|
+
template <>
|
|
598
|
+
EIGEN_STRONG_INLINE Packet8l pmin<Packet8l>(const Packet8l& a, const Packet8l& b) {
|
|
599
|
+
return _mm512_min_epi64(b, a);
|
|
600
|
+
}
|
|
382
601
|
|
|
383
602
|
template <>
|
|
384
|
-
EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a,
|
|
385
|
-
const Packet16f& b) {
|
|
603
|
+
EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a, const Packet16f& b) {
|
|
386
604
|
// Arguments are reversed to match NaN propagation behavior of std::max.
|
|
387
605
|
return _mm512_max_ps(b, a);
|
|
388
606
|
}
|
|
389
607
|
template <>
|
|
390
|
-
EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
|
|
391
|
-
const Packet8d& b) {
|
|
608
|
+
EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a, const Packet8d& b) {
|
|
392
609
|
// Arguments are reversed to match NaN propagation behavior of std::max.
|
|
393
610
|
return _mm512_max_pd(b, a);
|
|
394
611
|
}
|
|
612
|
+
template <>
|
|
613
|
+
EIGEN_STRONG_INLINE Packet16i pmax<Packet16i>(const Packet16i& a, const Packet16i& b) {
|
|
614
|
+
return _mm512_max_epi32(b, a);
|
|
615
|
+
}
|
|
616
|
+
template <>
|
|
617
|
+
EIGEN_STRONG_INLINE Packet8l pmax<Packet8l>(const Packet8l& a, const Packet8l& b) {
|
|
618
|
+
return _mm512_max_epi64(b, a);
|
|
619
|
+
}
|
|
395
620
|
|
|
396
|
-
// Add specializations for min/max with prescribed NaN
|
|
397
|
-
template<>
|
|
621
|
+
// Add specializations for min/max with prescribed NaN propagation.
|
|
622
|
+
template <>
|
|
398
623
|
EIGEN_STRONG_INLINE Packet16f pmin<PropagateNumbers, Packet16f>(const Packet16f& a, const Packet16f& b) {
|
|
399
624
|
return pminmax_propagate_numbers(a, b, pmin<Packet16f>);
|
|
400
625
|
}
|
|
401
|
-
template<>
|
|
626
|
+
template <>
|
|
402
627
|
EIGEN_STRONG_INLINE Packet8d pmin<PropagateNumbers, Packet8d>(const Packet8d& a, const Packet8d& b) {
|
|
403
628
|
return pminmax_propagate_numbers(a, b, pmin<Packet8d>);
|
|
404
629
|
}
|
|
405
|
-
template<>
|
|
630
|
+
template <>
|
|
406
631
|
EIGEN_STRONG_INLINE Packet16f pmax<PropagateNumbers, Packet16f>(const Packet16f& a, const Packet16f& b) {
|
|
407
632
|
return pminmax_propagate_numbers(a, b, pmax<Packet16f>);
|
|
408
633
|
}
|
|
409
|
-
template<>
|
|
634
|
+
template <>
|
|
410
635
|
EIGEN_STRONG_INLINE Packet8d pmax<PropagateNumbers, Packet8d>(const Packet8d& a, const Packet8d& b) {
|
|
411
636
|
return pminmax_propagate_numbers(a, b, pmax<Packet8d>);
|
|
412
637
|
}
|
|
413
|
-
template<>
|
|
638
|
+
template <>
|
|
414
639
|
EIGEN_STRONG_INLINE Packet16f pmin<PropagateNaN, Packet16f>(const Packet16f& a, const Packet16f& b) {
|
|
415
640
|
return pminmax_propagate_nan(a, b, pmin<Packet16f>);
|
|
416
641
|
}
|
|
417
|
-
template<>
|
|
642
|
+
template <>
|
|
418
643
|
EIGEN_STRONG_INLINE Packet8d pmin<PropagateNaN, Packet8d>(const Packet8d& a, const Packet8d& b) {
|
|
419
644
|
return pminmax_propagate_nan(a, b, pmin<Packet8d>);
|
|
420
645
|
}
|
|
421
|
-
template<>
|
|
646
|
+
template <>
|
|
422
647
|
EIGEN_STRONG_INLINE Packet16f pmax<PropagateNaN, Packet16f>(const Packet16f& a, const Packet16f& b) {
|
|
423
648
|
return pminmax_propagate_nan(a, b, pmax<Packet16f>);
|
|
424
649
|
}
|
|
425
|
-
template<>
|
|
650
|
+
template <>
|
|
426
651
|
EIGEN_STRONG_INLINE Packet8d pmax<PropagateNaN, Packet8d>(const Packet8d& a, const Packet8d& b) {
|
|
427
652
|
return pminmax_propagate_nan(a, b, pmax<Packet8d>);
|
|
428
653
|
}
|
|
429
654
|
|
|
430
|
-
|
|
431
655
|
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
|
432
|
-
template<int I_>
|
|
433
|
-
|
|
434
|
-
|
|
656
|
+
template <int I_>
|
|
657
|
+
EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
|
|
658
|
+
return _mm512_extractf32x8_ps(x, I_);
|
|
659
|
+
}
|
|
660
|
+
template <int I_>
|
|
661
|
+
EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
|
|
662
|
+
return _mm512_extractf64x2_pd(x, I_);
|
|
663
|
+
}
|
|
664
|
+
EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
|
|
665
|
+
return _mm512_insertf32x8(_mm512_castps256_ps512(a), b, 1);
|
|
666
|
+
}
|
|
667
|
+
EIGEN_STRONG_INLINE Packet16i cat256i(Packet8i a, Packet8i b) {
|
|
668
|
+
return _mm512_inserti32x8(_mm512_castsi256_si512(a), b, 1);
|
|
669
|
+
}
|
|
435
670
|
#else
|
|
436
671
|
// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512
|
|
437
|
-
template<int I_>
|
|
438
|
-
|
|
672
|
+
template <int I_>
|
|
673
|
+
EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
|
|
674
|
+
return _mm256_castsi256_ps(_mm512_extracti64x4_epi64(_mm512_castps_si512(x), I_));
|
|
439
675
|
}
|
|
440
676
|
|
|
441
677
|
// AVX512F does not define _mm512_extractf64x2_pd to extract _m128 from _m512
|
|
442
|
-
template<int I_>
|
|
443
|
-
|
|
678
|
+
template <int I_>
|
|
679
|
+
EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
|
|
680
|
+
return _mm_castsi128_pd(_mm512_extracti32x4_epi32(_mm512_castpd_si512(x), I_));
|
|
444
681
|
}
|
|
445
682
|
|
|
446
683
|
EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
|
|
447
|
-
return _mm512_castsi512_ps(
|
|
448
|
-
|
|
684
|
+
return _mm512_castsi512_ps(
|
|
685
|
+
_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)), _mm256_castps_si256(b), 1));
|
|
686
|
+
}
|
|
687
|
+
EIGEN_STRONG_INLINE Packet16i cat256i(Packet8i a, Packet8i b) {
|
|
688
|
+
return _mm512_inserti64x4(_mm512_castsi256_si512(a), b, 1);
|
|
449
689
|
}
|
|
450
690
|
#endif
|
|
451
691
|
|
|
@@ -461,80 +701,137 @@ EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf) {
|
|
|
461
701
|
// dst[255:240] := Saturate16(rf[255:224])
|
|
462
702
|
__m256i lo = _mm256_castps_si256(extract256<0>(rf));
|
|
463
703
|
__m256i hi = _mm256_castps_si256(extract256<1>(rf));
|
|
464
|
-
__m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0),
|
|
465
|
-
|
|
466
|
-
__m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0),
|
|
467
|
-
_mm256_extractf128_si256(hi, 1));
|
|
704
|
+
__m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0), _mm256_extractf128_si256(lo, 1));
|
|
705
|
+
__m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0), _mm256_extractf128_si256(hi, 1));
|
|
468
706
|
return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1);
|
|
469
707
|
}
|
|
470
708
|
|
|
709
|
+
template <>
|
|
710
|
+
EIGEN_STRONG_INLINE Packet16f pisnan(const Packet16f& a) {
|
|
711
|
+
__mmask16 mask = _mm512_cmp_ps_mask(a, a, _CMP_UNORD_Q);
|
|
712
|
+
return _mm512_castsi512_ps(_mm512_maskz_set1_epi32(mask, int32_t(-1)));
|
|
713
|
+
}
|
|
714
|
+
|
|
471
715
|
template <>
|
|
472
716
|
EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
|
|
473
717
|
__mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
|
|
474
|
-
return _mm512_castsi512_ps(
|
|
475
|
-
_mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
|
|
718
|
+
return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1)));
|
|
476
719
|
}
|
|
477
|
-
template<>
|
|
720
|
+
template <>
|
|
721
|
+
EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
|
|
478
722
|
__mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ);
|
|
479
|
-
return _mm512_castsi512_ps(
|
|
480
|
-
_mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
|
|
723
|
+
return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1)));
|
|
481
724
|
}
|
|
482
725
|
|
|
483
|
-
template<>
|
|
726
|
+
template <>
|
|
727
|
+
EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
|
|
484
728
|
__mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);
|
|
485
|
-
return _mm512_castsi512_ps(
|
|
486
|
-
_mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
|
|
729
|
+
return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1)));
|
|
487
730
|
}
|
|
488
731
|
|
|
489
|
-
template<>
|
|
732
|
+
template <>
|
|
733
|
+
EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
|
|
490
734
|
__mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGE_UQ);
|
|
491
|
-
return _mm512_castsi512_ps(
|
|
492
|
-
_mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
|
|
735
|
+
return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1)));
|
|
493
736
|
}
|
|
494
737
|
|
|
495
|
-
template<>
|
|
496
|
-
|
|
497
|
-
|
|
738
|
+
template <>
|
|
739
|
+
EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) {
|
|
740
|
+
__mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ);
|
|
741
|
+
return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1));
|
|
742
|
+
}
|
|
743
|
+
template <>
|
|
744
|
+
EIGEN_STRONG_INLINE Packet16i pcmp_le(const Packet16i& a, const Packet16i& b) {
|
|
745
|
+
__mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LE);
|
|
746
|
+
return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1));
|
|
747
|
+
}
|
|
748
|
+
template <>
|
|
749
|
+
EIGEN_STRONG_INLINE Packet16i pcmp_lt(const Packet16i& a, const Packet16i& b) {
|
|
750
|
+
__mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT);
|
|
751
|
+
return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1));
|
|
498
752
|
}
|
|
499
753
|
|
|
754
|
+
template <>
|
|
755
|
+
EIGEN_STRONG_INLINE Packet8l pcmp_eq(const Packet8l& a, const Packet8l& b) {
|
|
756
|
+
__mmask8 mask = _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_EQ);
|
|
757
|
+
return _mm512_mask_set1_epi64(_mm512_setzero_si512(), mask, int64_t(-1));
|
|
758
|
+
}
|
|
759
|
+
template <>
|
|
760
|
+
EIGEN_STRONG_INLINE Packet8l pcmp_le(const Packet8l& a, const Packet8l& b) {
|
|
761
|
+
__mmask8 mask = _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_LE);
|
|
762
|
+
return _mm512_mask_set1_epi64(_mm512_setzero_si512(), mask, int64_t(-1));
|
|
763
|
+
}
|
|
764
|
+
template <>
|
|
765
|
+
EIGEN_STRONG_INLINE Packet8l pcmp_lt(const Packet8l& a, const Packet8l& b) {
|
|
766
|
+
__mmask8 mask = _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_LT);
|
|
767
|
+
return _mm512_mask_set1_epi64(_mm512_setzero_si512(), mask, int64_t(-1));
|
|
768
|
+
}
|
|
500
769
|
|
|
501
770
|
template <>
|
|
502
771
|
EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) {
|
|
503
772
|
__mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
|
|
504
|
-
return _mm512_castsi512_pd(
|
|
505
|
-
_mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
|
|
773
|
+
return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
|
|
506
774
|
}
|
|
507
775
|
template <>
|
|
508
776
|
EIGEN_STRONG_INLINE Packet8d pcmp_le(const Packet8d& a, const Packet8d& b) {
|
|
509
777
|
__mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LE_OQ);
|
|
510
|
-
return _mm512_castsi512_pd(
|
|
511
|
-
_mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
|
|
778
|
+
return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
|
|
512
779
|
}
|
|
513
780
|
template <>
|
|
514
781
|
EIGEN_STRONG_INLINE Packet8d pcmp_lt(const Packet8d& a, const Packet8d& b) {
|
|
515
782
|
__mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ);
|
|
516
|
-
return _mm512_castsi512_pd(
|
|
517
|
-
_mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
|
|
783
|
+
return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
|
|
518
784
|
}
|
|
519
785
|
template <>
|
|
520
786
|
EIGEN_STRONG_INLINE Packet8d pcmp_lt_or_nan(const Packet8d& a, const Packet8d& b) {
|
|
521
787
|
__mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGE_UQ);
|
|
522
|
-
return _mm512_castsi512_pd(
|
|
523
|
-
_mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
|
|
788
|
+
return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
|
|
524
789
|
}
|
|
525
790
|
|
|
526
|
-
template<>
|
|
527
|
-
|
|
791
|
+
template <>
|
|
792
|
+
EIGEN_STRONG_INLINE Packet16f print<Packet16f>(const Packet16f& a) {
|
|
793
|
+
return _mm512_roundscale_ps(a, _MM_FROUND_CUR_DIRECTION);
|
|
794
|
+
}
|
|
795
|
+
template <>
|
|
796
|
+
EIGEN_STRONG_INLINE Packet8d print<Packet8d>(const Packet8d& a) {
|
|
797
|
+
return _mm512_roundscale_pd(a, _MM_FROUND_CUR_DIRECTION);
|
|
798
|
+
}
|
|
528
799
|
|
|
529
|
-
template<>
|
|
530
|
-
|
|
800
|
+
template <>
|
|
801
|
+
EIGEN_STRONG_INLINE Packet16f pceil<Packet16f>(const Packet16f& a) {
|
|
802
|
+
return _mm512_roundscale_ps(a, _MM_FROUND_TO_POS_INF);
|
|
803
|
+
}
|
|
804
|
+
template <>
|
|
805
|
+
EIGEN_STRONG_INLINE Packet8d pceil<Packet8d>(const Packet8d& a) {
|
|
806
|
+
return _mm512_roundscale_pd(a, _MM_FROUND_TO_POS_INF);
|
|
807
|
+
}
|
|
531
808
|
|
|
532
|
-
template<>
|
|
533
|
-
|
|
809
|
+
template <>
|
|
810
|
+
EIGEN_STRONG_INLINE Packet16f pfloor<Packet16f>(const Packet16f& a) {
|
|
811
|
+
return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEG_INF);
|
|
812
|
+
}
|
|
813
|
+
template <>
|
|
814
|
+
EIGEN_STRONG_INLINE Packet8d pfloor<Packet8d>(const Packet8d& a) {
|
|
815
|
+
return _mm512_roundscale_pd(a, _MM_FROUND_TO_NEG_INF);
|
|
816
|
+
}
|
|
817
|
+
|
|
818
|
+
template <>
|
|
819
|
+
EIGEN_STRONG_INLINE Packet16f ptrunc<Packet16f>(const Packet16f& a) {
|
|
820
|
+
return _mm512_roundscale_ps(a, _MM_FROUND_TO_ZERO);
|
|
821
|
+
}
|
|
822
|
+
template <>
|
|
823
|
+
EIGEN_STRONG_INLINE Packet8d ptrunc<Packet8d>(const Packet8d& a) {
|
|
824
|
+
return _mm512_roundscale_pd(a, _MM_FROUND_TO_ZERO);
|
|
825
|
+
}
|
|
534
826
|
|
|
535
827
|
template <>
|
|
536
828
|
EIGEN_STRONG_INLINE Packet16i ptrue<Packet16i>(const Packet16i& /*a*/) {
|
|
537
|
-
return _mm512_set1_epi32(
|
|
829
|
+
return _mm512_set1_epi32(int32_t(-1));
|
|
830
|
+
}
|
|
831
|
+
|
|
832
|
+
template <>
|
|
833
|
+
EIGEN_STRONG_INLINE Packet8l ptrue<Packet8l>(const Packet8l& /*a*/) {
|
|
834
|
+
return _mm512_set1_epi64(int64_t(-1));
|
|
538
835
|
}
|
|
539
836
|
|
|
540
837
|
template <>
|
|
@@ -548,23 +845,25 @@ EIGEN_STRONG_INLINE Packet8d ptrue<Packet8d>(const Packet8d& a) {
|
|
|
548
845
|
}
|
|
549
846
|
|
|
550
847
|
template <>
|
|
551
|
-
EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a,
|
|
552
|
-
|
|
553
|
-
return _mm512_and_si512(a,b);
|
|
848
|
+
EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a, const Packet16i& b) {
|
|
849
|
+
return _mm512_and_si512(a, b);
|
|
554
850
|
}
|
|
555
851
|
|
|
556
852
|
template <>
|
|
557
|
-
EIGEN_STRONG_INLINE
|
|
558
|
-
|
|
853
|
+
EIGEN_STRONG_INLINE Packet8l pand<Packet8l>(const Packet8l& a, const Packet8l& b) {
|
|
854
|
+
return _mm512_and_si512(a, b);
|
|
855
|
+
}
|
|
856
|
+
|
|
857
|
+
template <>
|
|
858
|
+
EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a, const Packet16f& b) {
|
|
559
859
|
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
|
560
860
|
return _mm512_and_ps(a, b);
|
|
561
861
|
#else
|
|
562
|
-
return _mm512_castsi512_ps(pand(_mm512_castps_si512(a),_mm512_castps_si512(b)));
|
|
862
|
+
return _mm512_castsi512_ps(pand(_mm512_castps_si512(a), _mm512_castps_si512(b)));
|
|
563
863
|
#endif
|
|
564
864
|
}
|
|
565
865
|
template <>
|
|
566
|
-
EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a,
|
|
567
|
-
const Packet8d& b) {
|
|
866
|
+
EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a, const Packet8d& b) {
|
|
568
867
|
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
|
569
868
|
return _mm512_and_pd(a, b);
|
|
570
869
|
#else
|
|
@@ -584,22 +883,26 @@ EIGEN_STRONG_INLINE Packet16i por<Packet16i>(const Packet16i& a, const Packet16i
|
|
|
584
883
|
return _mm512_or_si512(a, b);
|
|
585
884
|
}
|
|
586
885
|
|
|
886
|
+
template <>
|
|
887
|
+
EIGEN_STRONG_INLINE Packet8l por<Packet8l>(const Packet8l& a, const Packet8l& b) {
|
|
888
|
+
return _mm512_or_si512(a, b);
|
|
889
|
+
}
|
|
890
|
+
|
|
587
891
|
template <>
|
|
588
892
|
EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a, const Packet16f& b) {
|
|
589
893
|
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
|
590
894
|
return _mm512_or_ps(a, b);
|
|
591
895
|
#else
|
|
592
|
-
return _mm512_castsi512_ps(por(_mm512_castps_si512(a),_mm512_castps_si512(b)));
|
|
896
|
+
return _mm512_castsi512_ps(por(_mm512_castps_si512(a), _mm512_castps_si512(b)));
|
|
593
897
|
#endif
|
|
594
898
|
}
|
|
595
899
|
|
|
596
900
|
template <>
|
|
597
|
-
EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a,
|
|
598
|
-
const Packet8d& b) {
|
|
901
|
+
EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a, const Packet8d& b) {
|
|
599
902
|
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
|
600
903
|
return _mm512_or_pd(a, b);
|
|
601
904
|
#else
|
|
602
|
-
return _mm512_castsi512_pd(por(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
|
|
905
|
+
return _mm512_castsi512_pd(por(_mm512_castpd_si512(a), _mm512_castpd_si512(b)));
|
|
603
906
|
#endif
|
|
604
907
|
}
|
|
605
908
|
|
|
@@ -608,12 +911,17 @@ EIGEN_STRONG_INLINE Packet16i pxor<Packet16i>(const Packet16i& a, const Packet16
|
|
|
608
911
|
return _mm512_xor_si512(a, b);
|
|
609
912
|
}
|
|
610
913
|
|
|
914
|
+
template <>
|
|
915
|
+
EIGEN_STRONG_INLINE Packet8l pxor<Packet8l>(const Packet8l& a, const Packet8l& b) {
|
|
916
|
+
return _mm512_xor_si512(a, b);
|
|
917
|
+
}
|
|
918
|
+
|
|
611
919
|
template <>
|
|
612
920
|
EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a, const Packet16f& b) {
|
|
613
921
|
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
|
614
922
|
return _mm512_xor_ps(a, b);
|
|
615
923
|
#else
|
|
616
|
-
return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a),_mm512_castps_si512(b)));
|
|
924
|
+
return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a), _mm512_castps_si512(b)));
|
|
617
925
|
#endif
|
|
618
926
|
}
|
|
619
927
|
|
|
@@ -622,7 +930,7 @@ EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a, const Packet8d& b
|
|
|
622
930
|
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
|
623
931
|
return _mm512_xor_pd(a, b);
|
|
624
932
|
#else
|
|
625
|
-
return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
|
|
933
|
+
return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a), _mm512_castpd_si512(b)));
|
|
626
934
|
#endif
|
|
627
935
|
}
|
|
628
936
|
|
|
@@ -631,50 +939,73 @@ EIGEN_STRONG_INLINE Packet16i pandnot<Packet16i>(const Packet16i& a, const Packe
|
|
|
631
939
|
return _mm512_andnot_si512(b, a);
|
|
632
940
|
}
|
|
633
941
|
|
|
942
|
+
template <>
|
|
943
|
+
EIGEN_STRONG_INLINE Packet8l pandnot<Packet8l>(const Packet8l& a, const Packet8l& b) {
|
|
944
|
+
return _mm512_andnot_si512(b, a);
|
|
945
|
+
}
|
|
946
|
+
|
|
634
947
|
template <>
|
|
635
948
|
EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a, const Packet16f& b) {
|
|
636
949
|
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
|
637
950
|
return _mm512_andnot_ps(b, a);
|
|
638
951
|
#else
|
|
639
|
-
return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a),_mm512_castps_si512(b)));
|
|
952
|
+
return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a), _mm512_castps_si512(b)));
|
|
640
953
|
#endif
|
|
641
954
|
}
|
|
642
955
|
template <>
|
|
643
|
-
EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,const Packet8d& b) {
|
|
956
|
+
EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a, const Packet8d& b) {
|
|
644
957
|
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
|
645
958
|
return _mm512_andnot_pd(b, a);
|
|
646
959
|
#else
|
|
647
|
-
return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
|
|
960
|
+
return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a), _mm512_castpd_si512(b)));
|
|
648
961
|
#endif
|
|
649
962
|
}
|
|
650
963
|
|
|
651
|
-
template<>
|
|
652
|
-
{
|
|
964
|
+
template <>
|
|
965
|
+
EIGEN_STRONG_INLINE Packet16f pround<Packet16f>(const Packet16f& a) {
|
|
653
966
|
// Work-around for default std::round rounding mode.
|
|
654
967
|
const Packet16f mask = pset1frombits<Packet16f>(static_cast<numext::uint32_t>(0x80000000u));
|
|
655
968
|
const Packet16f prev0dot5 = pset1frombits<Packet16f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
|
|
656
969
|
return _mm512_roundscale_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
|
|
657
970
|
}
|
|
658
|
-
template<>
|
|
659
|
-
{
|
|
971
|
+
template <>
|
|
972
|
+
EIGEN_STRONG_INLINE Packet8d pround<Packet8d>(const Packet8d& a) {
|
|
660
973
|
// Work-around for default std::round rounding mode.
|
|
661
974
|
const Packet8d mask = pset1frombits<Packet8d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
|
|
662
975
|
const Packet8d prev0dot5 = pset1frombits<Packet8d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
|
|
663
976
|
return _mm512_roundscale_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
|
|
664
977
|
}
|
|
665
978
|
|
|
666
|
-
template<int N>
|
|
979
|
+
template <int N>
|
|
980
|
+
EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) {
|
|
667
981
|
return _mm512_srai_epi32(a, N);
|
|
668
982
|
}
|
|
669
983
|
|
|
670
|
-
template<int N>
|
|
984
|
+
template <int N>
|
|
985
|
+
EIGEN_STRONG_INLINE Packet16i plogical_shift_right(Packet16i a) {
|
|
671
986
|
return _mm512_srli_epi32(a, N);
|
|
672
987
|
}
|
|
673
988
|
|
|
674
|
-
template<int N>
|
|
989
|
+
template <int N>
|
|
990
|
+
EIGEN_STRONG_INLINE Packet16i plogical_shift_left(Packet16i a) {
|
|
675
991
|
return _mm512_slli_epi32(a, N);
|
|
676
992
|
}
|
|
677
993
|
|
|
994
|
+
template <int N>
|
|
995
|
+
EIGEN_STRONG_INLINE Packet8l parithmetic_shift_right(Packet8l a) {
|
|
996
|
+
return _mm512_srai_epi64(a, N);
|
|
997
|
+
}
|
|
998
|
+
|
|
999
|
+
template <int N>
|
|
1000
|
+
EIGEN_STRONG_INLINE Packet8l plogical_shift_right(Packet8l a) {
|
|
1001
|
+
return _mm512_srli_epi64(a, N);
|
|
1002
|
+
}
|
|
1003
|
+
|
|
1004
|
+
template <int N>
|
|
1005
|
+
EIGEN_STRONG_INLINE Packet8l plogical_shift_left(Packet8l a) {
|
|
1006
|
+
return _mm512_slli_epi64(a, N);
|
|
1007
|
+
}
|
|
1008
|
+
|
|
678
1009
|
template <>
|
|
679
1010
|
EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
|
|
680
1011
|
EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from);
|
|
@@ -685,8 +1016,11 @@ EIGEN_STRONG_INLINE Packet8d pload<Packet8d>(const double* from) {
|
|
|
685
1016
|
}
|
|
686
1017
|
template <>
|
|
687
1018
|
EIGEN_STRONG_INLINE Packet16i pload<Packet16i>(const int* from) {
|
|
688
|
-
EIGEN_DEBUG_ALIGNED_LOAD return
|
|
689
|
-
|
|
1019
|
+
EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_epi64(from);
|
|
1020
|
+
}
|
|
1021
|
+
template <>
|
|
1022
|
+
EIGEN_STRONG_INLINE Packet8l pload<Packet8l>(const int64_t* from) {
|
|
1023
|
+
EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_epi64(from);
|
|
690
1024
|
}
|
|
691
1025
|
|
|
692
1026
|
template <>
|
|
@@ -699,8 +1033,11 @@ EIGEN_STRONG_INLINE Packet8d ploadu<Packet8d>(const double* from) {
|
|
|
699
1033
|
}
|
|
700
1034
|
template <>
|
|
701
1035
|
EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) {
|
|
702
|
-
EIGEN_DEBUG_UNALIGNED_LOAD return
|
|
703
|
-
|
|
1036
|
+
EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_epi32(from);
|
|
1037
|
+
}
|
|
1038
|
+
template <>
|
|
1039
|
+
EIGEN_STRONG_INLINE Packet8l ploadu<Packet8l>(const int64_t* from) {
|
|
1040
|
+
EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_epi64(from);
|
|
704
1041
|
}
|
|
705
1042
|
|
|
706
1043
|
template <>
|
|
@@ -708,6 +1045,11 @@ EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from, uint16_t umas
|
|
|
708
1045
|
__mmask16 mask = static_cast<__mmask16>(umask);
|
|
709
1046
|
EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_maskz_loadu_ps(mask, from);
|
|
710
1047
|
}
|
|
1048
|
+
template <>
|
|
1049
|
+
EIGEN_STRONG_INLINE Packet8d ploadu<Packet8d>(const double* from, uint8_t umask) {
|
|
1050
|
+
__mmask8 mask = static_cast<__mmask8>(umask);
|
|
1051
|
+
EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_maskz_loadu_pd(mask, from);
|
|
1052
|
+
}
|
|
711
1053
|
|
|
712
1054
|
// Loads 8 floats from memory a returns the packet
|
|
713
1055
|
// {a0, a0 a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
|
|
@@ -715,43 +1057,46 @@ template <>
|
|
|
715
1057
|
EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
|
|
716
1058
|
// an unaligned load is required here as there is no requirement
|
|
717
1059
|
// on the alignment of input pointer 'from'
|
|
718
|
-
__m256i low_half =
|
|
1060
|
+
__m256i low_half = _mm256_castps_si256(_mm256_loadu_ps(from));
|
|
719
1061
|
__m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half));
|
|
720
1062
|
__m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
|
|
721
1063
|
return pairs;
|
|
722
1064
|
}
|
|
723
1065
|
|
|
724
|
-
|
|
725
|
-
// FIXME: this does not look optimal, better load a Packet4d and shuffle...
|
|
726
|
-
// Loads 4 doubles from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3,
|
|
1066
|
+
// Loads 4 doubles from memory a returns the packet {a0, a0, a1, a1, a2, a2, a3,
|
|
727
1067
|
// a3}
|
|
728
1068
|
template <>
|
|
729
1069
|
EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[2]), 2);
|
|
734
|
-
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3);
|
|
735
|
-
return x;
|
|
1070
|
+
Packet8d tmp = _mm512_castpd256_pd512(ploadu<Packet4d>(from));
|
|
1071
|
+
const Packet8l scatter_mask = _mm512_set_epi64(3, 3, 2, 2, 1, 1, 0, 0);
|
|
1072
|
+
return _mm512_permutexvar_pd(scatter_mask, tmp);
|
|
736
1073
|
}
|
|
737
|
-
|
|
1074
|
+
|
|
1075
|
+
// Loads 4 int64_t from memory a returns the packet {a0, a0, a1, a1, a2, a2, a3,
|
|
1076
|
+
// a3}
|
|
738
1077
|
template <>
|
|
739
|
-
EIGEN_STRONG_INLINE
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
1078
|
+
EIGEN_STRONG_INLINE Packet8l ploaddup<Packet8l>(const int64_t* from) {
|
|
1079
|
+
Packet8l tmp = _mm512_castsi256_si512(ploadu<Packet4l>(from));
|
|
1080
|
+
const Packet8l scatter_mask = _mm512_set_epi64(3, 3, 2, 2, 1, 1, 0, 0);
|
|
1081
|
+
return _mm512_permutexvar_epi64(scatter_mask, tmp);
|
|
1082
|
+
}
|
|
1083
|
+
|
|
1084
|
+
// Loads 8 integers from memory and returns the packet
|
|
1085
|
+
// {a0, a0 a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
|
|
1086
|
+
template <>
|
|
1087
|
+
EIGEN_STRONG_INLINE Packet16i ploaddup<Packet16i>(const int* from) {
|
|
1088
|
+
__m256i low_half = _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
|
|
1089
|
+
__m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half));
|
|
1090
|
+
__m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
|
|
1091
|
+
return _mm512_castps_si512(pairs);
|
|
746
1092
|
}
|
|
747
|
-
#endif
|
|
748
1093
|
|
|
749
1094
|
// Loads 4 floats from memory a returns the packet
|
|
750
1095
|
// {a0, a0 a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
|
|
751
1096
|
template <>
|
|
752
1097
|
EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
|
|
753
1098
|
Packet16f tmp = _mm512_castps128_ps512(ploadu<Packet4f>(from));
|
|
754
|
-
const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
|
|
1099
|
+
const Packet16i scatter_mask = _mm512_set_epi32(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
|
|
755
1100
|
return _mm512_permutexvar_ps(scatter_mask, tmp);
|
|
756
1101
|
}
|
|
757
1102
|
|
|
@@ -760,12 +1105,32 @@ EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
|
|
|
760
1105
|
template <>
|
|
761
1106
|
EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
|
|
762
1107
|
__m256d lane0 = _mm256_set1_pd(*from);
|
|
763
|
-
__m256d lane1 = _mm256_set1_pd(*(from+1));
|
|
1108
|
+
__m256d lane1 = _mm256_set1_pd(*(from + 1));
|
|
764
1109
|
__m512d tmp = _mm512_undefined_pd();
|
|
765
1110
|
tmp = _mm512_insertf64x4(tmp, lane0, 0);
|
|
766
1111
|
return _mm512_insertf64x4(tmp, lane1, 1);
|
|
767
1112
|
}
|
|
768
1113
|
|
|
1114
|
+
// Loads 2 int64_t from memory a returns the packet
|
|
1115
|
+
// {a0, a0 a0, a0, a1, a1, a1, a1}
|
|
1116
|
+
template <>
|
|
1117
|
+
EIGEN_STRONG_INLINE Packet8l ploadquad<Packet8l>(const int64_t* from) {
|
|
1118
|
+
__m256i lane0 = _mm256_set1_epi64x(*from);
|
|
1119
|
+
__m256i lane1 = _mm256_set1_epi64x(*(from + 1));
|
|
1120
|
+
__m512i tmp = _mm512_undefined_epi32();
|
|
1121
|
+
tmp = _mm512_inserti64x4(tmp, lane0, 0);
|
|
1122
|
+
return _mm512_inserti64x4(tmp, lane1, 1);
|
|
1123
|
+
}
|
|
1124
|
+
|
|
1125
|
+
// Loads 4 integers from memory and returns the packet
|
|
1126
|
+
// {a0, a0 a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
|
|
1127
|
+
template <>
|
|
1128
|
+
EIGEN_STRONG_INLINE Packet16i ploadquad<Packet16i>(const int* from) {
|
|
1129
|
+
Packet16i tmp = _mm512_castsi128_si512(ploadu<Packet4i>(from));
|
|
1130
|
+
const Packet16i scatter_mask = _mm512_set_epi32(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
|
|
1131
|
+
return _mm512_permutexvar_epi32(scatter_mask, tmp);
|
|
1132
|
+
}
|
|
1133
|
+
|
|
769
1134
|
template <>
|
|
770
1135
|
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
|
|
771
1136
|
EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ps(to, from);
|
|
@@ -776,8 +1141,11 @@ EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet8d& from) {
|
|
|
776
1141
|
}
|
|
777
1142
|
template <>
|
|
778
1143
|
EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet16i& from) {
|
|
779
|
-
EIGEN_DEBUG_ALIGNED_STORE
|
|
780
|
-
|
|
1144
|
+
EIGEN_DEBUG_ALIGNED_STORE _mm512_store_epi32(to, from);
|
|
1145
|
+
}
|
|
1146
|
+
template <>
|
|
1147
|
+
EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet8l& from) {
|
|
1148
|
+
EIGEN_DEBUG_ALIGNED_STORE _mm512_store_epi64(to, from);
|
|
781
1149
|
}
|
|
782
1150
|
|
|
783
1151
|
template <>
|
|
@@ -790,54 +1158,128 @@ EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet8d& from) {
|
|
|
790
1158
|
}
|
|
791
1159
|
template <>
|
|
792
1160
|
EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
|
|
793
|
-
EIGEN_DEBUG_UNALIGNED_STORE
|
|
794
|
-
|
|
1161
|
+
EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_epi32(to, from);
|
|
1162
|
+
}
|
|
1163
|
+
template <>
|
|
1164
|
+
EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet8l& from) {
|
|
1165
|
+
EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_epi64(to, from);
|
|
795
1166
|
}
|
|
796
1167
|
template <>
|
|
797
1168
|
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from, uint16_t umask) {
|
|
798
1169
|
__mmask16 mask = static_cast<__mmask16>(umask);
|
|
799
1170
|
EIGEN_DEBUG_UNALIGNED_STORE return _mm512_mask_storeu_ps(to, mask, from);
|
|
800
1171
|
}
|
|
1172
|
+
template <>
|
|
1173
|
+
EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet8d& from, uint8_t umask) {
|
|
1174
|
+
__mmask8 mask = static_cast<__mmask8>(umask);
|
|
1175
|
+
EIGEN_DEBUG_UNALIGNED_STORE return _mm512_mask_storeu_pd(to, mask, from);
|
|
1176
|
+
}
|
|
1177
|
+
|
|
1178
|
+
template <typename Scalar, typename Packet>
|
|
1179
|
+
EIGEN_DEVICE_FUNC inline Packet pgather(const Packet& src, const Scalar* from, Index stride,
|
|
1180
|
+
typename unpacket_traits<Packet>::mask_t umask);
|
|
1181
|
+
template <>
|
|
1182
|
+
EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const Packet16f& src, const float* from, Index stride,
|
|
1183
|
+
uint16_t umask) {
|
|
1184
|
+
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
|
|
1185
|
+
Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
|
1186
|
+
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
|
|
1187
|
+
__mmask16 mask = static_cast<__mmask16>(umask);
|
|
1188
|
+
|
|
1189
|
+
return _mm512_mask_i32gather_ps(src, mask, indices, from, 4);
|
|
1190
|
+
}
|
|
1191
|
+
template <>
|
|
1192
|
+
EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const Packet8d& src, const double* from, Index stride,
|
|
1193
|
+
uint8_t umask) {
|
|
1194
|
+
Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
|
|
1195
|
+
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
|
1196
|
+
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
|
|
1197
|
+
__mmask8 mask = static_cast<__mmask8>(umask);
|
|
1198
|
+
|
|
1199
|
+
return _mm512_mask_i32gather_pd(src, mask, indices, from, 8);
|
|
1200
|
+
}
|
|
801
1201
|
|
|
802
1202
|
template <>
|
|
803
|
-
EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
|
|
804
|
-
Index stride) {
|
|
1203
|
+
EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from, Index stride) {
|
|
805
1204
|
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
|
|
806
|
-
Packet16i stride_multiplier =
|
|
807
|
-
_mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
|
1205
|
+
Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
|
808
1206
|
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
|
|
809
1207
|
|
|
810
1208
|
return _mm512_i32gather_ps(indices, from, 4);
|
|
811
1209
|
}
|
|
812
1210
|
template <>
|
|
813
|
-
EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
|
|
814
|
-
Index stride) {
|
|
1211
|
+
EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from, Index stride) {
|
|
815
1212
|
Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
|
|
816
1213
|
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
|
817
1214
|
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
|
|
818
1215
|
|
|
819
1216
|
return _mm512_i32gather_pd(indices, from, 8);
|
|
820
1217
|
}
|
|
1218
|
+
template <>
|
|
1219
|
+
EIGEN_DEVICE_FUNC inline Packet8l pgather<int64_t, Packet8l>(const int64_t* from, Index stride) {
|
|
1220
|
+
Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
|
|
1221
|
+
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
|
1222
|
+
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
|
|
1223
|
+
|
|
1224
|
+
return _mm512_i32gather_epi64(indices, from, 8);
|
|
1225
|
+
}
|
|
1226
|
+
template <>
|
|
1227
|
+
EIGEN_DEVICE_FUNC inline Packet16i pgather<int, Packet16i>(const int* from, Index stride) {
|
|
1228
|
+
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
|
|
1229
|
+
Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
|
1230
|
+
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
|
|
1231
|
+
return _mm512_i32gather_epi32(indices, from, 4);
|
|
1232
|
+
}
|
|
821
1233
|
|
|
1234
|
+
template <typename Scalar, typename Packet>
|
|
1235
|
+
EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index stride,
|
|
1236
|
+
typename unpacket_traits<Packet>::mask_t umask);
|
|
1237
|
+
template <>
|
|
1238
|
+
EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride,
|
|
1239
|
+
uint16_t umask) {
|
|
1240
|
+
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
|
|
1241
|
+
Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
|
1242
|
+
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
|
|
1243
|
+
__mmask16 mask = static_cast<__mmask16>(umask);
|
|
1244
|
+
_mm512_mask_i32scatter_ps(to, mask, indices, from, 4);
|
|
1245
|
+
}
|
|
1246
|
+
template <>
|
|
1247
|
+
EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to, const Packet8d& from, Index stride,
|
|
1248
|
+
uint8_t umask) {
|
|
1249
|
+
Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
|
|
1250
|
+
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
|
1251
|
+
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
|
|
1252
|
+
__mmask8 mask = static_cast<__mmask8>(umask);
|
|
1253
|
+
_mm512_mask_i32scatter_pd(to, mask, indices, from, 8);
|
|
1254
|
+
}
|
|
822
1255
|
template <>
|
|
823
|
-
EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
|
|
824
|
-
const Packet16f& from,
|
|
825
|
-
Index stride) {
|
|
1256
|
+
EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride) {
|
|
826
1257
|
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
|
|
827
|
-
Packet16i stride_multiplier =
|
|
828
|
-
_mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
|
1258
|
+
Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
|
829
1259
|
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
|
|
830
1260
|
_mm512_i32scatter_ps(to, indices, from, 4);
|
|
831
1261
|
}
|
|
832
1262
|
template <>
|
|
833
|
-
EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
|
|
834
|
-
const Packet8d& from,
|
|
835
|
-
Index stride) {
|
|
1263
|
+
EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to, const Packet8d& from, Index stride) {
|
|
836
1264
|
Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
|
|
837
1265
|
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
|
838
1266
|
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
|
|
839
1267
|
_mm512_i32scatter_pd(to, indices, from, 8);
|
|
840
1268
|
}
|
|
1269
|
+
template <>
|
|
1270
|
+
EIGEN_DEVICE_FUNC inline void pscatter<int64_t, Packet8l>(int64_t* to, const Packet8l& from, Index stride) {
|
|
1271
|
+
Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
|
|
1272
|
+
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
|
1273
|
+
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
|
|
1274
|
+
_mm512_i32scatter_epi64(to, indices, from, 8);
|
|
1275
|
+
}
|
|
1276
|
+
template <>
|
|
1277
|
+
EIGEN_DEVICE_FUNC inline void pscatter<int, Packet16i>(int* to, const Packet16i& from, Index stride) {
|
|
1278
|
+
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
|
|
1279
|
+
Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
|
1280
|
+
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
|
|
1281
|
+
_mm512_i32scatter_epi32(to, indices, from, 4);
|
|
1282
|
+
}
|
|
841
1283
|
|
|
842
1284
|
template <>
|
|
843
1285
|
EIGEN_STRONG_INLINE void pstore1<Packet16f>(float* to, const float& a) {
|
|
@@ -854,81 +1296,142 @@ EIGEN_STRONG_INLINE void pstore1<Packet16i>(int* to, const int& a) {
|
|
|
854
1296
|
Packet16i pa = pset1<Packet16i>(a);
|
|
855
1297
|
pstore(to, pa);
|
|
856
1298
|
}
|
|
1299
|
+
template <>
|
|
1300
|
+
EIGEN_STRONG_INLINE void pstore1<Packet8l>(int64_t* to, const int64_t& a) {
|
|
1301
|
+
Packet8l pa = pset1<Packet8l>(a);
|
|
1302
|
+
pstore(to, pa);
|
|
1303
|
+
}
|
|
857
1304
|
|
|
858
|
-
template<>
|
|
859
|
-
|
|
860
|
-
|
|
1305
|
+
template <>
|
|
1306
|
+
EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
|
|
1307
|
+
_mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
|
|
1308
|
+
}
|
|
1309
|
+
template <>
|
|
1310
|
+
EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
|
|
1311
|
+
_mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
|
|
1312
|
+
}
|
|
1313
|
+
template <>
|
|
1314
|
+
EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
|
|
1315
|
+
_mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
|
|
1316
|
+
}
|
|
861
1317
|
|
|
862
1318
|
template <>
|
|
863
1319
|
EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
|
|
864
|
-
return
|
|
1320
|
+
return _mm512_cvtss_f32(a);
|
|
865
1321
|
}
|
|
866
1322
|
template <>
|
|
867
1323
|
EIGEN_STRONG_INLINE double pfirst<Packet8d>(const Packet8d& a) {
|
|
868
|
-
return
|
|
1324
|
+
return _mm512_cvtsd_f64(a);
|
|
1325
|
+
}
|
|
1326
|
+
template <>
|
|
1327
|
+
EIGEN_STRONG_INLINE int64_t pfirst<Packet8l>(const Packet8l& a) {
|
|
1328
|
+
int64_t x = _mm_extract_epi64_0(_mm512_extracti32x4_epi32(a, 0));
|
|
1329
|
+
return x;
|
|
869
1330
|
}
|
|
870
1331
|
template <>
|
|
871
1332
|
EIGEN_STRONG_INLINE int pfirst<Packet16i>(const Packet16i& a) {
|
|
872
|
-
|
|
1333
|
+
#if EIGEN_GNUC_STRICT_LESS_THAN(11, 0, 0)
|
|
1334
|
+
return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
|
|
1335
|
+
#else
|
|
1336
|
+
return _mm512_cvtsi512_si32(a);
|
|
1337
|
+
#endif
|
|
873
1338
|
}
|
|
874
1339
|
|
|
875
|
-
template<>
|
|
876
|
-
{
|
|
1340
|
+
template <>
|
|
1341
|
+
EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a) {
|
|
877
1342
|
return _mm512_permutexvar_ps(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
|
|
878
1343
|
}
|
|
879
1344
|
|
|
880
|
-
template<>
|
|
881
|
-
{
|
|
1345
|
+
template <>
|
|
1346
|
+
EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a) {
|
|
882
1347
|
return _mm512_permutexvar_pd(_mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), a);
|
|
883
1348
|
}
|
|
884
1349
|
|
|
885
|
-
template<>
|
|
886
|
-
{
|
|
1350
|
+
template <>
|
|
1351
|
+
EIGEN_STRONG_INLINE Packet16i preverse(const Packet16i& a) {
|
|
1352
|
+
return _mm512_permutexvar_epi32(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
|
|
1353
|
+
}
|
|
1354
|
+
|
|
1355
|
+
template <>
|
|
1356
|
+
EIGEN_STRONG_INLINE Packet8l preverse(const Packet8l& a) {
|
|
1357
|
+
return _mm512_permutexvar_epi64(_mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7), a);
|
|
1358
|
+
}
|
|
1359
|
+
|
|
1360
|
+
template <>
|
|
1361
|
+
EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) {
|
|
887
1362
|
// _mm512_abs_ps intrinsic not found, so hack around it
|
|
888
1363
|
return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff)));
|
|
889
1364
|
}
|
|
890
1365
|
template <>
|
|
891
1366
|
EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
|
|
892
1367
|
// _mm512_abs_ps intrinsic not found, so hack around it
|
|
893
|
-
return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a),
|
|
894
|
-
|
|
1368
|
+
return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a), _mm512_set1_epi64(0x7fffffffffffffff)));
|
|
1369
|
+
}
|
|
1370
|
+
template <>
|
|
1371
|
+
EIGEN_STRONG_INLINE Packet16i pabs(const Packet16i& a) {
|
|
1372
|
+
return _mm512_abs_epi32(a);
|
|
1373
|
+
}
|
|
1374
|
+
template <>
|
|
1375
|
+
EIGEN_STRONG_INLINE Packet8l pabs(const Packet8l& a) {
|
|
1376
|
+
return _mm512_abs_epi64(a);
|
|
1377
|
+
}
|
|
1378
|
+
|
|
1379
|
+
#ifndef EIGEN_VECTORIZE_AVX512FP16
|
|
1380
|
+
template <>
|
|
1381
|
+
EIGEN_STRONG_INLINE Packet16h psignbit(const Packet16h& a) {
|
|
1382
|
+
return _mm256_srai_epi16(a, 15);
|
|
1383
|
+
}
|
|
1384
|
+
#endif // EIGEN_VECTORIZE_AVX512FP16
|
|
1385
|
+
|
|
1386
|
+
template <>
|
|
1387
|
+
EIGEN_STRONG_INLINE Packet16bf psignbit(const Packet16bf& a) {
|
|
1388
|
+
return _mm256_srai_epi16(a, 15);
|
|
1389
|
+
}
|
|
1390
|
+
template <>
|
|
1391
|
+
EIGEN_STRONG_INLINE Packet16f psignbit(const Packet16f& a) {
|
|
1392
|
+
return _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(a), 31));
|
|
1393
|
+
}
|
|
1394
|
+
template <>
|
|
1395
|
+
EIGEN_STRONG_INLINE Packet8d psignbit(const Packet8d& a) {
|
|
1396
|
+
return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), 63));
|
|
895
1397
|
}
|
|
896
1398
|
|
|
897
|
-
template<>
|
|
898
|
-
EIGEN_STRONG_INLINE Packet16f pfrexp<Packet16f>(const Packet16f& a, Packet16f& exponent){
|
|
1399
|
+
template <>
|
|
1400
|
+
EIGEN_STRONG_INLINE Packet16f pfrexp<Packet16f>(const Packet16f& a, Packet16f& exponent) {
|
|
899
1401
|
return pfrexp_generic(a, exponent);
|
|
900
1402
|
}
|
|
901
1403
|
|
|
902
1404
|
// Extract exponent without existence of Packet8l.
|
|
903
|
-
template<>
|
|
904
|
-
EIGEN_STRONG_INLINE
|
|
905
|
-
Packet8d
|
|
906
|
-
|
|
907
|
-
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
|
1405
|
+
template <>
|
|
1406
|
+
EIGEN_STRONG_INLINE Packet8d pfrexp_generic_get_biased_exponent(const Packet8d& a) {
|
|
1407
|
+
const Packet8d cst_exp_mask = pset1frombits<Packet8d>(static_cast<uint64_t>(0x7ff0000000000000ull));
|
|
1408
|
+
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
|
908
1409
|
return _mm512_cvtepi64_pd(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52));
|
|
909
|
-
|
|
1410
|
+
#else
|
|
910
1411
|
return _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52)));
|
|
911
|
-
|
|
1412
|
+
#endif
|
|
912
1413
|
}
|
|
913
1414
|
|
|
914
|
-
template<>
|
|
1415
|
+
template <>
|
|
915
1416
|
EIGEN_STRONG_INLINE Packet8d pfrexp<Packet8d>(const Packet8d& a, Packet8d& exponent) {
|
|
916
1417
|
return pfrexp_generic(a, exponent);
|
|
917
1418
|
}
|
|
918
1419
|
|
|
919
|
-
template<>
|
|
1420
|
+
template <>
|
|
1421
|
+
EIGEN_STRONG_INLINE Packet16f pldexp<Packet16f>(const Packet16f& a, const Packet16f& exponent) {
|
|
920
1422
|
return pldexp_generic(a, exponent);
|
|
921
1423
|
}
|
|
922
1424
|
|
|
923
|
-
template<>
|
|
1425
|
+
template <>
|
|
1426
|
+
EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d& exponent) {
|
|
924
1427
|
// Clamp exponent to [-2099, 2099]
|
|
925
1428
|
const Packet8d max_exponent = pset1<Packet8d>(2099.0);
|
|
926
1429
|
const Packet8i e = _mm512_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
|
|
927
|
-
|
|
1430
|
+
|
|
928
1431
|
// Split 2^e into four factors and multiply.
|
|
929
1432
|
const Packet8i bias = pset1<Packet8i>(1023);
|
|
930
1433
|
Packet8i b = parithmetic_shift_right<2>(e); // floor(e/4)
|
|
931
|
-
|
|
1434
|
+
|
|
932
1435
|
// 2^b
|
|
933
1436
|
const Packet8i permute_idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
|
|
934
1437
|
Packet8i hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx);
|
|
@@ -936,7 +1439,7 @@ template<> EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, cons
|
|
|
936
1439
|
hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
|
|
937
1440
|
Packet8d c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
|
|
938
1441
|
Packet8d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
|
|
939
|
-
|
|
1442
|
+
|
|
940
1443
|
// 2^(e - 3b)
|
|
941
1444
|
b = psub(psub(psub(e, b), b), b); // e - 3b
|
|
942
1445
|
hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx);
|
|
@@ -949,57 +1452,49 @@ template<> EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, cons
|
|
|
949
1452
|
|
|
950
1453
|
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
|
951
1454
|
// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512
|
|
952
|
-
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)
|
|
953
|
-
__m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0);
|
|
1455
|
+
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
|
|
1456
|
+
__m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \
|
|
954
1457
|
__m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1)
|
|
1458
|
+
|
|
1459
|
+
// AVX512F does not define _mm512_extracti32x8_epi32 to extract _m256i from _m512i
|
|
1460
|
+
#define EIGEN_EXTRACT_8i_FROM_16i(INPUT, OUTPUT) \
|
|
1461
|
+
__m256i OUTPUT##_0 = _mm512_extracti32x8_epi32(INPUT, 0); \
|
|
1462
|
+
__m256i OUTPUT##_1 = _mm512_extracti32x8_epi32(INPUT, 1)
|
|
955
1463
|
#else
|
|
956
|
-
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)
|
|
957
|
-
__m256 OUTPUT##_0 = _mm256_insertf128_ps(
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
1464
|
+
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
|
|
1465
|
+
__m256 OUTPUT##_0 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)), \
|
|
1466
|
+
_mm512_extractf32x4_ps(INPUT, 1), 1); \
|
|
1467
|
+
__m256 OUTPUT##_1 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \
|
|
1468
|
+
_mm512_extractf32x4_ps(INPUT, 3), 1)
|
|
1469
|
+
|
|
1470
|
+
#define EIGEN_EXTRACT_8i_FROM_16i(INPUT, OUTPUT) \
|
|
1471
|
+
__m256i OUTPUT##_0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm512_extracti32x4_epi32(INPUT, 0)), \
|
|
1472
|
+
_mm512_extracti32x4_epi32(INPUT, 1), 1); \
|
|
1473
|
+
__m256i OUTPUT##_1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm512_extracti32x4_epi32(INPUT, 2)), \
|
|
1474
|
+
_mm512_extracti32x4_epi32(INPUT, 3), 1)
|
|
963
1475
|
#endif
|
|
964
1476
|
|
|
965
1477
|
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
|
966
1478
|
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
|
|
967
1479
|
OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);
|
|
1480
|
+
|
|
1481
|
+
#define EIGEN_INSERT_8i_INTO_16i(OUTPUT, INPUTA, INPUTB) \
|
|
1482
|
+
OUTPUT = _mm512_inserti32x8(_mm512_castsi256_si512(INPUTA), INPUTB, 1);
|
|
968
1483
|
#else
|
|
969
1484
|
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
|
|
970
1485
|
OUTPUT = _mm512_undefined_ps(); \
|
|
971
1486
|
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
|
|
972
1487
|
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \
|
|
973
|
-
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
|
|
974
|
-
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
return predux<Packet8f>(x);
|
|
984
|
-
#else
|
|
985
|
-
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
|
|
986
|
-
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
|
|
987
|
-
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
|
|
988
|
-
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
|
|
989
|
-
__m128 sum = _mm_add_ps(_mm_add_ps(lane0, lane1), _mm_add_ps(lane2, lane3));
|
|
990
|
-
sum = _mm_hadd_ps(sum, sum);
|
|
991
|
-
sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
|
|
992
|
-
return _mm_cvtss_f32(sum);
|
|
993
|
-
#endif
|
|
994
|
-
}
|
|
995
|
-
template <>
|
|
996
|
-
EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
|
|
997
|
-
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
|
|
998
|
-
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
|
|
999
|
-
__m256d sum = _mm256_add_pd(lane0, lane1);
|
|
1000
|
-
__m256d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
|
|
1001
|
-
return _mm_cvtsd_f64(_mm256_castpd256_pd128(_mm256_hadd_pd(tmp0, tmp0)));
|
|
1002
|
-
}
|
|
1488
|
+
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
|
|
1489
|
+
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
|
|
1490
|
+
|
|
1491
|
+
#define EIGEN_INSERT_8i_INTO_16i(OUTPUT, INPUTA, INPUTB) \
|
|
1492
|
+
OUTPUT = _mm512_undefined_epi32(); \
|
|
1493
|
+
OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTA, 0), 0); \
|
|
1494
|
+
OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTA, 1), 1); \
|
|
1495
|
+
OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTB, 0), 2); \
|
|
1496
|
+
OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTB, 1), 3);
|
|
1497
|
+
#endif
|
|
1003
1498
|
|
|
1004
1499
|
template <>
|
|
1005
1500
|
EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(const Packet16f& a) {
|
|
@@ -1023,84 +1518,30 @@ EIGEN_STRONG_INLINE Packet4d predux_half_dowto4<Packet8d>(const Packet8d& a) {
|
|
|
1023
1518
|
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
|
|
1024
1519
|
return _mm256_add_pd(lane0, lane1);
|
|
1025
1520
|
}
|
|
1026
|
-
|
|
1027
1521
|
template <>
|
|
1028
|
-
EIGEN_STRONG_INLINE
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
Packet8f res = pmul(lane0, lane1);
|
|
1034
|
-
res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
|
|
1035
|
-
res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
|
|
1036
|
-
return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
|
|
1522
|
+
EIGEN_STRONG_INLINE Packet8i predux_half_dowto4<Packet16i>(const Packet16i& a) {
|
|
1523
|
+
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
|
1524
|
+
__m256i lane0 = _mm512_extracti32x8_epi32(a, 0);
|
|
1525
|
+
__m256i lane1 = _mm512_extracti32x8_epi32(a, 1);
|
|
1526
|
+
return _mm256_add_epi32(lane0, lane1);
|
|
1037
1527
|
#else
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
return
|
|
1528
|
+
__m128i lane0 = _mm512_extracti32x4_epi32(a, 0);
|
|
1529
|
+
__m128i lane1 = _mm512_extracti32x4_epi32(a, 1);
|
|
1530
|
+
__m128i lane2 = _mm512_extracti32x4_epi32(a, 2);
|
|
1531
|
+
__m128i lane3 = _mm512_extracti32x4_epi32(a, 3);
|
|
1532
|
+
__m128i sum0 = _mm_add_epi32(lane0, lane2);
|
|
1533
|
+
__m128i sum1 = _mm_add_epi32(lane1, lane3);
|
|
1534
|
+
return _mm256_inserti128_si256(_mm256_castsi128_si256(sum0), sum1, 1);
|
|
1045
1535
|
#endif
|
|
1046
1536
|
}
|
|
1047
|
-
template <>
|
|
1048
|
-
EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
|
|
1049
|
-
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
|
|
1050
|
-
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
|
|
1051
|
-
__m256d res = pmul(lane0, lane1);
|
|
1052
|
-
res = pmul(res, _mm256_permute2f128_pd(res, res, 1));
|
|
1053
|
-
return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1)));
|
|
1054
|
-
}
|
|
1055
|
-
|
|
1056
|
-
template <>
|
|
1057
|
-
EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
|
|
1058
|
-
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
|
|
1059
|
-
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
|
|
1060
|
-
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
|
|
1061
|
-
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
|
|
1062
|
-
__m128 res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
|
|
1063
|
-
res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
|
|
1064
|
-
return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
|
|
1065
|
-
}
|
|
1066
|
-
template <>
|
|
1067
|
-
EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
|
|
1068
|
-
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
|
|
1069
|
-
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
|
|
1070
|
-
__m256d res = _mm256_min_pd(lane0, lane1);
|
|
1071
|
-
res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
|
|
1072
|
-
return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
|
|
1073
|
-
}
|
|
1074
|
-
|
|
1075
|
-
template <>
|
|
1076
|
-
EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
|
|
1077
|
-
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
|
|
1078
|
-
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
|
|
1079
|
-
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
|
|
1080
|
-
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
|
|
1081
|
-
__m128 res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
|
|
1082
|
-
res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
|
|
1083
|
-
return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
|
|
1084
|
-
}
|
|
1085
1537
|
|
|
1086
1538
|
template <>
|
|
1087
|
-
EIGEN_STRONG_INLINE
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
|
|
1092
|
-
return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
|
|
1093
|
-
}
|
|
1094
|
-
|
|
1095
|
-
template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x)
|
|
1096
|
-
{
|
|
1097
|
-
Packet16i xi = _mm512_castps_si512(x);
|
|
1098
|
-
__mmask16 tmp = _mm512_test_epi32_mask(xi,xi);
|
|
1099
|
-
return !_mm512_kortestz(tmp,tmp);
|
|
1539
|
+
EIGEN_STRONG_INLINE Packet4l predux_half_dowto4<Packet8l>(const Packet8l& a) {
|
|
1540
|
+
__m256i lane0 = _mm512_extracti64x4_epi64(a, 0);
|
|
1541
|
+
__m256i lane1 = _mm512_extracti64x4_epi64(a, 1);
|
|
1542
|
+
return _mm256_add_epi64(lane0, lane1);
|
|
1100
1543
|
}
|
|
1101
1544
|
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
1545
|
#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
|
|
1105
1546
|
EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);
|
|
1106
1547
|
|
|
@@ -1215,9 +1656,46 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
|
|
|
1215
1656
|
PACK_OUTPUT(kernel.packet, tmp.packet, 14, 16);
|
|
1216
1657
|
PACK_OUTPUT(kernel.packet, tmp.packet, 15, 16);
|
|
1217
1658
|
}
|
|
1218
|
-
#define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE)
|
|
1219
|
-
EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX],
|
|
1220
|
-
|
|
1659
|
+
#define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE) \
|
|
1660
|
+
EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], INPUT[2 * INDEX + STRIDE]);
|
|
1661
|
+
|
|
1662
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 8>& kernel) {
|
|
1663
|
+
__m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
|
|
1664
|
+
__m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
|
|
1665
|
+
__m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
|
|
1666
|
+
__m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
|
|
1667
|
+
__m512 T4 = _mm512_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
|
|
1668
|
+
__m512 T5 = _mm512_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
|
|
1669
|
+
__m512 T6 = _mm512_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
|
|
1670
|
+
__m512 T7 = _mm512_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
|
|
1671
|
+
|
|
1672
|
+
kernel.packet[0] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T0), _mm512_castps_pd(T2)));
|
|
1673
|
+
kernel.packet[1] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T0), _mm512_castps_pd(T2)));
|
|
1674
|
+
kernel.packet[2] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T1), _mm512_castps_pd(T3)));
|
|
1675
|
+
kernel.packet[3] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T1), _mm512_castps_pd(T3)));
|
|
1676
|
+
kernel.packet[4] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T4), _mm512_castps_pd(T6)));
|
|
1677
|
+
kernel.packet[5] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T4), _mm512_castps_pd(T6)));
|
|
1678
|
+
kernel.packet[6] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T5), _mm512_castps_pd(T7)));
|
|
1679
|
+
kernel.packet[7] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T5), _mm512_castps_pd(T7)));
|
|
1680
|
+
|
|
1681
|
+
T0 = _mm512_shuffle_f32x4(kernel.packet[0], kernel.packet[4], 0x44);
|
|
1682
|
+
T1 = _mm512_shuffle_f32x4(kernel.packet[0], kernel.packet[4], 0xee);
|
|
1683
|
+
T2 = _mm512_shuffle_f32x4(kernel.packet[1], kernel.packet[5], 0x44);
|
|
1684
|
+
T3 = _mm512_shuffle_f32x4(kernel.packet[1], kernel.packet[5], 0xee);
|
|
1685
|
+
T4 = _mm512_shuffle_f32x4(kernel.packet[2], kernel.packet[6], 0x44);
|
|
1686
|
+
T5 = _mm512_shuffle_f32x4(kernel.packet[2], kernel.packet[6], 0xee);
|
|
1687
|
+
T6 = _mm512_shuffle_f32x4(kernel.packet[3], kernel.packet[7], 0x44);
|
|
1688
|
+
T7 = _mm512_shuffle_f32x4(kernel.packet[3], kernel.packet[7], 0xee);
|
|
1689
|
+
|
|
1690
|
+
kernel.packet[0] = _mm512_shuffle_f32x4(T0, T2, 0x88);
|
|
1691
|
+
kernel.packet[2] = _mm512_shuffle_f32x4(T0, T2, 0xdd);
|
|
1692
|
+
kernel.packet[1] = _mm512_shuffle_f32x4(T4, T6, 0x88);
|
|
1693
|
+
kernel.packet[3] = _mm512_shuffle_f32x4(T4, T6, 0xdd);
|
|
1694
|
+
kernel.packet[4] = _mm512_shuffle_f32x4(T1, T3, 0x88);
|
|
1695
|
+
kernel.packet[6] = _mm512_shuffle_f32x4(T1, T3, 0xdd);
|
|
1696
|
+
kernel.packet[5] = _mm512_shuffle_f32x4(T5, T7, 0x88);
|
|
1697
|
+
kernel.packet[7] = _mm512_shuffle_f32x4(T5, T7, 0xdd);
|
|
1698
|
+
}
|
|
1221
1699
|
|
|
1222
1700
|
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
|
|
1223
1701
|
__m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
|
|
@@ -1259,8 +1737,11 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
|
|
|
1259
1737
|
|
|
1260
1738
|
#define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE) \
|
|
1261
1739
|
OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \
|
|
1262
|
-
OUTPUT[INDEX] =
|
|
1263
|
-
|
|
1740
|
+
OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
|
|
1741
|
+
|
|
1742
|
+
#define PACK_OUTPUT_L(OUTPUT, INPUT, INDEX, STRIDE) \
|
|
1743
|
+
OUTPUT[INDEX] = _mm512_inserti64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \
|
|
1744
|
+
OUTPUT[INDEX] = _mm512_inserti64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
|
|
1264
1745
|
|
|
1265
1746
|
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 4>& kernel) {
|
|
1266
1747
|
__m512d T0 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
|
|
@@ -1270,23 +1751,15 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 4>& kernel) {
|
|
|
1270
1751
|
|
|
1271
1752
|
PacketBlock<Packet4d, 8> tmp;
|
|
1272
1753
|
|
|
1273
|
-
tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
|
|
1274
|
-
|
|
1275
|
-
tmp.packet[
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
tmp.packet[
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
|
|
1283
|
-
_mm512_extractf64x4_pd(T2, 1), 0x20);
|
|
1284
|
-
tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
|
|
1285
|
-
_mm512_extractf64x4_pd(T3, 1), 0x20);
|
|
1286
|
-
tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
|
|
1287
|
-
_mm512_extractf64x4_pd(T2, 1), 0x31);
|
|
1288
|
-
tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
|
|
1289
|
-
_mm512_extractf64x4_pd(T3, 1), 0x31);
|
|
1754
|
+
tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), _mm512_extractf64x4_pd(T2, 0), 0x20);
|
|
1755
|
+
tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), _mm512_extractf64x4_pd(T3, 0), 0x20);
|
|
1756
|
+
tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), _mm512_extractf64x4_pd(T2, 0), 0x31);
|
|
1757
|
+
tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), _mm512_extractf64x4_pd(T3, 0), 0x31);
|
|
1758
|
+
|
|
1759
|
+
tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), _mm512_extractf64x4_pd(T2, 1), 0x20);
|
|
1760
|
+
tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), _mm512_extractf64x4_pd(T3, 1), 0x20);
|
|
1761
|
+
tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), _mm512_extractf64x4_pd(T2, 1), 0x31);
|
|
1762
|
+
tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), _mm512_extractf64x4_pd(T3, 1), 0x31);
|
|
1290
1763
|
|
|
1291
1764
|
PACK_OUTPUT_D(kernel.packet, tmp.packet, 0, 1);
|
|
1292
1765
|
PACK_OUTPUT_D(kernel.packet, tmp.packet, 1, 1);
|
|
@@ -1304,107 +1777,347 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 8>& kernel) {
|
|
|
1304
1777
|
__m512d T6 = _mm512_unpacklo_pd(kernel.packet[6], kernel.packet[7]);
|
|
1305
1778
|
__m512d T7 = _mm512_unpackhi_pd(kernel.packet[6], kernel.packet[7]);
|
|
1306
1779
|
|
|
1307
|
-
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
|
|
1323
|
-
|
|
1324
|
-
|
|
1325
|
-
|
|
1326
|
-
|
|
1327
|
-
|
|
1328
|
-
|
|
1329
|
-
|
|
1330
|
-
|
|
1331
|
-
|
|
1332
|
-
|
|
1333
|
-
|
|
1334
|
-
|
|
1335
|
-
|
|
1336
|
-
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
|
|
1354
|
-
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
|
|
1780
|
+
kernel.packet[0] = _mm512_permutex_pd(T2, 0x4E);
|
|
1781
|
+
kernel.packet[0] = _mm512_mask_blend_pd(0xCC, T0, kernel.packet[0]);
|
|
1782
|
+
kernel.packet[2] = _mm512_permutex_pd(T0, 0x4E);
|
|
1783
|
+
kernel.packet[2] = _mm512_mask_blend_pd(0xCC, kernel.packet[2], T2);
|
|
1784
|
+
kernel.packet[1] = _mm512_permutex_pd(T3, 0x4E);
|
|
1785
|
+
kernel.packet[1] = _mm512_mask_blend_pd(0xCC, T1, kernel.packet[1]);
|
|
1786
|
+
kernel.packet[3] = _mm512_permutex_pd(T1, 0x4E);
|
|
1787
|
+
kernel.packet[3] = _mm512_mask_blend_pd(0xCC, kernel.packet[3], T3);
|
|
1788
|
+
kernel.packet[4] = _mm512_permutex_pd(T6, 0x4E);
|
|
1789
|
+
kernel.packet[4] = _mm512_mask_blend_pd(0xCC, T4, kernel.packet[4]);
|
|
1790
|
+
kernel.packet[6] = _mm512_permutex_pd(T4, 0x4E);
|
|
1791
|
+
kernel.packet[6] = _mm512_mask_blend_pd(0xCC, kernel.packet[6], T6);
|
|
1792
|
+
kernel.packet[5] = _mm512_permutex_pd(T7, 0x4E);
|
|
1793
|
+
kernel.packet[5] = _mm512_mask_blend_pd(0xCC, T5, kernel.packet[5]);
|
|
1794
|
+
kernel.packet[7] = _mm512_permutex_pd(T5, 0x4E);
|
|
1795
|
+
kernel.packet[7] = _mm512_mask_blend_pd(0xCC, kernel.packet[7], T7);
|
|
1796
|
+
|
|
1797
|
+
T0 = _mm512_shuffle_f64x2(kernel.packet[4], kernel.packet[4], 0x4E);
|
|
1798
|
+
T0 = _mm512_mask_blend_pd(0xF0, kernel.packet[0], T0);
|
|
1799
|
+
T4 = _mm512_shuffle_f64x2(kernel.packet[0], kernel.packet[0], 0x4E);
|
|
1800
|
+
T4 = _mm512_mask_blend_pd(0xF0, T4, kernel.packet[4]);
|
|
1801
|
+
T1 = _mm512_shuffle_f64x2(kernel.packet[5], kernel.packet[5], 0x4E);
|
|
1802
|
+
T1 = _mm512_mask_blend_pd(0xF0, kernel.packet[1], T1);
|
|
1803
|
+
T5 = _mm512_shuffle_f64x2(kernel.packet[1], kernel.packet[1], 0x4E);
|
|
1804
|
+
T5 = _mm512_mask_blend_pd(0xF0, T5, kernel.packet[5]);
|
|
1805
|
+
T2 = _mm512_shuffle_f64x2(kernel.packet[6], kernel.packet[6], 0x4E);
|
|
1806
|
+
T2 = _mm512_mask_blend_pd(0xF0, kernel.packet[2], T2);
|
|
1807
|
+
T6 = _mm512_shuffle_f64x2(kernel.packet[2], kernel.packet[2], 0x4E);
|
|
1808
|
+
T6 = _mm512_mask_blend_pd(0xF0, T6, kernel.packet[6]);
|
|
1809
|
+
T3 = _mm512_shuffle_f64x2(kernel.packet[7], kernel.packet[7], 0x4E);
|
|
1810
|
+
T3 = _mm512_mask_blend_pd(0xF0, kernel.packet[3], T3);
|
|
1811
|
+
T7 = _mm512_shuffle_f64x2(kernel.packet[3], kernel.packet[3], 0x4E);
|
|
1812
|
+
T7 = _mm512_mask_blend_pd(0xF0, T7, kernel.packet[7]);
|
|
1813
|
+
|
|
1814
|
+
kernel.packet[0] = T0;
|
|
1815
|
+
kernel.packet[1] = T1;
|
|
1816
|
+
kernel.packet[2] = T2;
|
|
1817
|
+
kernel.packet[3] = T3;
|
|
1818
|
+
kernel.packet[4] = T4;
|
|
1819
|
+
kernel.packet[5] = T5;
|
|
1820
|
+
kernel.packet[6] = T6;
|
|
1821
|
+
kernel.packet[7] = T7;
|
|
1822
|
+
}
|
|
1823
|
+
|
|
1824
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8l, 4>& kernel) {
|
|
1825
|
+
__m512i T0 = _mm512_castpd_si512(
|
|
1826
|
+
_mm512_shuffle_pd(_mm512_castsi512_pd(kernel.packet[0]), _mm512_castsi512_pd(kernel.packet[1]), 0));
|
|
1827
|
+
__m512i T1 = _mm512_castpd_si512(
|
|
1828
|
+
_mm512_shuffle_pd(_mm512_castsi512_pd(kernel.packet[0]), _mm512_castsi512_pd(kernel.packet[1]), 0xff));
|
|
1829
|
+
__m512i T2 = _mm512_castpd_si512(
|
|
1830
|
+
_mm512_shuffle_pd(_mm512_castsi512_pd(kernel.packet[2]), _mm512_castsi512_pd(kernel.packet[3]), 0));
|
|
1831
|
+
__m512i T3 = _mm512_castpd_si512(
|
|
1832
|
+
_mm512_shuffle_pd(_mm512_castsi512_pd(kernel.packet[2]), _mm512_castsi512_pd(kernel.packet[3]), 0xff));
|
|
1833
|
+
|
|
1834
|
+
PacketBlock<Packet4l, 8> tmp;
|
|
1835
|
+
|
|
1836
|
+
tmp.packet[0] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T0, 0), _mm512_extracti64x4_epi64(T2, 0), 0x20);
|
|
1837
|
+
tmp.packet[1] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T1, 0), _mm512_extracti64x4_epi64(T3, 0), 0x20);
|
|
1838
|
+
tmp.packet[2] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T0, 0), _mm512_extracti64x4_epi64(T2, 0), 0x31);
|
|
1839
|
+
tmp.packet[3] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T1, 0), _mm512_extracti64x4_epi64(T3, 0), 0x31);
|
|
1840
|
+
|
|
1841
|
+
tmp.packet[4] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T0, 1), _mm512_extracti64x4_epi64(T2, 1), 0x20);
|
|
1842
|
+
tmp.packet[5] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T1, 1), _mm512_extracti64x4_epi64(T3, 1), 0x20);
|
|
1843
|
+
tmp.packet[6] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T0, 1), _mm512_extracti64x4_epi64(T2, 1), 0x31);
|
|
1844
|
+
tmp.packet[7] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T1, 1), _mm512_extracti64x4_epi64(T3, 1), 0x31);
|
|
1845
|
+
|
|
1846
|
+
PACK_OUTPUT_L(kernel.packet, tmp.packet, 0, 1);
|
|
1847
|
+
PACK_OUTPUT_L(kernel.packet, tmp.packet, 1, 1);
|
|
1848
|
+
PACK_OUTPUT_L(kernel.packet, tmp.packet, 2, 1);
|
|
1849
|
+
PACK_OUTPUT_L(kernel.packet, tmp.packet, 3, 1);
|
|
1850
|
+
}
|
|
1851
|
+
|
|
1852
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8l, 8>& kernel) {
|
|
1853
|
+
__m512i T0 = _mm512_unpacklo_epi64(kernel.packet[0], kernel.packet[1]);
|
|
1854
|
+
__m512i T1 = _mm512_unpackhi_epi64(kernel.packet[0], kernel.packet[1]);
|
|
1855
|
+
__m512i T2 = _mm512_unpacklo_epi64(kernel.packet[2], kernel.packet[3]);
|
|
1856
|
+
__m512i T3 = _mm512_unpackhi_epi64(kernel.packet[2], kernel.packet[3]);
|
|
1857
|
+
__m512i T4 = _mm512_unpacklo_epi64(kernel.packet[4], kernel.packet[5]);
|
|
1858
|
+
__m512i T5 = _mm512_unpackhi_epi64(kernel.packet[4], kernel.packet[5]);
|
|
1859
|
+
__m512i T6 = _mm512_unpacklo_epi64(kernel.packet[6], kernel.packet[7]);
|
|
1860
|
+
__m512i T7 = _mm512_unpackhi_epi64(kernel.packet[6], kernel.packet[7]);
|
|
1861
|
+
|
|
1862
|
+
kernel.packet[0] = _mm512_permutex_epi64(T2, 0x4E);
|
|
1863
|
+
kernel.packet[0] = _mm512_mask_blend_epi64(0xCC, T0, kernel.packet[0]);
|
|
1864
|
+
kernel.packet[2] = _mm512_permutex_epi64(T0, 0x4E);
|
|
1865
|
+
kernel.packet[2] = _mm512_mask_blend_epi64(0xCC, kernel.packet[2], T2);
|
|
1866
|
+
kernel.packet[1] = _mm512_permutex_epi64(T3, 0x4E);
|
|
1867
|
+
kernel.packet[1] = _mm512_mask_blend_epi64(0xCC, T1, kernel.packet[1]);
|
|
1868
|
+
kernel.packet[3] = _mm512_permutex_epi64(T1, 0x4E);
|
|
1869
|
+
kernel.packet[3] = _mm512_mask_blend_epi64(0xCC, kernel.packet[3], T3);
|
|
1870
|
+
kernel.packet[4] = _mm512_permutex_epi64(T6, 0x4E);
|
|
1871
|
+
kernel.packet[4] = _mm512_mask_blend_epi64(0xCC, T4, kernel.packet[4]);
|
|
1872
|
+
kernel.packet[6] = _mm512_permutex_epi64(T4, 0x4E);
|
|
1873
|
+
kernel.packet[6] = _mm512_mask_blend_epi64(0xCC, kernel.packet[6], T6);
|
|
1874
|
+
kernel.packet[5] = _mm512_permutex_epi64(T7, 0x4E);
|
|
1875
|
+
kernel.packet[5] = _mm512_mask_blend_epi64(0xCC, T5, kernel.packet[5]);
|
|
1876
|
+
kernel.packet[7] = _mm512_permutex_epi64(T5, 0x4E);
|
|
1877
|
+
kernel.packet[7] = _mm512_mask_blend_epi64(0xCC, kernel.packet[7], T7);
|
|
1878
|
+
|
|
1879
|
+
T0 = _mm512_shuffle_i64x2(kernel.packet[4], kernel.packet[4], 0x4E);
|
|
1880
|
+
T0 = _mm512_mask_blend_epi64(0xF0, kernel.packet[0], T0);
|
|
1881
|
+
T4 = _mm512_shuffle_i64x2(kernel.packet[0], kernel.packet[0], 0x4E);
|
|
1882
|
+
T4 = _mm512_mask_blend_epi64(0xF0, T4, kernel.packet[4]);
|
|
1883
|
+
T1 = _mm512_shuffle_i64x2(kernel.packet[5], kernel.packet[5], 0x4E);
|
|
1884
|
+
T1 = _mm512_mask_blend_epi64(0xF0, kernel.packet[1], T1);
|
|
1885
|
+
T5 = _mm512_shuffle_i64x2(kernel.packet[1], kernel.packet[1], 0x4E);
|
|
1886
|
+
T5 = _mm512_mask_blend_epi64(0xF0, T5, kernel.packet[5]);
|
|
1887
|
+
T2 = _mm512_shuffle_i64x2(kernel.packet[6], kernel.packet[6], 0x4E);
|
|
1888
|
+
T2 = _mm512_mask_blend_epi64(0xF0, kernel.packet[2], T2);
|
|
1889
|
+
T6 = _mm512_shuffle_i64x2(kernel.packet[2], kernel.packet[2], 0x4E);
|
|
1890
|
+
T6 = _mm512_mask_blend_epi64(0xF0, T6, kernel.packet[6]);
|
|
1891
|
+
T3 = _mm512_shuffle_i64x2(kernel.packet[7], kernel.packet[7], 0x4E);
|
|
1892
|
+
T3 = _mm512_mask_blend_epi64(0xF0, kernel.packet[3], T3);
|
|
1893
|
+
T7 = _mm512_shuffle_i64x2(kernel.packet[3], kernel.packet[3], 0x4E);
|
|
1894
|
+
T7 = _mm512_mask_blend_epi64(0xF0, T7, kernel.packet[7]);
|
|
1895
|
+
|
|
1896
|
+
kernel.packet[0] = T0;
|
|
1897
|
+
kernel.packet[1] = T1;
|
|
1898
|
+
kernel.packet[2] = T2;
|
|
1899
|
+
kernel.packet[3] = T3;
|
|
1900
|
+
kernel.packet[4] = T4;
|
|
1901
|
+
kernel.packet[5] = T5;
|
|
1902
|
+
kernel.packet[6] = T6;
|
|
1903
|
+
kernel.packet[7] = T7;
|
|
1904
|
+
}
|
|
1905
|
+
|
|
1906
|
+
#define PACK_OUTPUT_I32(OUTPUT, INPUT, INDEX, STRIDE) \
|
|
1907
|
+
EIGEN_INSERT_8i_INTO_16i(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);
|
|
1908
|
+
|
|
1909
|
+
#define PACK_OUTPUT_I32_2(OUTPUT, INPUT, INDEX, STRIDE) \
|
|
1910
|
+
EIGEN_INSERT_8i_INTO_16i(OUTPUT[INDEX], INPUT[2 * INDEX], INPUT[2 * INDEX + STRIDE]);
|
|
1911
|
+
|
|
1912
|
+
#define SHUFFLE_EPI32(A, B, M) _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(A), _mm512_castsi512_ps(B), M))
|
|
1913
|
+
|
|
1914
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16i, 16>& kernel) {
|
|
1915
|
+
__m512i T0 = _mm512_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
|
|
1916
|
+
__m512i T1 = _mm512_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
|
|
1917
|
+
__m512i T2 = _mm512_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
|
|
1918
|
+
__m512i T3 = _mm512_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
|
|
1919
|
+
__m512i T4 = _mm512_unpacklo_epi32(kernel.packet[4], kernel.packet[5]);
|
|
1920
|
+
__m512i T5 = _mm512_unpackhi_epi32(kernel.packet[4], kernel.packet[5]);
|
|
1921
|
+
__m512i T6 = _mm512_unpacklo_epi32(kernel.packet[6], kernel.packet[7]);
|
|
1922
|
+
__m512i T7 = _mm512_unpackhi_epi32(kernel.packet[6], kernel.packet[7]);
|
|
1923
|
+
__m512i T8 = _mm512_unpacklo_epi32(kernel.packet[8], kernel.packet[9]);
|
|
1924
|
+
__m512i T9 = _mm512_unpackhi_epi32(kernel.packet[8], kernel.packet[9]);
|
|
1925
|
+
__m512i T10 = _mm512_unpacklo_epi32(kernel.packet[10], kernel.packet[11]);
|
|
1926
|
+
__m512i T11 = _mm512_unpackhi_epi32(kernel.packet[10], kernel.packet[11]);
|
|
1927
|
+
__m512i T12 = _mm512_unpacklo_epi32(kernel.packet[12], kernel.packet[13]);
|
|
1928
|
+
__m512i T13 = _mm512_unpackhi_epi32(kernel.packet[12], kernel.packet[13]);
|
|
1929
|
+
__m512i T14 = _mm512_unpacklo_epi32(kernel.packet[14], kernel.packet[15]);
|
|
1930
|
+
__m512i T15 = _mm512_unpackhi_epi32(kernel.packet[14], kernel.packet[15]);
|
|
1931
|
+
__m512i S0 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
|
|
1932
|
+
__m512i S1 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
|
|
1933
|
+
__m512i S2 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
|
|
1934
|
+
__m512i S3 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
|
|
1935
|
+
__m512i S4 = SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
|
|
1936
|
+
__m512i S5 = SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
|
|
1937
|
+
__m512i S6 = SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
|
|
1938
|
+
__m512i S7 = SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
|
|
1939
|
+
__m512i S8 = SHUFFLE_EPI32(T8, T10, _MM_SHUFFLE(1, 0, 1, 0));
|
|
1940
|
+
__m512i S9 = SHUFFLE_EPI32(T8, T10, _MM_SHUFFLE(3, 2, 3, 2));
|
|
1941
|
+
__m512i S10 = SHUFFLE_EPI32(T9, T11, _MM_SHUFFLE(1, 0, 1, 0));
|
|
1942
|
+
__m512i S11 = SHUFFLE_EPI32(T9, T11, _MM_SHUFFLE(3, 2, 3, 2));
|
|
1943
|
+
__m512i S12 = SHUFFLE_EPI32(T12, T14, _MM_SHUFFLE(1, 0, 1, 0));
|
|
1944
|
+
__m512i S13 = SHUFFLE_EPI32(T12, T14, _MM_SHUFFLE(3, 2, 3, 2));
|
|
1945
|
+
__m512i S14 = SHUFFLE_EPI32(T13, T15, _MM_SHUFFLE(1, 0, 1, 0));
|
|
1946
|
+
__m512i S15 = SHUFFLE_EPI32(T13, T15, _MM_SHUFFLE(3, 2, 3, 2));
|
|
1947
|
+
|
|
1948
|
+
EIGEN_EXTRACT_8i_FROM_16i(S0, S0);
|
|
1949
|
+
EIGEN_EXTRACT_8i_FROM_16i(S1, S1);
|
|
1950
|
+
EIGEN_EXTRACT_8i_FROM_16i(S2, S2);
|
|
1951
|
+
EIGEN_EXTRACT_8i_FROM_16i(S3, S3);
|
|
1952
|
+
EIGEN_EXTRACT_8i_FROM_16i(S4, S4);
|
|
1953
|
+
EIGEN_EXTRACT_8i_FROM_16i(S5, S5);
|
|
1954
|
+
EIGEN_EXTRACT_8i_FROM_16i(S6, S6);
|
|
1955
|
+
EIGEN_EXTRACT_8i_FROM_16i(S7, S7);
|
|
1956
|
+
EIGEN_EXTRACT_8i_FROM_16i(S8, S8);
|
|
1957
|
+
EIGEN_EXTRACT_8i_FROM_16i(S9, S9);
|
|
1958
|
+
EIGEN_EXTRACT_8i_FROM_16i(S10, S10);
|
|
1959
|
+
EIGEN_EXTRACT_8i_FROM_16i(S11, S11);
|
|
1960
|
+
EIGEN_EXTRACT_8i_FROM_16i(S12, S12);
|
|
1961
|
+
EIGEN_EXTRACT_8i_FROM_16i(S13, S13);
|
|
1962
|
+
EIGEN_EXTRACT_8i_FROM_16i(S14, S14);
|
|
1963
|
+
EIGEN_EXTRACT_8i_FROM_16i(S15, S15);
|
|
1964
|
+
|
|
1965
|
+
PacketBlock<Packet8i, 32> tmp;
|
|
1966
|
+
|
|
1967
|
+
tmp.packet[0] = _mm256_permute2f128_si256(S0_0, S4_0, 0x20);
|
|
1968
|
+
tmp.packet[1] = _mm256_permute2f128_si256(S1_0, S5_0, 0x20);
|
|
1969
|
+
tmp.packet[2] = _mm256_permute2f128_si256(S2_0, S6_0, 0x20);
|
|
1970
|
+
tmp.packet[3] = _mm256_permute2f128_si256(S3_0, S7_0, 0x20);
|
|
1971
|
+
tmp.packet[4] = _mm256_permute2f128_si256(S0_0, S4_0, 0x31);
|
|
1972
|
+
tmp.packet[5] = _mm256_permute2f128_si256(S1_0, S5_0, 0x31);
|
|
1973
|
+
tmp.packet[6] = _mm256_permute2f128_si256(S2_0, S6_0, 0x31);
|
|
1974
|
+
tmp.packet[7] = _mm256_permute2f128_si256(S3_0, S7_0, 0x31);
|
|
1975
|
+
|
|
1976
|
+
tmp.packet[8] = _mm256_permute2f128_si256(S0_1, S4_1, 0x20);
|
|
1977
|
+
tmp.packet[9] = _mm256_permute2f128_si256(S1_1, S5_1, 0x20);
|
|
1978
|
+
tmp.packet[10] = _mm256_permute2f128_si256(S2_1, S6_1, 0x20);
|
|
1979
|
+
tmp.packet[11] = _mm256_permute2f128_si256(S3_1, S7_1, 0x20);
|
|
1980
|
+
tmp.packet[12] = _mm256_permute2f128_si256(S0_1, S4_1, 0x31);
|
|
1981
|
+
tmp.packet[13] = _mm256_permute2f128_si256(S1_1, S5_1, 0x31);
|
|
1982
|
+
tmp.packet[14] = _mm256_permute2f128_si256(S2_1, S6_1, 0x31);
|
|
1983
|
+
tmp.packet[15] = _mm256_permute2f128_si256(S3_1, S7_1, 0x31);
|
|
1984
|
+
|
|
1985
|
+
// Second set of _m256 outputs
|
|
1986
|
+
tmp.packet[16] = _mm256_permute2f128_si256(S8_0, S12_0, 0x20);
|
|
1987
|
+
tmp.packet[17] = _mm256_permute2f128_si256(S9_0, S13_0, 0x20);
|
|
1988
|
+
tmp.packet[18] = _mm256_permute2f128_si256(S10_0, S14_0, 0x20);
|
|
1989
|
+
tmp.packet[19] = _mm256_permute2f128_si256(S11_0, S15_0, 0x20);
|
|
1990
|
+
tmp.packet[20] = _mm256_permute2f128_si256(S8_0, S12_0, 0x31);
|
|
1991
|
+
tmp.packet[21] = _mm256_permute2f128_si256(S9_0, S13_0, 0x31);
|
|
1992
|
+
tmp.packet[22] = _mm256_permute2f128_si256(S10_0, S14_0, 0x31);
|
|
1993
|
+
tmp.packet[23] = _mm256_permute2f128_si256(S11_0, S15_0, 0x31);
|
|
1994
|
+
|
|
1995
|
+
tmp.packet[24] = _mm256_permute2f128_si256(S8_1, S12_1, 0x20);
|
|
1996
|
+
tmp.packet[25] = _mm256_permute2f128_si256(S9_1, S13_1, 0x20);
|
|
1997
|
+
tmp.packet[26] = _mm256_permute2f128_si256(S10_1, S14_1, 0x20);
|
|
1998
|
+
tmp.packet[27] = _mm256_permute2f128_si256(S11_1, S15_1, 0x20);
|
|
1999
|
+
tmp.packet[28] = _mm256_permute2f128_si256(S8_1, S12_1, 0x31);
|
|
2000
|
+
tmp.packet[29] = _mm256_permute2f128_si256(S9_1, S13_1, 0x31);
|
|
2001
|
+
tmp.packet[30] = _mm256_permute2f128_si256(S10_1, S14_1, 0x31);
|
|
2002
|
+
tmp.packet[31] = _mm256_permute2f128_si256(S11_1, S15_1, 0x31);
|
|
2003
|
+
|
|
2004
|
+
// Pack them into the output
|
|
2005
|
+
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 0, 16);
|
|
2006
|
+
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 1, 16);
|
|
2007
|
+
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 2, 16);
|
|
2008
|
+
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 3, 16);
|
|
2009
|
+
|
|
2010
|
+
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 4, 16);
|
|
2011
|
+
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 5, 16);
|
|
2012
|
+
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 6, 16);
|
|
2013
|
+
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 7, 16);
|
|
2014
|
+
|
|
2015
|
+
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 8, 16);
|
|
2016
|
+
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 9, 16);
|
|
2017
|
+
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 10, 16);
|
|
2018
|
+
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 11, 16);
|
|
2019
|
+
|
|
2020
|
+
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 12, 16);
|
|
2021
|
+
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 13, 16);
|
|
2022
|
+
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 14, 16);
|
|
2023
|
+
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 15, 16);
|
|
2024
|
+
}
|
|
2025
|
+
|
|
2026
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16i, 4>& kernel) {
|
|
2027
|
+
__m512i T0 = _mm512_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
|
|
2028
|
+
__m512i T1 = _mm512_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
|
|
2029
|
+
__m512i T2 = _mm512_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
|
|
2030
|
+
__m512i T3 = _mm512_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
|
|
2031
|
+
|
|
2032
|
+
__m512i S0 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
|
|
2033
|
+
__m512i S1 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
|
|
2034
|
+
__m512i S2 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
|
|
2035
|
+
__m512i S3 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
|
|
2036
|
+
|
|
2037
|
+
EIGEN_EXTRACT_8i_FROM_16i(S0, S0);
|
|
2038
|
+
EIGEN_EXTRACT_8i_FROM_16i(S1, S1);
|
|
2039
|
+
EIGEN_EXTRACT_8i_FROM_16i(S2, S2);
|
|
2040
|
+
EIGEN_EXTRACT_8i_FROM_16i(S3, S3);
|
|
2041
|
+
|
|
2042
|
+
PacketBlock<Packet8i, 8> tmp;
|
|
2043
|
+
|
|
2044
|
+
tmp.packet[0] = _mm256_permute2f128_si256(S0_0, S1_0, 0x20);
|
|
2045
|
+
tmp.packet[1] = _mm256_permute2f128_si256(S2_0, S3_0, 0x20);
|
|
2046
|
+
tmp.packet[2] = _mm256_permute2f128_si256(S0_0, S1_0, 0x31);
|
|
2047
|
+
tmp.packet[3] = _mm256_permute2f128_si256(S2_0, S3_0, 0x31);
|
|
2048
|
+
|
|
2049
|
+
tmp.packet[4] = _mm256_permute2f128_si256(S0_1, S1_1, 0x20);
|
|
2050
|
+
tmp.packet[5] = _mm256_permute2f128_si256(S2_1, S3_1, 0x20);
|
|
2051
|
+
tmp.packet[6] = _mm256_permute2f128_si256(S0_1, S1_1, 0x31);
|
|
2052
|
+
tmp.packet[7] = _mm256_permute2f128_si256(S2_1, S3_1, 0x31);
|
|
2053
|
+
|
|
2054
|
+
PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 0, 1);
|
|
2055
|
+
PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 1, 1);
|
|
2056
|
+
PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 2, 1);
|
|
2057
|
+
PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 3, 1);
|
|
2058
|
+
}
|
|
2059
|
+
|
|
2060
|
+
template <size_t N>
|
|
2061
|
+
EIGEN_STRONG_INLINE int avx512_blend_mask(const Selector<N>& ifPacket) {
|
|
2062
|
+
alignas(__m128i) uint8_t aux[sizeof(__m128i)];
|
|
2063
|
+
for (size_t i = 0; i < N; i++) aux[i] = static_cast<uint8_t>(ifPacket.select[i]);
|
|
2064
|
+
__m128i paux = _mm_sub_epi8(_mm_setzero_si128(), _mm_load_si128(reinterpret_cast<const __m128i*>(aux)));
|
|
2065
|
+
return _mm_movemask_epi8(paux);
|
|
2066
|
+
}
|
|
2067
|
+
|
|
2068
|
+
template <>
|
|
2069
|
+
EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& ifPacket, const Packet16f& thenPacket,
|
|
2070
|
+
const Packet16f& elsePacket) {
|
|
2071
|
+
__mmask16 m = avx512_blend_mask(ifPacket);
|
|
2072
|
+
return _mm512_mask_blend_ps(m, elsePacket, thenPacket);
|
|
2073
|
+
}
|
|
2074
|
+
template <>
|
|
2075
|
+
EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket, const Packet8d& thenPacket,
|
|
1365
2076
|
const Packet8d& elsePacket) {
|
|
1366
|
-
__mmask8 m = (ifPacket
|
|
1367
|
-
| (ifPacket.select[1]<<1)
|
|
1368
|
-
| (ifPacket.select[2]<<2)
|
|
1369
|
-
| (ifPacket.select[3]<<3)
|
|
1370
|
-
| (ifPacket.select[4]<<4)
|
|
1371
|
-
| (ifPacket.select[5]<<5)
|
|
1372
|
-
| (ifPacket.select[6]<<6)
|
|
1373
|
-
| (ifPacket.select[7]<<7);
|
|
2077
|
+
__mmask8 m = avx512_blend_mask(ifPacket);
|
|
1374
2078
|
return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
|
|
1375
2079
|
}
|
|
1376
2080
|
|
|
1377
2081
|
// Packet math for Eigen::half
|
|
1378
|
-
|
|
2082
|
+
#ifndef EIGEN_VECTORIZE_AVX512FP16
|
|
2083
|
+
template <>
|
|
2084
|
+
EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
|
|
1379
2085
|
return _mm256_set1_epi16(from.x);
|
|
1380
2086
|
}
|
|
1381
2087
|
|
|
1382
|
-
template<>
|
|
2088
|
+
template <>
|
|
2089
|
+
EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
|
|
1383
2090
|
return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from, 0)));
|
|
1384
2091
|
}
|
|
1385
2092
|
|
|
1386
|
-
template<>
|
|
2093
|
+
template <>
|
|
2094
|
+
EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
|
|
1387
2095
|
return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
|
|
1388
2096
|
}
|
|
1389
2097
|
|
|
1390
|
-
template<>
|
|
2098
|
+
template <>
|
|
2099
|
+
EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
|
|
1391
2100
|
return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
|
|
1392
2101
|
}
|
|
1393
2102
|
|
|
1394
|
-
template<>
|
|
2103
|
+
template <>
|
|
2104
|
+
EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
|
|
1395
2105
|
// (void*) -> workaround clang warning:
|
|
1396
2106
|
// cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
|
|
2107
|
+
EIGEN_DEBUG_ALIGNED_STORE
|
|
1397
2108
|
_mm256_store_si256((__m256i*)(void*)to, from);
|
|
1398
2109
|
}
|
|
1399
2110
|
|
|
1400
|
-
template<>
|
|
2111
|
+
template <>
|
|
2112
|
+
EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
|
|
1401
2113
|
// (void*) -> workaround clang warning:
|
|
1402
2114
|
// cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
|
|
2115
|
+
EIGEN_DEBUG_UNALIGNED_STORE
|
|
1403
2116
|
_mm256_storeu_si256((__m256i*)(void*)to, from);
|
|
1404
2117
|
}
|
|
1405
2118
|
|
|
1406
|
-
template<>
|
|
1407
|
-
ploaddup<Packet16h>(const Eigen::half*
|
|
2119
|
+
template <>
|
|
2120
|
+
EIGEN_STRONG_INLINE Packet16h ploaddup<Packet16h>(const Eigen::half* from) {
|
|
1408
2121
|
unsigned short a = from[0].x;
|
|
1409
2122
|
unsigned short b = from[1].x;
|
|
1410
2123
|
unsigned short c = from[2].x;
|
|
@@ -1416,8 +2129,8 @@ ploaddup<Packet16h>(const Eigen::half* from) {
|
|
|
1416
2129
|
return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
|
|
1417
2130
|
}
|
|
1418
2131
|
|
|
1419
|
-
template<>
|
|
1420
|
-
ploadquad(const Eigen::half* from) {
|
|
2132
|
+
template <>
|
|
2133
|
+
EIGEN_STRONG_INLINE Packet16h ploadquad(const Eigen::half* from) {
|
|
1421
2134
|
unsigned short a = from[0].x;
|
|
1422
2135
|
unsigned short b = from[1].x;
|
|
1423
2136
|
unsigned short c = from[2].x;
|
|
@@ -1425,65 +2138,15 @@ ploadquad(const Eigen::half* from) {
|
|
|
1425
2138
|
return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
|
|
1426
2139
|
}
|
|
1427
2140
|
|
|
1428
|
-
EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {
|
|
1429
|
-
#ifdef EIGEN_HAS_FP16_C
|
|
1430
|
-
return _mm512_cvtph_ps(a);
|
|
1431
|
-
#else
|
|
1432
|
-
EIGEN_ALIGN64 half aux[16];
|
|
1433
|
-
pstore(aux, a);
|
|
1434
|
-
float f0(aux[0]);
|
|
1435
|
-
float f1(aux[1]);
|
|
1436
|
-
float f2(aux[2]);
|
|
1437
|
-
float f3(aux[3]);
|
|
1438
|
-
float f4(aux[4]);
|
|
1439
|
-
float f5(aux[5]);
|
|
1440
|
-
float f6(aux[6]);
|
|
1441
|
-
float f7(aux[7]);
|
|
1442
|
-
float f8(aux[8]);
|
|
1443
|
-
float f9(aux[9]);
|
|
1444
|
-
float fa(aux[10]);
|
|
1445
|
-
float fb(aux[11]);
|
|
1446
|
-
float fc(aux[12]);
|
|
1447
|
-
float fd(aux[13]);
|
|
1448
|
-
float fe(aux[14]);
|
|
1449
|
-
float ff(aux[15]);
|
|
1450
|
-
|
|
1451
|
-
return _mm512_set_ps(
|
|
1452
|
-
ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0);
|
|
1453
|
-
#endif
|
|
1454
|
-
}
|
|
2141
|
+
EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { return _mm512_cvtph_ps(a); }
|
|
1455
2142
|
|
|
1456
2143
|
EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
|
|
1457
|
-
|
|
1458
|
-
return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
|
|
1459
|
-
#else
|
|
1460
|
-
EIGEN_ALIGN64 float aux[16];
|
|
1461
|
-
pstore(aux, a);
|
|
1462
|
-
half h0(aux[0]);
|
|
1463
|
-
half h1(aux[1]);
|
|
1464
|
-
half h2(aux[2]);
|
|
1465
|
-
half h3(aux[3]);
|
|
1466
|
-
half h4(aux[4]);
|
|
1467
|
-
half h5(aux[5]);
|
|
1468
|
-
half h6(aux[6]);
|
|
1469
|
-
half h7(aux[7]);
|
|
1470
|
-
half h8(aux[8]);
|
|
1471
|
-
half h9(aux[9]);
|
|
1472
|
-
half ha(aux[10]);
|
|
1473
|
-
half hb(aux[11]);
|
|
1474
|
-
half hc(aux[12]);
|
|
1475
|
-
half hd(aux[13]);
|
|
1476
|
-
half he(aux[14]);
|
|
1477
|
-
half hf(aux[15]);
|
|
1478
|
-
|
|
1479
|
-
return _mm256_set_epi16(
|
|
1480
|
-
hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x,
|
|
1481
|
-
h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
|
|
1482
|
-
#endif
|
|
2144
|
+
return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
|
|
1483
2145
|
}
|
|
1484
2146
|
|
|
1485
|
-
template<>
|
|
1486
|
-
|
|
2147
|
+
template <>
|
|
2148
|
+
EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) {
|
|
2149
|
+
return Packet16h(ptrue(Packet8i(a)));
|
|
1487
2150
|
}
|
|
1488
2151
|
|
|
1489
2152
|
template <>
|
|
@@ -1493,14 +2156,12 @@ EIGEN_STRONG_INLINE Packet16h pabs(const Packet16h& a) {
|
|
|
1493
2156
|
}
|
|
1494
2157
|
|
|
1495
2158
|
template <>
|
|
1496
|
-
EIGEN_STRONG_INLINE Packet16h pmin<Packet16h>(const Packet16h& a,
|
|
1497
|
-
const Packet16h& b) {
|
|
2159
|
+
EIGEN_STRONG_INLINE Packet16h pmin<Packet16h>(const Packet16h& a, const Packet16h& b) {
|
|
1498
2160
|
return float2half(pmin<Packet16f>(half2float(a), half2float(b)));
|
|
1499
2161
|
}
|
|
1500
2162
|
|
|
1501
2163
|
template <>
|
|
1502
|
-
EIGEN_STRONG_INLINE Packet16h pmax<Packet16h>(const Packet16h& a,
|
|
1503
|
-
const Packet16h& b) {
|
|
2164
|
+
EIGEN_STRONG_INLINE Packet16h pmax<Packet16h>(const Packet16h& a, const Packet16h& b) {
|
|
1504
2165
|
return float2half(pmax<Packet16f>(half2float(a), half2float(b)));
|
|
1505
2166
|
}
|
|
1506
2167
|
|
|
@@ -1509,164 +2170,185 @@ EIGEN_STRONG_INLINE Packet16h plset<Packet16h>(const half& a) {
|
|
|
1509
2170
|
return float2half(plset<Packet16f>(static_cast<float>(a)));
|
|
1510
2171
|
}
|
|
1511
2172
|
|
|
1512
|
-
template<>
|
|
2173
|
+
template <>
|
|
2174
|
+
EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a, const Packet16h& b) {
|
|
1513
2175
|
// in some cases Packet8i is a wrapper around __m256i, so we need to
|
|
1514
2176
|
// cast to Packet8i to call the correct overload.
|
|
1515
|
-
return por(Packet8i(a),Packet8i(b));
|
|
2177
|
+
return Packet16h(por(Packet8i(a), Packet8i(b)));
|
|
1516
2178
|
}
|
|
1517
|
-
template<>
|
|
1518
|
-
|
|
2179
|
+
template <>
|
|
2180
|
+
EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a, const Packet16h& b) {
|
|
2181
|
+
return Packet16h(pxor(Packet8i(a), Packet8i(b)));
|
|
1519
2182
|
}
|
|
1520
|
-
template<>
|
|
1521
|
-
|
|
2183
|
+
template <>
|
|
2184
|
+
EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a, const Packet16h& b) {
|
|
2185
|
+
return Packet16h(pand(Packet8i(a), Packet8i(b)));
|
|
1522
2186
|
}
|
|
1523
|
-
template<>
|
|
1524
|
-
|
|
2187
|
+
template <>
|
|
2188
|
+
EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a, const Packet16h& b) {
|
|
2189
|
+
return Packet16h(pandnot(Packet8i(a), Packet8i(b)));
|
|
1525
2190
|
}
|
|
1526
2191
|
|
|
1527
|
-
template<>
|
|
2192
|
+
template <>
|
|
2193
|
+
EIGEN_STRONG_INLINE Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) {
|
|
1528
2194
|
return _mm256_blendv_epi8(b, a, mask);
|
|
1529
2195
|
}
|
|
1530
2196
|
|
|
1531
|
-
template<>
|
|
2197
|
+
template <>
|
|
2198
|
+
EIGEN_STRONG_INLINE Packet16h pround<Packet16h>(const Packet16h& a) {
|
|
1532
2199
|
return float2half(pround<Packet16f>(half2float(a)));
|
|
1533
2200
|
}
|
|
1534
2201
|
|
|
1535
|
-
template<>
|
|
2202
|
+
template <>
|
|
2203
|
+
EIGEN_STRONG_INLINE Packet16h print<Packet16h>(const Packet16h& a) {
|
|
1536
2204
|
return float2half(print<Packet16f>(half2float(a)));
|
|
1537
2205
|
}
|
|
1538
2206
|
|
|
1539
|
-
template<>
|
|
2207
|
+
template <>
|
|
2208
|
+
EIGEN_STRONG_INLINE Packet16h pceil<Packet16h>(const Packet16h& a) {
|
|
1540
2209
|
return float2half(pceil<Packet16f>(half2float(a)));
|
|
1541
2210
|
}
|
|
1542
2211
|
|
|
1543
|
-
template<>
|
|
2212
|
+
template <>
|
|
2213
|
+
EIGEN_STRONG_INLINE Packet16h pfloor<Packet16h>(const Packet16h& a) {
|
|
1544
2214
|
return float2half(pfloor<Packet16f>(half2float(a)));
|
|
1545
2215
|
}
|
|
1546
2216
|
|
|
1547
|
-
template<>
|
|
2217
|
+
template <>
|
|
2218
|
+
EIGEN_STRONG_INLINE Packet16h ptrunc<Packet16h>(const Packet16h& a) {
|
|
2219
|
+
return float2half(ptrunc<Packet16f>(half2float(a)));
|
|
2220
|
+
}
|
|
2221
|
+
|
|
2222
|
+
template <>
|
|
2223
|
+
EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a, const Packet16h& b) {
|
|
1548
2224
|
Packet16f af = half2float(a);
|
|
1549
2225
|
Packet16f bf = half2float(b);
|
|
1550
2226
|
return Pack32To16(pcmp_eq(af, bf));
|
|
1551
2227
|
}
|
|
1552
2228
|
|
|
1553
|
-
template<>
|
|
2229
|
+
template <>
|
|
2230
|
+
EIGEN_STRONG_INLINE Packet16h pcmp_le(const Packet16h& a, const Packet16h& b) {
|
|
1554
2231
|
return Pack32To16(pcmp_le(half2float(a), half2float(b)));
|
|
1555
2232
|
}
|
|
1556
2233
|
|
|
1557
|
-
template<>
|
|
2234
|
+
template <>
|
|
2235
|
+
EIGEN_STRONG_INLINE Packet16h pcmp_lt(const Packet16h& a, const Packet16h& b) {
|
|
1558
2236
|
return Pack32To16(pcmp_lt(half2float(a), half2float(b)));
|
|
1559
2237
|
}
|
|
1560
2238
|
|
|
1561
|
-
template<>
|
|
2239
|
+
template <>
|
|
2240
|
+
EIGEN_STRONG_INLINE Packet16h pcmp_lt_or_nan(const Packet16h& a, const Packet16h& b) {
|
|
1562
2241
|
return Pack32To16(pcmp_lt_or_nan(half2float(a), half2float(b)));
|
|
1563
2242
|
}
|
|
1564
2243
|
|
|
1565
|
-
template<>
|
|
2244
|
+
template <>
|
|
2245
|
+
EIGEN_STRONG_INLINE Packet16h pconj(const Packet16h& a) {
|
|
2246
|
+
return a;
|
|
2247
|
+
}
|
|
1566
2248
|
|
|
1567
|
-
template<>
|
|
2249
|
+
template <>
|
|
2250
|
+
EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
|
|
1568
2251
|
Packet16h sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
|
|
1569
2252
|
return _mm256_xor_si256(a, sign_mask);
|
|
1570
2253
|
}
|
|
1571
2254
|
|
|
1572
|
-
template<>
|
|
2255
|
+
template <>
|
|
2256
|
+
EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
|
|
1573
2257
|
Packet16f af = half2float(a);
|
|
1574
2258
|
Packet16f bf = half2float(b);
|
|
1575
2259
|
Packet16f rf = padd(af, bf);
|
|
1576
2260
|
return float2half(rf);
|
|
1577
2261
|
}
|
|
1578
2262
|
|
|
1579
|
-
template<>
|
|
2263
|
+
template <>
|
|
2264
|
+
EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
|
|
1580
2265
|
Packet16f af = half2float(a);
|
|
1581
2266
|
Packet16f bf = half2float(b);
|
|
1582
2267
|
Packet16f rf = psub(af, bf);
|
|
1583
2268
|
return float2half(rf);
|
|
1584
2269
|
}
|
|
1585
2270
|
|
|
1586
|
-
template<>
|
|
2271
|
+
template <>
|
|
2272
|
+
EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
|
|
1587
2273
|
Packet16f af = half2float(a);
|
|
1588
2274
|
Packet16f bf = half2float(b);
|
|
1589
2275
|
Packet16f rf = pmul(af, bf);
|
|
1590
2276
|
return float2half(rf);
|
|
1591
2277
|
}
|
|
1592
2278
|
|
|
1593
|
-
template<>
|
|
2279
|
+
template <>
|
|
2280
|
+
EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
|
|
1594
2281
|
Packet16f af = half2float(a);
|
|
1595
2282
|
Packet16f bf = half2float(b);
|
|
1596
2283
|
Packet16f rf = pdiv(af, bf);
|
|
1597
2284
|
return float2half(rf);
|
|
1598
2285
|
}
|
|
1599
2286
|
|
|
1600
|
-
template<>
|
|
1601
|
-
|
|
1602
|
-
return
|
|
2287
|
+
template <>
|
|
2288
|
+
EIGEN_STRONG_INLINE Packet16h pmadd<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
|
|
2289
|
+
return float2half(pmadd(half2float(a), half2float(b), half2float(c)));
|
|
1603
2290
|
}
|
|
1604
2291
|
|
|
1605
2292
|
template <>
|
|
1606
|
-
EIGEN_STRONG_INLINE
|
|
1607
|
-
|
|
1608
|
-
Packet8h lane1 = _mm256_extractf128_si256(a, 1);
|
|
1609
|
-
return padd<Packet8h>(lane0, lane1);
|
|
2293
|
+
EIGEN_STRONG_INLINE Packet16h pmsub<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
|
|
2294
|
+
return float2half(pmsub(half2float(a), half2float(b), half2float(c)));
|
|
1610
2295
|
}
|
|
1611
2296
|
|
|
1612
|
-
template<>
|
|
1613
|
-
|
|
1614
|
-
|
|
1615
|
-
return Eigen::half(reduced);
|
|
2297
|
+
template <>
|
|
2298
|
+
EIGEN_STRONG_INLINE Packet16h pnmadd<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
|
|
2299
|
+
return float2half(pnmadd(half2float(a), half2float(b), half2float(c)));
|
|
1616
2300
|
}
|
|
1617
2301
|
|
|
1618
|
-
template<>
|
|
1619
|
-
|
|
1620
|
-
|
|
1621
|
-
return Eigen::half(reduced);
|
|
2302
|
+
template <>
|
|
2303
|
+
EIGEN_STRONG_INLINE Packet16h pnmsub<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
|
|
2304
|
+
return float2half(pnmsub(half2float(a), half2float(b), half2float(c)));
|
|
1622
2305
|
}
|
|
1623
2306
|
|
|
1624
|
-
template<>
|
|
1625
|
-
|
|
1626
|
-
|
|
2307
|
+
template <>
|
|
2308
|
+
EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
|
|
2309
|
+
Packet8h lane0 = _mm256_extractf128_si256(a, 0);
|
|
2310
|
+
Packet8h lane1 = _mm256_extractf128_si256(a, 1);
|
|
2311
|
+
return padd<Packet8h>(lane0, lane1);
|
|
1627
2312
|
}
|
|
1628
2313
|
|
|
1629
|
-
template<>
|
|
1630
|
-
{
|
|
1631
|
-
__m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
|
|
1632
|
-
return _mm256_insertf128_si256(
|
|
1633
|
-
|
|
1634
|
-
_mm_shuffle_epi8(_mm256_extractf128_si256(a,0),m), 1);
|
|
2314
|
+
template <>
|
|
2315
|
+
EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) {
|
|
2316
|
+
__m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
|
|
2317
|
+
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a, 1), m)),
|
|
2318
|
+
_mm_shuffle_epi8(_mm256_extractf128_si256(a, 0), m), 1);
|
|
1635
2319
|
}
|
|
1636
2320
|
|
|
1637
|
-
template<>
|
|
1638
|
-
{
|
|
1639
|
-
return _mm256_set_epi16(
|
|
1640
|
-
|
|
1641
|
-
|
|
1642
|
-
|
|
1643
|
-
from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
|
|
2321
|
+
template <>
|
|
2322
|
+
EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride) {
|
|
2323
|
+
return _mm256_set_epi16(from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
|
|
2324
|
+
from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x,
|
|
2325
|
+
from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x,
|
|
2326
|
+
from[3 * stride].x, from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
|
|
1644
2327
|
}
|
|
1645
2328
|
|
|
1646
|
-
template<>
|
|
1647
|
-
{
|
|
2329
|
+
template <>
|
|
2330
|
+
EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride) {
|
|
1648
2331
|
EIGEN_ALIGN64 half aux[16];
|
|
1649
2332
|
pstore(aux, from);
|
|
1650
|
-
to[stride*0] = aux[0];
|
|
1651
|
-
to[stride*1] = aux[1];
|
|
1652
|
-
to[stride*2] = aux[2];
|
|
1653
|
-
to[stride*3] = aux[3];
|
|
1654
|
-
to[stride*4] = aux[4];
|
|
1655
|
-
to[stride*5] = aux[5];
|
|
1656
|
-
to[stride*6] = aux[6];
|
|
1657
|
-
to[stride*7] = aux[7];
|
|
1658
|
-
to[stride*8] = aux[8];
|
|
1659
|
-
to[stride*9] = aux[9];
|
|
1660
|
-
to[stride*10] = aux[10];
|
|
1661
|
-
to[stride*11] = aux[11];
|
|
1662
|
-
to[stride*12] = aux[12];
|
|
1663
|
-
to[stride*13] = aux[13];
|
|
1664
|
-
to[stride*14] = aux[14];
|
|
1665
|
-
to[stride*15] = aux[15];
|
|
1666
|
-
}
|
|
1667
|
-
|
|
1668
|
-
EIGEN_STRONG_INLINE void
|
|
1669
|
-
ptranspose(PacketBlock<Packet16h,16>& kernel) {
|
|
2333
|
+
to[stride * 0] = aux[0];
|
|
2334
|
+
to[stride * 1] = aux[1];
|
|
2335
|
+
to[stride * 2] = aux[2];
|
|
2336
|
+
to[stride * 3] = aux[3];
|
|
2337
|
+
to[stride * 4] = aux[4];
|
|
2338
|
+
to[stride * 5] = aux[5];
|
|
2339
|
+
to[stride * 6] = aux[6];
|
|
2340
|
+
to[stride * 7] = aux[7];
|
|
2341
|
+
to[stride * 8] = aux[8];
|
|
2342
|
+
to[stride * 9] = aux[9];
|
|
2343
|
+
to[stride * 10] = aux[10];
|
|
2344
|
+
to[stride * 11] = aux[11];
|
|
2345
|
+
to[stride * 12] = aux[12];
|
|
2346
|
+
to[stride * 13] = aux[13];
|
|
2347
|
+
to[stride * 14] = aux[14];
|
|
2348
|
+
to[stride * 15] = aux[15];
|
|
2349
|
+
}
|
|
2350
|
+
|
|
2351
|
+
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 16>& kernel) {
|
|
1670
2352
|
__m256i a = kernel.packet[0];
|
|
1671
2353
|
__m256i b = kernel.packet[1];
|
|
1672
2354
|
__m256i c = kernel.packet[2];
|
|
@@ -1773,8 +2455,7 @@ ptranspose(PacketBlock<Packet16h,16>& kernel) {
|
|
|
1773
2455
|
kernel.packet[15] = a_p_f;
|
|
1774
2456
|
}
|
|
1775
2457
|
|
|
1776
|
-
EIGEN_STRONG_INLINE void
|
|
1777
|
-
ptranspose(PacketBlock<Packet16h,8>& kernel) {
|
|
2458
|
+
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 8>& kernel) {
|
|
1778
2459
|
EIGEN_ALIGN64 half in[8][16];
|
|
1779
2460
|
pstore<half>(in[0], kernel.packet[0]);
|
|
1780
2461
|
pstore<half>(in[1], kernel.packet[1]);
|
|
@@ -1789,10 +2470,10 @@ ptranspose(PacketBlock<Packet16h,8>& kernel) {
|
|
|
1789
2470
|
|
|
1790
2471
|
for (int i = 0; i < 8; ++i) {
|
|
1791
2472
|
for (int j = 0; j < 8; ++j) {
|
|
1792
|
-
out[i][j] = in[j][2*i];
|
|
2473
|
+
out[i][j] = in[j][2 * i];
|
|
1793
2474
|
}
|
|
1794
2475
|
for (int j = 0; j < 8; ++j) {
|
|
1795
|
-
out[i][j+8] = in[j][2*i+1];
|
|
2476
|
+
out[i][j + 8] = in[j][2 * i + 1];
|
|
1796
2477
|
}
|
|
1797
2478
|
}
|
|
1798
2479
|
|
|
@@ -1806,8 +2487,7 @@ ptranspose(PacketBlock<Packet16h,8>& kernel) {
|
|
|
1806
2487
|
kernel.packet[7] = pload<Packet16h>(out[7]);
|
|
1807
2488
|
}
|
|
1808
2489
|
|
|
1809
|
-
EIGEN_STRONG_INLINE void
|
|
1810
|
-
ptranspose(PacketBlock<Packet16h,4>& kernel) {
|
|
2490
|
+
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 4>& kernel) {
|
|
1811
2491
|
EIGEN_ALIGN64 half in[4][16];
|
|
1812
2492
|
pstore<half>(in[0], kernel.packet[0]);
|
|
1813
2493
|
pstore<half>(in[1], kernel.packet[1]);
|
|
@@ -1818,16 +2498,16 @@ ptranspose(PacketBlock<Packet16h,4>& kernel) {
|
|
|
1818
2498
|
|
|
1819
2499
|
for (int i = 0; i < 4; ++i) {
|
|
1820
2500
|
for (int j = 0; j < 4; ++j) {
|
|
1821
|
-
out[i][j] = in[j][4*i];
|
|
2501
|
+
out[i][j] = in[j][4 * i];
|
|
1822
2502
|
}
|
|
1823
2503
|
for (int j = 0; j < 4; ++j) {
|
|
1824
|
-
out[i][j+4] = in[j][4*i+1];
|
|
2504
|
+
out[i][j + 4] = in[j][4 * i + 1];
|
|
1825
2505
|
}
|
|
1826
2506
|
for (int j = 0; j < 4; ++j) {
|
|
1827
|
-
out[i][j+8] = in[j][4*i+2];
|
|
2507
|
+
out[i][j + 8] = in[j][4 * i + 2];
|
|
1828
2508
|
}
|
|
1829
2509
|
for (int j = 0; j < 4; ++j) {
|
|
1830
|
-
out[i][j+12] = in[j][4*i+3];
|
|
2510
|
+
out[i][j + 12] = in[j][4 * i + 3];
|
|
1831
2511
|
}
|
|
1832
2512
|
}
|
|
1833
2513
|
|
|
@@ -1837,7 +2517,12 @@ ptranspose(PacketBlock<Packet16h,4>& kernel) {
|
|
|
1837
2517
|
kernel.packet[3] = pload<Packet16h>(out[3]);
|
|
1838
2518
|
}
|
|
1839
2519
|
|
|
1840
|
-
|
|
2520
|
+
#endif // EIGEN_VECTORIZE_AVX512FP16
|
|
2521
|
+
|
|
2522
|
+
template <>
|
|
2523
|
+
struct is_arithmetic<Packet16bf> {
|
|
2524
|
+
enum { value = true };
|
|
2525
|
+
};
|
|
1841
2526
|
|
|
1842
2527
|
template <>
|
|
1843
2528
|
struct packet_traits<bfloat16> : default_packet_traits {
|
|
@@ -1847,35 +2532,37 @@ struct packet_traits<bfloat16> : default_packet_traits {
|
|
|
1847
2532
|
Vectorizable = 1,
|
|
1848
2533
|
AlignedOnScalar = 1,
|
|
1849
2534
|
size = 16,
|
|
1850
|
-
HasHalfPacket = 1,
|
|
1851
2535
|
HasBlend = 0,
|
|
1852
2536
|
HasInsert = 1,
|
|
1853
2537
|
HasSin = EIGEN_FAST_MATH,
|
|
1854
2538
|
HasCos = EIGEN_FAST_MATH,
|
|
1855
|
-
|
|
2539
|
+
HasSqrt = 1,
|
|
2540
|
+
HasRsqrt = 1,
|
|
1856
2541
|
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
|
1857
2542
|
HasLog = 1, // Currently fails test with bad accuracy.
|
|
1858
|
-
HasLog1p
|
|
1859
|
-
HasExpm1
|
|
2543
|
+
HasLog1p = 1,
|
|
2544
|
+
HasExpm1 = 1,
|
|
1860
2545
|
HasNdtri = 1,
|
|
1861
|
-
HasBessel
|
|
2546
|
+
HasBessel = 1,
|
|
1862
2547
|
#endif
|
|
1863
2548
|
HasExp = 1,
|
|
1864
|
-
HasSqrt = EIGEN_FAST_MATH,
|
|
1865
|
-
HasRsqrt = EIGEN_FAST_MATH,
|
|
1866
2549
|
HasTanh = EIGEN_FAST_MATH,
|
|
1867
2550
|
HasErf = EIGEN_FAST_MATH,
|
|
1868
|
-
|
|
1869
|
-
HasCmp = 1,
|
|
2551
|
+
HasCmp = 1,
|
|
1870
2552
|
HasDiv = 1
|
|
1871
2553
|
};
|
|
1872
2554
|
};
|
|
1873
2555
|
|
|
1874
2556
|
template <>
|
|
1875
|
-
struct unpacket_traits<Packet16bf>
|
|
1876
|
-
{
|
|
2557
|
+
struct unpacket_traits<Packet16bf> {
|
|
1877
2558
|
typedef bfloat16 type;
|
|
1878
|
-
enum {
|
|
2559
|
+
enum {
|
|
2560
|
+
size = 16,
|
|
2561
|
+
alignment = Aligned32,
|
|
2562
|
+
vectorizable = true,
|
|
2563
|
+
masked_load_available = false,
|
|
2564
|
+
masked_store_available = false
|
|
2565
|
+
};
|
|
1879
2566
|
typedef Packet8bf half;
|
|
1880
2567
|
};
|
|
1881
2568
|
|
|
@@ -1902,20 +2589,19 @@ EIGEN_STRONG_INLINE Packet16bf ploadu<Packet16bf>(const bfloat16* from) {
|
|
|
1902
2589
|
}
|
|
1903
2590
|
|
|
1904
2591
|
template <>
|
|
1905
|
-
EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to,
|
|
1906
|
-
|
|
2592
|
+
EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet16bf& from) {
|
|
2593
|
+
EIGEN_DEBUG_ALIGNED_STORE
|
|
1907
2594
|
_mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
|
|
1908
2595
|
}
|
|
1909
2596
|
|
|
1910
2597
|
template <>
|
|
1911
|
-
EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to,
|
|
1912
|
-
|
|
2598
|
+
EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet16bf& from) {
|
|
2599
|
+
EIGEN_DEBUG_UNALIGNED_STORE
|
|
1913
2600
|
_mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
|
|
1914
2601
|
}
|
|
1915
2602
|
|
|
1916
|
-
template<>
|
|
1917
|
-
ploaddup<Packet16bf>(const bfloat16* from) {
|
|
1918
|
-
Packet16bf r;
|
|
2603
|
+
template <>
|
|
2604
|
+
EIGEN_STRONG_INLINE Packet16bf ploaddup<Packet16bf>(const bfloat16* from) {
|
|
1919
2605
|
unsigned short a = from[0].value;
|
|
1920
2606
|
unsigned short b = from[1].value;
|
|
1921
2607
|
unsigned short c = from[2].value;
|
|
@@ -1927,9 +2613,8 @@ ploaddup<Packet16bf>(const bfloat16* from) {
|
|
|
1927
2613
|
return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
|
|
1928
2614
|
}
|
|
1929
2615
|
|
|
1930
|
-
template<>
|
|
1931
|
-
ploadquad(const bfloat16* from) {
|
|
1932
|
-
Packet16bf r;
|
|
2616
|
+
template <>
|
|
2617
|
+
EIGEN_STRONG_INLINE Packet16bf ploadquad(const bfloat16* from) {
|
|
1933
2618
|
unsigned short a = from[0].value;
|
|
1934
2619
|
unsigned short b = from[1].value;
|
|
1935
2620
|
unsigned short c = from[2].value;
|
|
@@ -1945,9 +2630,9 @@ EIGEN_STRONG_INLINE Packet16f Bf16ToF32(const Packet16bf& a) {
|
|
|
1945
2630
|
EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) {
|
|
1946
2631
|
Packet16bf r;
|
|
1947
2632
|
|
|
1948
|
-
#if defined(EIGEN_VECTORIZE_AVX512BF16) &&
|
|
2633
|
+
#if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_STRICT_AT_LEAST(10, 1, 0)
|
|
1949
2634
|
// Since GCC 10.1 supports avx512bf16 and C style explicit cast
|
|
1950
|
-
// (C++ static_cast is not supported yet), do
|
|
2635
|
+
// (C++ static_cast is not supported yet), do conversion via intrinsic
|
|
1951
2636
|
// and register path for performance.
|
|
1952
2637
|
r = (__m256i)(_mm512_cvtneps_pbh(a));
|
|
1953
2638
|
|
|
@@ -1971,84 +2656,85 @@ EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) {
|
|
|
1971
2656
|
t = _mm512_mask_blend_epi32(mask, nan, t);
|
|
1972
2657
|
// output.value = static_cast<uint16_t>(input);
|
|
1973
2658
|
r = _mm512_cvtepi32_epi16(t);
|
|
1974
|
-
#endif
|
|
2659
|
+
#endif // EIGEN_VECTORIZE_AVX512BF16
|
|
1975
2660
|
|
|
1976
2661
|
return r;
|
|
1977
2662
|
}
|
|
1978
2663
|
|
|
1979
2664
|
template <>
|
|
1980
2665
|
EIGEN_STRONG_INLINE Packet16bf ptrue(const Packet16bf& a) {
|
|
1981
|
-
return ptrue<Packet8i>(a);
|
|
2666
|
+
return Packet16bf(ptrue<Packet8i>(Packet8i(a)));
|
|
1982
2667
|
}
|
|
1983
2668
|
|
|
1984
2669
|
template <>
|
|
1985
2670
|
EIGEN_STRONG_INLINE Packet16bf por(const Packet16bf& a, const Packet16bf& b) {
|
|
1986
|
-
return por<Packet8i>(a, b);
|
|
2671
|
+
return Packet16bf(por<Packet8i>(Packet8i(a), Packet8i(b)));
|
|
1987
2672
|
}
|
|
1988
2673
|
|
|
1989
2674
|
template <>
|
|
1990
2675
|
EIGEN_STRONG_INLINE Packet16bf pxor(const Packet16bf& a, const Packet16bf& b) {
|
|
1991
|
-
return pxor<Packet8i>(a, b);
|
|
2676
|
+
return Packet16bf(pxor<Packet8i>(Packet8i(a), Packet8i(b)));
|
|
1992
2677
|
}
|
|
1993
2678
|
|
|
1994
2679
|
template <>
|
|
1995
2680
|
EIGEN_STRONG_INLINE Packet16bf pand(const Packet16bf& a, const Packet16bf& b) {
|
|
1996
|
-
return pand<Packet8i>(a, b);
|
|
2681
|
+
return Packet16bf(pand<Packet8i>(Packet8i(a), Packet8i(b)));
|
|
1997
2682
|
}
|
|
1998
2683
|
|
|
1999
2684
|
template <>
|
|
2000
|
-
EIGEN_STRONG_INLINE Packet16bf pandnot(const Packet16bf& a,
|
|
2001
|
-
|
|
2002
|
-
return pandnot<Packet8i>(a, b);
|
|
2685
|
+
EIGEN_STRONG_INLINE Packet16bf pandnot(const Packet16bf& a, const Packet16bf& b) {
|
|
2686
|
+
return Packet16bf(pandnot<Packet8i>(Packet8i(a), Packet8i(b)));
|
|
2003
2687
|
}
|
|
2004
2688
|
|
|
2005
2689
|
template <>
|
|
2006
|
-
EIGEN_STRONG_INLINE Packet16bf pselect(const Packet16bf& mask,
|
|
2007
|
-
const Packet16bf& a,
|
|
2008
|
-
const Packet16bf& b) {
|
|
2690
|
+
EIGEN_STRONG_INLINE Packet16bf pselect(const Packet16bf& mask, const Packet16bf& a, const Packet16bf& b) {
|
|
2009
2691
|
// Input mask is expected to be all 0/1, handle it with 8-bit
|
|
2010
2692
|
// intrinsic for performance.
|
|
2011
2693
|
return _mm256_blendv_epi8(b, a, mask);
|
|
2012
2694
|
}
|
|
2013
2695
|
|
|
2014
|
-
template<>
|
|
2015
|
-
{
|
|
2696
|
+
template <>
|
|
2697
|
+
EIGEN_STRONG_INLINE Packet16bf pround<Packet16bf>(const Packet16bf& a) {
|
|
2016
2698
|
return F32ToBf16(pround<Packet16f>(Bf16ToF32(a)));
|
|
2017
2699
|
}
|
|
2018
2700
|
|
|
2019
|
-
template<>
|
|
2701
|
+
template <>
|
|
2702
|
+
EIGEN_STRONG_INLINE Packet16bf print<Packet16bf>(const Packet16bf& a) {
|
|
2020
2703
|
return F32ToBf16(print<Packet16f>(Bf16ToF32(a)));
|
|
2021
2704
|
}
|
|
2022
2705
|
|
|
2023
|
-
template<>
|
|
2706
|
+
template <>
|
|
2707
|
+
EIGEN_STRONG_INLINE Packet16bf pceil<Packet16bf>(const Packet16bf& a) {
|
|
2024
2708
|
return F32ToBf16(pceil<Packet16f>(Bf16ToF32(a)));
|
|
2025
2709
|
}
|
|
2026
2710
|
|
|
2027
|
-
template<>
|
|
2711
|
+
template <>
|
|
2712
|
+
EIGEN_STRONG_INLINE Packet16bf pfloor<Packet16bf>(const Packet16bf& a) {
|
|
2028
2713
|
return F32ToBf16(pfloor<Packet16f>(Bf16ToF32(a)));
|
|
2029
2714
|
}
|
|
2030
2715
|
|
|
2031
2716
|
template <>
|
|
2032
|
-
EIGEN_STRONG_INLINE Packet16bf
|
|
2033
|
-
|
|
2717
|
+
EIGEN_STRONG_INLINE Packet16bf ptrunc<Packet16bf>(const Packet16bf& a) {
|
|
2718
|
+
return F32ToBf16(ptrunc<Packet16f>(Bf16ToF32(a)));
|
|
2719
|
+
}
|
|
2720
|
+
|
|
2721
|
+
template <>
|
|
2722
|
+
EIGEN_STRONG_INLINE Packet16bf pcmp_eq(const Packet16bf& a, const Packet16bf& b) {
|
|
2034
2723
|
return Pack32To16(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b)));
|
|
2035
2724
|
}
|
|
2036
2725
|
|
|
2037
2726
|
template <>
|
|
2038
|
-
EIGEN_STRONG_INLINE Packet16bf pcmp_le(const Packet16bf& a,
|
|
2039
|
-
const Packet16bf& b) {
|
|
2727
|
+
EIGEN_STRONG_INLINE Packet16bf pcmp_le(const Packet16bf& a, const Packet16bf& b) {
|
|
2040
2728
|
return Pack32To16(pcmp_le(Bf16ToF32(a), Bf16ToF32(b)));
|
|
2041
2729
|
}
|
|
2042
2730
|
|
|
2043
2731
|
template <>
|
|
2044
|
-
EIGEN_STRONG_INLINE Packet16bf pcmp_lt(const Packet16bf& a,
|
|
2045
|
-
const Packet16bf& b) {
|
|
2732
|
+
EIGEN_STRONG_INLINE Packet16bf pcmp_lt(const Packet16bf& a, const Packet16bf& b) {
|
|
2046
2733
|
return Pack32To16(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b)));
|
|
2047
2734
|
}
|
|
2048
2735
|
|
|
2049
2736
|
template <>
|
|
2050
|
-
EIGEN_STRONG_INLINE Packet16bf pcmp_lt_or_nan(const Packet16bf& a,
|
|
2051
|
-
const Packet16bf& b) {
|
|
2737
|
+
EIGEN_STRONG_INLINE Packet16bf pcmp_lt_or_nan(const Packet16bf& a, const Packet16bf& b) {
|
|
2052
2738
|
return Pack32To16(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b)));
|
|
2053
2739
|
}
|
|
2054
2740
|
|
|
@@ -2070,77 +2756,71 @@ EIGEN_STRONG_INLINE Packet16bf pabs(const Packet16bf& a) {
|
|
|
2070
2756
|
}
|
|
2071
2757
|
|
|
2072
2758
|
template <>
|
|
2073
|
-
EIGEN_STRONG_INLINE Packet16bf padd<Packet16bf>(const Packet16bf& a,
|
|
2074
|
-
const Packet16bf& b) {
|
|
2759
|
+
EIGEN_STRONG_INLINE Packet16bf padd<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
|
|
2075
2760
|
return F32ToBf16(padd<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
|
|
2076
2761
|
}
|
|
2077
2762
|
|
|
2078
2763
|
template <>
|
|
2079
|
-
EIGEN_STRONG_INLINE Packet16bf psub<Packet16bf>(const Packet16bf& a,
|
|
2080
|
-
const Packet16bf& b) {
|
|
2764
|
+
EIGEN_STRONG_INLINE Packet16bf psub<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
|
|
2081
2765
|
return F32ToBf16(psub<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
|
|
2082
2766
|
}
|
|
2083
2767
|
|
|
2084
2768
|
template <>
|
|
2085
|
-
EIGEN_STRONG_INLINE Packet16bf pmul<Packet16bf>(const Packet16bf& a,
|
|
2086
|
-
|
|
2087
|
-
return F32ToBf16(pmul<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
|
|
2769
|
+
EIGEN_STRONG_INLINE Packet16bf pmul<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
|
|
2770
|
+
return F32ToBf16(pmul(Bf16ToF32(a), Bf16ToF32(b)));
|
|
2088
2771
|
}
|
|
2089
2772
|
|
|
2090
2773
|
template <>
|
|
2091
|
-
EIGEN_STRONG_INLINE Packet16bf
|
|
2092
|
-
|
|
2093
|
-
return F32ToBf16(pdiv<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
|
|
2774
|
+
EIGEN_STRONG_INLINE Packet16bf pmadd<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
|
|
2775
|
+
return F32ToBf16(pmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
|
|
2094
2776
|
}
|
|
2095
2777
|
|
|
2096
2778
|
template <>
|
|
2097
|
-
EIGEN_STRONG_INLINE Packet16bf
|
|
2098
|
-
|
|
2099
|
-
return F32ToBf16(pmin<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
|
|
2779
|
+
EIGEN_STRONG_INLINE Packet16bf pmsub<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
|
|
2780
|
+
return F32ToBf16(pmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
|
|
2100
2781
|
}
|
|
2101
2782
|
|
|
2102
2783
|
template <>
|
|
2103
|
-
EIGEN_STRONG_INLINE Packet16bf
|
|
2104
|
-
|
|
2105
|
-
return F32ToBf16(pmax<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
|
|
2784
|
+
EIGEN_STRONG_INLINE Packet16bf pnmadd<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
|
|
2785
|
+
return F32ToBf16(pnmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
|
|
2106
2786
|
}
|
|
2107
2787
|
|
|
2108
2788
|
template <>
|
|
2109
|
-
EIGEN_STRONG_INLINE Packet16bf
|
|
2110
|
-
return F32ToBf16(
|
|
2789
|
+
EIGEN_STRONG_INLINE Packet16bf pnmsub<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
|
|
2790
|
+
return F32ToBf16(pnmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
|
|
2111
2791
|
}
|
|
2112
2792
|
|
|
2113
2793
|
template <>
|
|
2114
|
-
EIGEN_STRONG_INLINE
|
|
2115
|
-
|
|
2116
|
-
Packet8bf lane1 = _mm256_extractf128_si256(a, 1);
|
|
2117
|
-
return padd<Packet8bf>(lane0, lane1);
|
|
2794
|
+
EIGEN_STRONG_INLINE Packet16bf pdiv<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
|
|
2795
|
+
return F32ToBf16(pdiv<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
|
|
2118
2796
|
}
|
|
2119
2797
|
|
|
2120
2798
|
template <>
|
|
2121
|
-
EIGEN_STRONG_INLINE
|
|
2122
|
-
return
|
|
2799
|
+
EIGEN_STRONG_INLINE Packet16bf pmin<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
|
|
2800
|
+
return F32ToBf16(pmin<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
|
|
2123
2801
|
}
|
|
2124
2802
|
|
|
2125
2803
|
template <>
|
|
2126
|
-
EIGEN_STRONG_INLINE
|
|
2127
|
-
return
|
|
2804
|
+
EIGEN_STRONG_INLINE Packet16bf pmax<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
|
|
2805
|
+
return F32ToBf16(pmax<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
|
|
2128
2806
|
}
|
|
2129
2807
|
|
|
2130
2808
|
template <>
|
|
2131
|
-
EIGEN_STRONG_INLINE
|
|
2132
|
-
return
|
|
2809
|
+
EIGEN_STRONG_INLINE Packet16bf plset<Packet16bf>(const bfloat16& a) {
|
|
2810
|
+
return F32ToBf16(plset<Packet16f>(static_cast<float>(a)));
|
|
2133
2811
|
}
|
|
2134
2812
|
|
|
2135
2813
|
template <>
|
|
2136
|
-
EIGEN_STRONG_INLINE
|
|
2137
|
-
|
|
2814
|
+
EIGEN_STRONG_INLINE Packet8bf predux_half_dowto4<Packet16bf>(const Packet16bf& a) {
|
|
2815
|
+
Packet8bf lane0 = _mm256_extractf128_si256(a, 0);
|
|
2816
|
+
Packet8bf lane1 = _mm256_extractf128_si256(a, 1);
|
|
2817
|
+
return padd<Packet8bf>(lane0, lane1);
|
|
2138
2818
|
}
|
|
2139
2819
|
|
|
2140
2820
|
template <>
|
|
2141
2821
|
EIGEN_STRONG_INLINE Packet16bf preverse(const Packet16bf& a) {
|
|
2142
|
-
__m256i m = _mm256_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,
|
|
2143
|
-
|
|
2822
|
+
__m256i m = _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7,
|
|
2823
|
+
4, 5, 2, 3, 0, 1);
|
|
2144
2824
|
|
|
2145
2825
|
Packet16bf res;
|
|
2146
2826
|
// Swap hi and lo first because shuffle is in 128-bit lanes.
|
|
@@ -2150,40 +2830,37 @@ EIGEN_STRONG_INLINE Packet16bf preverse(const Packet16bf& a) {
|
|
|
2150
2830
|
}
|
|
2151
2831
|
|
|
2152
2832
|
template <>
|
|
2153
|
-
EIGEN_STRONG_INLINE Packet16bf pgather<bfloat16, Packet16bf>(const bfloat16* from,
|
|
2154
|
-
Index stride) {
|
|
2833
|
+
EIGEN_STRONG_INLINE Packet16bf pgather<bfloat16, Packet16bf>(const bfloat16* from, Index stride) {
|
|
2155
2834
|
return _mm256_set_epi16(
|
|
2156
|
-
from[15*stride].value, from[14*stride].value, from[13*stride].value, from[12*stride].value,
|
|
2157
|
-
from[11*stride].value, from[10*stride].value, from[9*stride].value, from[8*stride].value,
|
|
2158
|
-
from[7*stride].value, from[6*stride].value, from[5*stride].value, from[4*stride].value,
|
|
2159
|
-
from[3*stride].value, from[2*stride].value, from[1*stride].value, from[0*stride].value);
|
|
2835
|
+
from[15 * stride].value, from[14 * stride].value, from[13 * stride].value, from[12 * stride].value,
|
|
2836
|
+
from[11 * stride].value, from[10 * stride].value, from[9 * stride].value, from[8 * stride].value,
|
|
2837
|
+
from[7 * stride].value, from[6 * stride].value, from[5 * stride].value, from[4 * stride].value,
|
|
2838
|
+
from[3 * stride].value, from[2 * stride].value, from[1 * stride].value, from[0 * stride].value);
|
|
2160
2839
|
}
|
|
2161
2840
|
|
|
2162
2841
|
template <>
|
|
2163
|
-
EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet16bf>(bfloat16* to,
|
|
2164
|
-
const Packet16bf& from,
|
|
2165
|
-
Index stride) {
|
|
2842
|
+
EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet16bf>(bfloat16* to, const Packet16bf& from, Index stride) {
|
|
2166
2843
|
EIGEN_ALIGN64 bfloat16 aux[16];
|
|
2167
2844
|
pstore(aux, from);
|
|
2168
|
-
to[stride*0] = aux[0];
|
|
2169
|
-
to[stride*1] = aux[1];
|
|
2170
|
-
to[stride*2] = aux[2];
|
|
2171
|
-
to[stride*3] = aux[3];
|
|
2172
|
-
to[stride*4] = aux[4];
|
|
2173
|
-
to[stride*5] = aux[5];
|
|
2174
|
-
to[stride*6] = aux[6];
|
|
2175
|
-
to[stride*7] = aux[7];
|
|
2176
|
-
to[stride*8] = aux[8];
|
|
2177
|
-
to[stride*9] = aux[9];
|
|
2178
|
-
to[stride*10] = aux[10];
|
|
2179
|
-
to[stride*11] = aux[11];
|
|
2180
|
-
to[stride*12] = aux[12];
|
|
2181
|
-
to[stride*13] = aux[13];
|
|
2182
|
-
to[stride*14] = aux[14];
|
|
2183
|
-
to[stride*15] = aux[15];
|
|
2184
|
-
}
|
|
2185
|
-
|
|
2186
|
-
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf,16>& kernel) {
|
|
2845
|
+
to[stride * 0] = aux[0];
|
|
2846
|
+
to[stride * 1] = aux[1];
|
|
2847
|
+
to[stride * 2] = aux[2];
|
|
2848
|
+
to[stride * 3] = aux[3];
|
|
2849
|
+
to[stride * 4] = aux[4];
|
|
2850
|
+
to[stride * 5] = aux[5];
|
|
2851
|
+
to[stride * 6] = aux[6];
|
|
2852
|
+
to[stride * 7] = aux[7];
|
|
2853
|
+
to[stride * 8] = aux[8];
|
|
2854
|
+
to[stride * 9] = aux[9];
|
|
2855
|
+
to[stride * 10] = aux[10];
|
|
2856
|
+
to[stride * 11] = aux[11];
|
|
2857
|
+
to[stride * 12] = aux[12];
|
|
2858
|
+
to[stride * 13] = aux[13];
|
|
2859
|
+
to[stride * 14] = aux[14];
|
|
2860
|
+
to[stride * 15] = aux[15];
|
|
2861
|
+
}
|
|
2862
|
+
|
|
2863
|
+
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf, 16>& kernel) {
|
|
2187
2864
|
__m256i a = kernel.packet[0];
|
|
2188
2865
|
__m256i b = kernel.packet[1];
|
|
2189
2866
|
__m256i c = kernel.packet[2];
|
|
@@ -2273,7 +2950,7 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf,16>& kernel) {
|
|
|
2273
2950
|
kernel.packet[15] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
|
|
2274
2951
|
}
|
|
2275
2952
|
|
|
2276
|
-
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf,4>& kernel) {
|
|
2953
|
+
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf, 4>& kernel) {
|
|
2277
2954
|
__m256i a = kernel.packet[0];
|
|
2278
2955
|
__m256i b = kernel.packet[1];
|
|
2279
2956
|
__m256i c = kernel.packet[2];
|
|
@@ -2296,8 +2973,174 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf,4>& kernel) {
|
|
|
2296
2973
|
kernel.packet[3] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x31);
|
|
2297
2974
|
}
|
|
2298
2975
|
|
|
2299
|
-
|
|
2976
|
+
// Minimal implementation of 16-bit int packets for use in pfrexp, pldexp.
|
|
2977
|
+
|
|
2978
|
+
template <>
|
|
2979
|
+
EIGEN_STRONG_INLINE Packet32s pset1<Packet32s>(const numext::int16_t& x) {
|
|
2980
|
+
return _mm512_set1_epi16(x);
|
|
2981
|
+
}
|
|
2982
|
+
|
|
2983
|
+
template <>
|
|
2984
|
+
EIGEN_STRONG_INLINE Packet16s pset1<Packet16s>(const numext::int16_t& x) {
|
|
2985
|
+
return _mm256_set1_epi16(x);
|
|
2986
|
+
}
|
|
2987
|
+
|
|
2988
|
+
template <>
|
|
2989
|
+
EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const numext::int16_t& x) {
|
|
2990
|
+
return _mm_set1_epi16(x);
|
|
2991
|
+
}
|
|
2992
|
+
|
|
2993
|
+
template <>
|
|
2994
|
+
EIGEN_STRONG_INLINE void pstore<numext::int16_t, Packet32s>(numext::int16_t* out, const Packet32s& x) {
|
|
2995
|
+
EIGEN_DEBUG_ALIGNED_STORE
|
|
2996
|
+
_mm512_store_epi32(out, x);
|
|
2997
|
+
}
|
|
2998
|
+
|
|
2999
|
+
template <>
|
|
3000
|
+
EIGEN_STRONG_INLINE void pstore<numext::int16_t, Packet16s>(numext::int16_t* out, const Packet16s& x) {
|
|
3001
|
+
EIGEN_DEBUG_ALIGNED_STORE
|
|
3002
|
+
#if defined(EIGEN_VECTORIZE_AVX512F) && defined(EIGEN_VECTORIZE_AVX512VL)
|
|
3003
|
+
_mm256_store_epi32(out, x);
|
|
3004
|
+
#else
|
|
3005
|
+
_mm256_store_si256(reinterpret_cast<__m256i*>(out), x);
|
|
3006
|
+
#endif
|
|
3007
|
+
}
|
|
3008
|
+
|
|
3009
|
+
template <>
|
|
3010
|
+
EIGEN_STRONG_INLINE void pstore<numext::int16_t, Packet8s>(numext::int16_t* out, const Packet8s& x) {
|
|
3011
|
+
EIGEN_DEBUG_ALIGNED_STORE
|
|
3012
|
+
#if defined(EIGEN_VECTORIZE_AVX512F) && defined(EIGEN_VECTORIZE_AVX512VL)
|
|
3013
|
+
_mm256_store_epi32(out, x);
|
|
3014
|
+
#else
|
|
3015
|
+
_mm_store_si128(reinterpret_cast<__m128i*>(out), x);
|
|
3016
|
+
#endif
|
|
3017
|
+
}
|
|
3018
|
+
|
|
3019
|
+
template <>
|
|
3020
|
+
EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet32s>(numext::int16_t* out, const Packet32s& x) {
|
|
3021
|
+
EIGEN_DEBUG_UNALIGNED_STORE
|
|
3022
|
+
_mm512_storeu_epi32(out, x);
|
|
3023
|
+
}
|
|
3024
|
+
|
|
3025
|
+
template <>
|
|
3026
|
+
EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet16s>(numext::int16_t* out, const Packet16s& x) {
|
|
3027
|
+
EIGEN_DEBUG_UNALIGNED_STORE
|
|
3028
|
+
_mm256_storeu_epi32(out, x);
|
|
3029
|
+
}
|
|
3030
|
+
|
|
3031
|
+
template <>
|
|
3032
|
+
EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet8s>(numext::int16_t* out, const Packet8s& x) {
|
|
3033
|
+
EIGEN_DEBUG_UNALIGNED_STORE
|
|
3034
|
+
_mm_storeu_epi32(out, x);
|
|
3035
|
+
}
|
|
3036
|
+
|
|
3037
|
+
template <>
|
|
3038
|
+
EIGEN_STRONG_INLINE Packet32s padd(const Packet32s& a, const Packet32s& b) {
|
|
3039
|
+
return _mm512_add_epi16(a, b);
|
|
3040
|
+
}
|
|
3041
|
+
|
|
3042
|
+
template <>
|
|
3043
|
+
EIGEN_STRONG_INLINE Packet16s padd(const Packet16s& a, const Packet16s& b) {
|
|
3044
|
+
return _mm256_add_epi16(a, b);
|
|
3045
|
+
}
|
|
3046
|
+
|
|
3047
|
+
template <>
|
|
3048
|
+
EIGEN_STRONG_INLINE Packet8s padd(const Packet8s& a, const Packet8s& b) {
|
|
3049
|
+
return _mm_add_epi16(a, b);
|
|
3050
|
+
}
|
|
3051
|
+
|
|
3052
|
+
template <>
|
|
3053
|
+
EIGEN_STRONG_INLINE Packet32s psub(const Packet32s& a, const Packet32s& b) {
|
|
3054
|
+
return _mm512_sub_epi16(a, b);
|
|
3055
|
+
}
|
|
3056
|
+
|
|
3057
|
+
template <>
|
|
3058
|
+
EIGEN_STRONG_INLINE Packet16s psub(const Packet16s& a, const Packet16s& b) {
|
|
3059
|
+
return _mm256_sub_epi16(a, b);
|
|
3060
|
+
}
|
|
3061
|
+
|
|
3062
|
+
template <>
|
|
3063
|
+
EIGEN_STRONG_INLINE Packet8s psub(const Packet8s& a, const Packet8s& b) {
|
|
3064
|
+
return _mm_sub_epi16(a, b);
|
|
3065
|
+
}
|
|
3066
|
+
|
|
3067
|
+
template <>
|
|
3068
|
+
EIGEN_STRONG_INLINE Packet32s pmul(const Packet32s& a, const Packet32s& b) {
|
|
3069
|
+
return _mm512_mullo_epi16(a, b);
|
|
3070
|
+
}
|
|
3071
|
+
|
|
3072
|
+
template <>
|
|
3073
|
+
EIGEN_STRONG_INLINE Packet16s pmul(const Packet16s& a, const Packet16s& b) {
|
|
3074
|
+
return _mm256_mullo_epi16(a, b);
|
|
3075
|
+
}
|
|
3076
|
+
|
|
3077
|
+
template <>
|
|
3078
|
+
EIGEN_STRONG_INLINE Packet8s pmul(const Packet8s& a, const Packet8s& b) {
|
|
3079
|
+
return _mm_mullo_epi16(a, b);
|
|
3080
|
+
}
|
|
3081
|
+
|
|
3082
|
+
template <>
|
|
3083
|
+
EIGEN_STRONG_INLINE Packet32s pnegate(const Packet32s& a) {
|
|
3084
|
+
return _mm512_sub_epi16(_mm512_setzero_si512(), a);
|
|
3085
|
+
}
|
|
3086
|
+
|
|
3087
|
+
template <>
|
|
3088
|
+
EIGEN_STRONG_INLINE Packet16s pnegate(const Packet16s& a) {
|
|
3089
|
+
return _mm256_sub_epi16(_mm256_setzero_si256(), a);
|
|
3090
|
+
}
|
|
3091
|
+
|
|
3092
|
+
template <>
|
|
3093
|
+
EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
|
|
3094
|
+
return _mm_sub_epi16(_mm_setzero_si128(), a);
|
|
3095
|
+
}
|
|
3096
|
+
|
|
3097
|
+
template <int N>
|
|
3098
|
+
EIGEN_STRONG_INLINE Packet32s parithmetic_shift_right(Packet32s a) {
|
|
3099
|
+
return _mm512_srai_epi16(a, N);
|
|
3100
|
+
}
|
|
3101
|
+
|
|
3102
|
+
template <int N>
|
|
3103
|
+
EIGEN_STRONG_INLINE Packet16s parithmetic_shift_right(Packet16s a) {
|
|
3104
|
+
return _mm256_srai_epi16(a, N);
|
|
3105
|
+
}
|
|
3106
|
+
|
|
3107
|
+
template <int N>
|
|
3108
|
+
EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) {
|
|
3109
|
+
return _mm_srai_epi16(a, N);
|
|
3110
|
+
}
|
|
3111
|
+
|
|
3112
|
+
template <int N>
|
|
3113
|
+
EIGEN_STRONG_INLINE Packet32s plogical_shift_left(Packet32s a) {
|
|
3114
|
+
return _mm512_slli_epi16(a, N);
|
|
3115
|
+
}
|
|
3116
|
+
|
|
3117
|
+
template <int N>
|
|
3118
|
+
EIGEN_STRONG_INLINE Packet16s plogical_shift_left(Packet16s a) {
|
|
3119
|
+
return _mm256_slli_epi16(a, N);
|
|
3120
|
+
}
|
|
3121
|
+
|
|
3122
|
+
template <int N>
|
|
3123
|
+
EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) {
|
|
3124
|
+
return _mm_slli_epi16(a, N);
|
|
3125
|
+
}
|
|
3126
|
+
|
|
3127
|
+
template <int N>
|
|
3128
|
+
EIGEN_STRONG_INLINE Packet32s plogical_shift_right(Packet32s a) {
|
|
3129
|
+
return _mm512_srli_epi16(a, N);
|
|
3130
|
+
}
|
|
3131
|
+
|
|
3132
|
+
template <int N>
|
|
3133
|
+
EIGEN_STRONG_INLINE Packet16s plogical_shift_right(Packet16s a) {
|
|
3134
|
+
return _mm256_srli_epi16(a, N);
|
|
3135
|
+
}
|
|
3136
|
+
|
|
3137
|
+
template <int N>
|
|
3138
|
+
EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a) {
|
|
3139
|
+
return _mm_srli_epi16(a, N);
|
|
3140
|
+
}
|
|
3141
|
+
|
|
3142
|
+
} // end namespace internal
|
|
2300
3143
|
|
|
2301
|
-
}
|
|
3144
|
+
} // end namespace Eigen
|
|
2302
3145
|
|
|
2303
|
-
#endif
|
|
3146
|
+
#endif // EIGEN_PACKET_MATH_AVX512_H
|