@smake/eigen 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/eigen/Eigen/AccelerateSupport +52 -0
- package/eigen/Eigen/Cholesky +18 -20
- package/eigen/Eigen/CholmodSupport +28 -28
- package/eigen/Eigen/Core +187 -120
- package/eigen/Eigen/Eigenvalues +16 -13
- package/eigen/Eigen/Geometry +18 -18
- package/eigen/Eigen/Householder +9 -7
- package/eigen/Eigen/IterativeLinearSolvers +8 -4
- package/eigen/Eigen/Jacobi +14 -13
- package/eigen/Eigen/KLUSupport +23 -21
- package/eigen/Eigen/LU +15 -16
- package/eigen/Eigen/MetisSupport +12 -12
- package/eigen/Eigen/OrderingMethods +54 -51
- package/eigen/Eigen/PaStiXSupport +23 -21
- package/eigen/Eigen/PardisoSupport +17 -14
- package/eigen/Eigen/QR +18 -20
- package/eigen/Eigen/QtAlignedMalloc +5 -12
- package/eigen/Eigen/SPQRSupport +21 -14
- package/eigen/Eigen/SVD +23 -17
- package/eigen/Eigen/Sparse +1 -2
- package/eigen/Eigen/SparseCholesky +18 -15
- package/eigen/Eigen/SparseCore +18 -17
- package/eigen/Eigen/SparseLU +9 -9
- package/eigen/Eigen/SparseQR +16 -14
- package/eigen/Eigen/StdDeque +5 -2
- package/eigen/Eigen/StdList +5 -2
- package/eigen/Eigen/StdVector +5 -2
- package/eigen/Eigen/SuperLUSupport +30 -24
- package/eigen/Eigen/ThreadPool +80 -0
- package/eigen/Eigen/UmfPackSupport +19 -17
- package/eigen/Eigen/Version +14 -0
- package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
- package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/LDLT.h +366 -405
- package/eigen/Eigen/src/Cholesky/LLT.h +323 -367
- package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
- package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +585 -529
- package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/ArithmeticSequence.h +143 -317
- package/eigen/Eigen/src/Core/Array.h +329 -370
- package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
- package/eigen/Eigen/src/Core/ArrayWrapper.h +126 -170
- package/eigen/Eigen/src/Core/Assign.h +30 -40
- package/eigen/Eigen/src/Core/AssignEvaluator.h +651 -604
- package/eigen/Eigen/src/Core/Assign_MKL.h +125 -120
- package/eigen/Eigen/src/Core/BandMatrix.h +267 -282
- package/eigen/Eigen/src/Core/Block.h +371 -390
- package/eigen/Eigen/src/Core/CommaInitializer.h +85 -100
- package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
- package/eigen/Eigen/src/Core/CoreEvaluators.h +1214 -937
- package/eigen/Eigen/src/Core/CoreIterators.h +72 -63
- package/eigen/Eigen/src/Core/CwiseBinaryOp.h +112 -129
- package/eigen/Eigen/src/Core/CwiseNullaryOp.h +676 -702
- package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
- package/eigen/Eigen/src/Core/CwiseUnaryOp.h +55 -67
- package/eigen/Eigen/src/Core/CwiseUnaryView.h +127 -92
- package/eigen/Eigen/src/Core/DenseBase.h +630 -658
- package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -628
- package/eigen/Eigen/src/Core/DenseStorage.h +511 -590
- package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
- package/eigen/Eigen/src/Core/Diagonal.h +168 -207
- package/eigen/Eigen/src/Core/DiagonalMatrix.h +346 -317
- package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
- package/eigen/Eigen/src/Core/Dot.h +167 -217
- package/eigen/Eigen/src/Core/EigenBase.h +74 -85
- package/eigen/Eigen/src/Core/Fill.h +138 -0
- package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
- package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -113
- package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
- package/eigen/Eigen/src/Core/GeneralProduct.h +315 -261
- package/eigen/Eigen/src/Core/GenericPacketMath.h +1182 -520
- package/eigen/Eigen/src/Core/GlobalFunctions.h +193 -157
- package/eigen/Eigen/src/Core/IO.h +131 -156
- package/eigen/Eigen/src/Core/IndexedView.h +209 -125
- package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
- package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/Inverse.h +50 -59
- package/eigen/Eigen/src/Core/Map.h +123 -141
- package/eigen/Eigen/src/Core/MapBase.h +255 -282
- package/eigen/Eigen/src/Core/MathFunctions.h +1247 -1201
- package/eigen/Eigen/src/Core/MathFunctionsImpl.h +162 -99
- package/eigen/Eigen/src/Core/Matrix.h +463 -494
- package/eigen/Eigen/src/Core/MatrixBase.h +468 -470
- package/eigen/Eigen/src/Core/NestByValue.h +58 -52
- package/eigen/Eigen/src/Core/NoAlias.h +79 -86
- package/eigen/Eigen/src/Core/NumTraits.h +206 -206
- package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +163 -142
- package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
- package/eigen/Eigen/src/Core/PlainObjectBase.h +858 -972
- package/eigen/Eigen/src/Core/Product.h +246 -130
- package/eigen/Eigen/src/Core/ProductEvaluators.h +779 -671
- package/eigen/Eigen/src/Core/Random.h +153 -164
- package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
- package/eigen/Eigen/src/Core/RealView.h +250 -0
- package/eigen/Eigen/src/Core/Redux.h +334 -314
- package/eigen/Eigen/src/Core/Ref.h +259 -257
- package/eigen/Eigen/src/Core/Replicate.h +92 -104
- package/eigen/Eigen/src/Core/Reshaped.h +215 -271
- package/eigen/Eigen/src/Core/ReturnByValue.h +47 -55
- package/eigen/Eigen/src/Core/Reverse.h +133 -148
- package/eigen/Eigen/src/Core/Select.h +68 -140
- package/eigen/Eigen/src/Core/SelfAdjointView.h +254 -290
- package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
- package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
- package/eigen/Eigen/src/Core/Solve.h +88 -102
- package/eigen/Eigen/src/Core/SolveTriangular.h +126 -124
- package/eigen/Eigen/src/Core/SolverBase.h +132 -133
- package/eigen/Eigen/src/Core/StableNorm.h +113 -147
- package/eigen/Eigen/src/Core/StlIterators.h +404 -248
- package/eigen/Eigen/src/Core/Stride.h +90 -92
- package/eigen/Eigen/src/Core/Swap.h +70 -39
- package/eigen/Eigen/src/Core/Transpose.h +258 -295
- package/eigen/Eigen/src/Core/Transpositions.h +270 -333
- package/eigen/Eigen/src/Core/TriangularMatrix.h +642 -743
- package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
- package/eigen/Eigen/src/Core/VectorwiseOp.h +653 -704
- package/eigen/Eigen/src/Core/Visitor.h +464 -308
- package/eigen/Eigen/src/Core/arch/AVX/Complex.h +380 -187
- package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +65 -163
- package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2145 -638
- package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
- package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +253 -60
- package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +278 -228
- package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +48 -269
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1597 -754
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
- package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +229 -41
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +420 -184
- package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +40 -49
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2962 -2213
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +196 -212
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +713 -441
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2380 -1362
- package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
- package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +390 -224
- package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +78 -67
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1784 -799
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +167 -50
- package/eigen/Eigen/src/Core/arch/Default/Half.h +528 -379
- package/eigen/Eigen/src/Core/arch/Default/Settings.h +10 -12
- package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
- package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +41 -40
- package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +550 -523
- package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
- package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +27 -30
- package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +8 -8
- package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
- package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
- package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
- package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
- package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
- package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
- package/eigen/Eigen/src/Core/arch/MSA/Complex.h +54 -82
- package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +84 -92
- package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +51 -47
- package/eigen/Eigen/src/Core/arch/NEON/Complex.h +454 -306
- package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +175 -115
- package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +23 -30
- package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4366 -2857
- package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +616 -393
- package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
- package/eigen/Eigen/src/Core/arch/SSE/Complex.h +350 -198
- package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +38 -149
- package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +1791 -912
- package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
- package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +128 -40
- package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +10 -6
- package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +156 -234
- package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +6 -3
- package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +27 -32
- package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +119 -117
- package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +325 -419
- package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +15 -17
- package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +325 -181
- package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +94 -83
- package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +811 -458
- package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +121 -124
- package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +576 -370
- package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +194 -109
- package/eigen/Eigen/src/Core/functors/StlFunctors.h +95 -112
- package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
- package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1038 -749
- package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1883 -1375
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +312 -370
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +189 -176
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +84 -81
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +292 -337
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
- package/eigen/Eigen/src/Core/products/Parallelizer.h +207 -105
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +327 -388
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +138 -147
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
- package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
- package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -47
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -277
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
- package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +68 -94
- package/eigen/Eigen/src/Core/util/Assert.h +158 -0
- package/eigen/Eigen/src/Core/util/BlasUtil.h +342 -303
- package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +348 -317
- package/eigen/Eigen/src/Core/util/Constants.h +297 -262
- package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -90
- package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
- package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +449 -247
- package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
- package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
- package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +417 -116
- package/eigen/Eigen/src/Core/util/IntegralConstant.h +211 -204
- package/eigen/Eigen/src/Core/util/MKL_support.h +39 -37
- package/eigen/Eigen/src/Core/util/Macros.h +655 -773
- package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
- package/eigen/Eigen/src/Core/util/Memory.h +970 -748
- package/eigen/Eigen/src/Core/util/Meta.h +581 -633
- package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
- package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
- package/eigen/Eigen/src/Core/util/ReshapedHelper.h +17 -17
- package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
- package/eigen/Eigen/src/Core/util/StaticAssert.h +50 -166
- package/eigen/Eigen/src/Core/util/SymbolicIndex.h +377 -225
- package/eigen/Eigen/src/Core/util/XprHelper.h +784 -547
- package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
- package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
- package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
- package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
- package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
- package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +89 -105
- package/eigen/Eigen/src/Eigenvalues/RealQZ.h +537 -607
- package/eigen/Eigen/src/Eigenvalues/RealSchur.h +342 -381
- package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +541 -595
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
- package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +430 -462
- package/eigen/Eigen/src/Geometry/AlignedBox.h +226 -227
- package/eigen/Eigen/src/Geometry/AngleAxis.h +131 -133
- package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
- package/eigen/Eigen/src/Geometry/Homogeneous.h +285 -333
- package/eigen/Eigen/src/Geometry/Hyperplane.h +151 -160
- package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -146
- package/eigen/Eigen/src/Geometry/ParametrizedLine.h +127 -127
- package/eigen/Eigen/src/Geometry/Quaternion.h +566 -506
- package/eigen/Eigen/src/Geometry/Rotation2D.h +107 -105
- package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
- package/eigen/Eigen/src/Geometry/Scaling.h +113 -106
- package/eigen/Eigen/src/Geometry/Transform.h +858 -936
- package/eigen/Eigen/src/Geometry/Translation.h +94 -92
- package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
- package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +90 -104
- package/eigen/Eigen/src/Householder/BlockHouseholder.h +51 -46
- package/eigen/Eigen/src/Householder/Householder.h +102 -124
- package/eigen/Eigen/src/Householder/HouseholderSequence.h +412 -453
- package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +149 -162
- package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +124 -119
- package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +92 -104
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +251 -243
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +224 -228
- package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +178 -227
- package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +79 -84
- package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +54 -60
- package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Jacobi/Jacobi.h +252 -308
- package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/KLUSupport/KLUSupport.h +208 -227
- package/eigen/Eigen/src/LU/Determinant.h +50 -69
- package/eigen/Eigen/src/LU/FullPivLU.h +545 -596
- package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/LU/InverseImpl.h +206 -285
- package/eigen/Eigen/src/LU/PartialPivLU.h +390 -428
- package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
- package/eigen/Eigen/src/LU/arch/InverseSize4.h +72 -70
- package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
- package/eigen/Eigen/src/OrderingMethods/Amd.h +243 -265
- package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +831 -1004
- package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/OrderingMethods/Ordering.h +112 -119
- package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
- package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -430
- package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +479 -479
- package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
- package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +166 -153
- package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +495 -475
- package/eigen/Eigen/src/QR/HouseholderQR.h +394 -285
- package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
- package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +244 -264
- package/eigen/Eigen/src/SVD/BDCSVD.h +817 -713
- package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
- package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SVD/JacobiSVD.h +577 -543
- package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
- package/eigen/Eigen/src/SVD/SVDBase.h +242 -182
- package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +200 -235
- package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +765 -594
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +308 -94
- package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
- package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -252
- package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +134 -178
- package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCore/SparseAssign.h +149 -140
- package/eigen/Eigen/src/SparseCore/SparseBlock.h +403 -440
- package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
- package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +525 -303
- package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +555 -339
- package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
- package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +169 -197
- package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
- package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
- package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
- package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
- package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1603 -1245
- package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -350
- package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
- package/eigen/Eigen/src/SparseCore/SparseProduct.h +94 -97
- package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
- package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
- package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +370 -416
- package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
- package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
- package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
- package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
- package/eigen/Eigen/src/SparseCore/SparseUtil.h +138 -115
- package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
- package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
- package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
- package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseLU/SparseLU.h +756 -710
- package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
- package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
- package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
- package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +245 -301
- package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
- package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
- package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +89 -100
- package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
- package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
- package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +124 -132
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
- package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
- package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
- package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
- package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseQR/SparseQR.h +450 -502
- package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -93
- package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
- package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
- package/eigen/Eigen/src/StlSupport/details.h +48 -50
- package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -730
- package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
- package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
- package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
- package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
- package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
- package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
- package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
- package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
- package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
- package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
- package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
- package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
- package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +428 -464
- package/eigen/Eigen/src/misc/Image.h +41 -43
- package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/misc/Kernel.h +39 -41
- package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
- package/eigen/Eigen/src/misc/blas.h +83 -426
- package/eigen/Eigen/src/misc/lapacke.h +9972 -16179
- package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
- package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
- package/eigen/Eigen/src/plugins/{BlockMethods.h → BlockMethods.inc} +434 -506
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
- package/eigen/Eigen/src/plugins/{CommonCwiseUnaryOps.h → CommonCwiseUnaryOps.inc} +58 -68
- package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
- package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
- package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
- package/package.json +1 -1
- package/eigen/COPYING.APACHE +0 -203
- package/eigen/COPYING.BSD +0 -26
- package/eigen/COPYING.GPL +0 -674
- package/eigen/COPYING.LGPL +0 -502
- package/eigen/COPYING.MINPACK +0 -51
- package/eigen/COPYING.MPL2 +0 -373
- package/eigen/COPYING.README +0 -18
- package/eigen/Eigen/src/Core/BooleanRedux.h +0 -162
- package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -258
- package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +0 -120
- package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +0 -694
- package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
- package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
- package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
- package/eigen/Eigen/src/misc/lapack.h +0 -152
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -358
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -696
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
- package/eigen/Eigen/src/plugins/IndexedViewMethods.h +0 -262
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -95
- package/eigen/Eigen/src/plugins/ReshapedMethods.h +0 -149
- package/eigen/README.md +0 -5
|
@@ -10,6 +10,9 @@
|
|
|
10
10
|
#ifndef EIGEN_PACKET_MATH_ALTIVEC_H
|
|
11
11
|
#define EIGEN_PACKET_MATH_ALTIVEC_H
|
|
12
12
|
|
|
13
|
+
// IWYU pragma: private
|
|
14
|
+
#include "../../InternalHeaderCheck.h"
|
|
15
|
+
|
|
13
16
|
namespace Eigen {
|
|
14
17
|
|
|
15
18
|
namespace internal {
|
|
@@ -24,127 +27,137 @@ namespace internal {
|
|
|
24
27
|
|
|
25
28
|
// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
|
|
26
29
|
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
|
|
27
|
-
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
|
|
30
|
+
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
|
|
28
31
|
#endif
|
|
29
32
|
|
|
30
|
-
typedef __vector float
|
|
31
|
-
typedef __vector int
|
|
32
|
-
typedef __vector unsigned int
|
|
33
|
-
typedef __vector __bool int
|
|
34
|
-
typedef __vector short int
|
|
35
|
-
typedef __vector unsigned short int
|
|
36
|
-
typedef __vector
|
|
37
|
-
typedef __vector
|
|
38
|
-
typedef
|
|
33
|
+
typedef __vector float Packet4f;
|
|
34
|
+
typedef __vector int Packet4i;
|
|
35
|
+
typedef __vector unsigned int Packet4ui;
|
|
36
|
+
typedef __vector __bool int Packet4bi;
|
|
37
|
+
typedef __vector short int Packet8s;
|
|
38
|
+
typedef __vector unsigned short int Packet8us;
|
|
39
|
+
typedef __vector __bool short Packet8bi;
|
|
40
|
+
typedef __vector signed char Packet16c;
|
|
41
|
+
typedef __vector unsigned char Packet16uc;
|
|
42
|
+
typedef eigen_packet_wrapper<__vector unsigned short int, 0> Packet8bf;
|
|
39
43
|
|
|
40
44
|
// We don't want to write the same code all the time, but we need to reuse the constants
|
|
41
45
|
// and it doesn't really work to declare them global, so we define macros instead
|
|
42
|
-
#define
|
|
43
|
-
Packet4f p4f_##NAME = {X, X, X, X}
|
|
46
|
+
#define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME, X) Packet4f p4f_##NAME = {X, X, X, X}
|
|
44
47
|
|
|
45
|
-
#define
|
|
46
|
-
Packet4i p4i_##NAME = vec_splat_s32(X)
|
|
48
|
+
#define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME, X) Packet4i p4i_##NAME = vec_splat_s32(X)
|
|
47
49
|
|
|
48
|
-
#define
|
|
49
|
-
Packet4ui p4ui_##NAME = {X, X, X, X}
|
|
50
|
+
#define EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME, X) Packet4ui p4ui_##NAME = {X, X, X, X}
|
|
50
51
|
|
|
51
|
-
#define
|
|
52
|
-
Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}
|
|
52
|
+
#define EIGEN_DECLARE_CONST_FAST_Packet8us(NAME, X) Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}
|
|
53
53
|
|
|
54
|
-
#define
|
|
54
|
+
#define EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME, X) \
|
|
55
55
|
Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}
|
|
56
56
|
|
|
57
|
-
#define
|
|
58
|
-
Packet4f p4f_##NAME = pset1<Packet4f>(X)
|
|
57
|
+
#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) Packet4f p4f_##NAME = pset1<Packet4f>(X)
|
|
59
58
|
|
|
60
|
-
#define
|
|
61
|
-
Packet4i p4i_##NAME = pset1<Packet4i>(X)
|
|
59
|
+
#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) Packet4i p4i_##NAME = pset1<Packet4i>(X)
|
|
62
60
|
|
|
63
|
-
#define
|
|
64
|
-
Packet2d p2d_##NAME = pset1<Packet2d>(X)
|
|
61
|
+
#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) Packet2d p2d_##NAME = pset1<Packet2d>(X)
|
|
65
62
|
|
|
66
|
-
#define
|
|
67
|
-
Packet2l p2l_##NAME = pset1<Packet2l>(X)
|
|
63
|
+
#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) Packet2l p2l_##NAME = pset1<Packet2l>(X)
|
|
68
64
|
|
|
69
|
-
#define
|
|
65
|
+
#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
|
|
70
66
|
const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
|
|
71
67
|
|
|
72
68
|
#define DST_CHAN 1
|
|
73
69
|
#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
|
|
74
|
-
#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type
|
|
70
|
+
#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type
|
|
75
71
|
|
|
76
72
|
// These constants are endian-agnostic
|
|
77
|
-
static
|
|
78
|
-
static
|
|
79
|
-
static
|
|
80
|
-
static
|
|
81
|
-
static
|
|
82
|
-
static
|
|
83
|
-
static
|
|
84
|
-
static
|
|
85
|
-
static
|
|
86
|
-
|
|
73
|
+
static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
|
|
74
|
+
static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
|
|
75
|
+
static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
|
|
76
|
+
static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16, -16); //{ -16, -16, -16, -16}
|
|
77
|
+
static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1, -1); //{ -1, -1, -1, -1}
|
|
78
|
+
static EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u);
|
|
79
|
+
static EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
|
|
80
|
+
static EIGEN_DECLARE_CONST_FAST_Packet8us(ONE, 1); //{ 1, 1, 1, 1, 1, 1, 1, 1}
|
|
81
|
+
static Packet4f p4f_MZERO =
|
|
82
|
+
(Packet4f)vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
|
|
87
83
|
#ifndef __VSX__
|
|
88
|
-
static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);
|
|
84
|
+
static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
|
|
89
85
|
#endif
|
|
90
86
|
|
|
91
|
-
static Packet4f
|
|
92
|
-
static Packet4i
|
|
93
|
-
static Packet8s
|
|
94
|
-
static Packet8us p8us_COUNTDOWN = {
|
|
87
|
+
static Packet4f p4f_COUNTDOWN = {0.0, 1.0, 2.0, 3.0};
|
|
88
|
+
static Packet4i p4i_COUNTDOWN = {0, 1, 2, 3};
|
|
89
|
+
static Packet8s p8s_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7};
|
|
90
|
+
static Packet8us p8us_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7};
|
|
91
|
+
|
|
92
|
+
static Packet16c p16c_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
|
93
|
+
static Packet16uc p16uc_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
|
95
94
|
|
|
96
|
-
static
|
|
97
|
-
|
|
98
|
-
static Packet16uc
|
|
99
|
-
8, 9, 10, 11, 12, 13, 14, 15};
|
|
95
|
+
static Packet16uc p16uc_REVERSE32 = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
|
|
96
|
+
static Packet16uc p16uc_REVERSE16 = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
|
|
97
|
+
static Packet16uc p16uc_REVERSE8 = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
|
|
100
98
|
|
|
101
|
-
|
|
102
|
-
static Packet16uc
|
|
103
|
-
|
|
99
|
+
#ifdef _BIG_ENDIAN
|
|
100
|
+
static Packet16uc p16uc_DUPLICATE32_HI = {0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7};
|
|
101
|
+
#endif
|
|
102
|
+
static const Packet16uc p16uc_DUPLICATE16_EVEN = {0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13};
|
|
103
|
+
static const Packet16uc p16uc_DUPLICATE16_ODD = {2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15};
|
|
104
104
|
|
|
105
|
-
static Packet16uc
|
|
106
|
-
static Packet16uc
|
|
107
|
-
static Packet16uc p16uc_DUPLICATE8_HI = { 0,0, 1,1, 2,2, 3,3, 4,4, 5,5, 6,6, 7,7 };
|
|
108
|
-
static const Packet16uc p16uc_DUPLICATE16_EVEN= { 0,1 ,0,1, 4,5, 4,5, 8,9, 8,9, 12,13, 12,13 };
|
|
109
|
-
static const Packet16uc p16uc_DUPLICATE16_ODD = { 2,3 ,2,3, 6,7, 6,7, 10,11, 10,11, 14,15, 14,15 };
|
|
105
|
+
static Packet16uc p16uc_QUADRUPLICATE16_HI = {0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3};
|
|
106
|
+
static Packet16uc p16uc_QUADRUPLICATE16 = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3};
|
|
110
107
|
|
|
111
|
-
static Packet16uc
|
|
108
|
+
static Packet16uc p16uc_MERGEE16 = {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
|
|
109
|
+
static Packet16uc p16uc_MERGEO16 = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
|
|
110
|
+
#ifdef _BIG_ENDIAN
|
|
111
|
+
static Packet16uc p16uc_MERGEH16 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
|
|
112
|
+
#else
|
|
113
|
+
static Packet16uc p16uc_MERGEL16 = {2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
|
|
114
|
+
#endif
|
|
112
115
|
|
|
113
116
|
// Handle endianness properly while loading constants
|
|
114
117
|
// Define global static constants:
|
|
115
118
|
#ifdef _BIG_ENDIAN
|
|
116
119
|
static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
static Packet16uc
|
|
121
|
-
|
|
122
|
-
static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc)
|
|
120
|
+
static Packet16uc p16uc_PSET32_WODD =
|
|
121
|
+
vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
|
|
122
|
+
8); //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
|
|
123
|
+
static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
|
|
124
|
+
8); //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
|
|
125
|
+
static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc)vec_abs(p4i_MINUS16), 3),
|
|
126
|
+
8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
|
|
123
127
|
#else
|
|
124
128
|
static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
|
|
125
|
-
static Packet16uc
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
static Packet16uc
|
|
129
|
-
|
|
129
|
+
static Packet16uc p16uc_PSET32_WODD =
|
|
130
|
+
vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
|
|
131
|
+
8); //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
|
|
132
|
+
static Packet16uc p16uc_PSET32_WEVEN =
|
|
133
|
+
vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
|
|
134
|
+
8); //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
|
|
135
|
+
static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc)vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO,
|
|
136
|
+
8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
|
|
137
|
+
#endif // _BIG_ENDIAN
|
|
138
|
+
|
|
139
|
+
static Packet16uc p16uc_PSET64_HI = (Packet16uc)vec_mergeh(
|
|
140
|
+
(Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
|
|
141
|
+
static Packet16uc p16uc_PSET64_LO = (Packet16uc)vec_mergel(
|
|
142
|
+
(Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
|
|
143
|
+
static Packet16uc p16uc_TRANSPOSE64_HI =
|
|
144
|
+
p16uc_PSET64_HI + p16uc_HALF64_0_16; //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
|
|
145
|
+
static Packet16uc p16uc_TRANSPOSE64_LO =
|
|
146
|
+
p16uc_PSET64_LO + p16uc_HALF64_0_16; //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
|
|
147
|
+
|
|
148
|
+
static Packet16uc p16uc_COMPLEX32_REV =
|
|
149
|
+
vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
|
|
130
150
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16; //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
|
|
134
|
-
static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16; //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
|
|
135
|
-
|
|
136
|
-
static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
|
|
137
|
-
|
|
138
|
-
#ifdef _BIG_ENDIAN
|
|
139
|
-
static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
|
|
151
|
+
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
|
|
152
|
+
#define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
|
|
140
153
|
#else
|
|
141
|
-
|
|
142
|
-
#endif
|
|
154
|
+
#define EIGEN_PPC_PREFETCH(ADDR) asm(" dcbt [%[addr]]\n" ::[addr] "r"(ADDR) : "cc");
|
|
155
|
+
#endif
|
|
143
156
|
|
|
144
|
-
#if
|
|
145
|
-
|
|
157
|
+
#if EIGEN_COMP_LLVM
|
|
158
|
+
#define LOAD_STORE_UNROLL_16 _Pragma("unroll 16")
|
|
146
159
|
#else
|
|
147
|
-
|
|
160
|
+
#define LOAD_STORE_UNROLL_16 _Pragma("GCC unroll(16)")
|
|
148
161
|
#endif
|
|
149
162
|
|
|
150
163
|
template <>
|
|
@@ -155,7 +168,6 @@ struct packet_traits<float> : default_packet_traits {
|
|
|
155
168
|
Vectorizable = 1,
|
|
156
169
|
AlignedOnScalar = 1,
|
|
157
170
|
size = 4,
|
|
158
|
-
HasHalfPacket = 1,
|
|
159
171
|
|
|
160
172
|
HasAdd = 1,
|
|
161
173
|
HasSub = 1,
|
|
@@ -166,25 +178,31 @@ struct packet_traits<float> : default_packet_traits {
|
|
|
166
178
|
HasAbs = 1,
|
|
167
179
|
HasSin = EIGEN_FAST_MATH,
|
|
168
180
|
HasCos = EIGEN_FAST_MATH,
|
|
181
|
+
HasACos = 1,
|
|
182
|
+
HasASin = 1,
|
|
183
|
+
HasATan = 1,
|
|
184
|
+
HasATanh = 1,
|
|
169
185
|
HasLog = 1,
|
|
170
186
|
HasExp = 1,
|
|
171
|
-
#ifdef
|
|
187
|
+
#ifdef EIGEN_VECTORIZE_VSX
|
|
188
|
+
HasCmp = 1,
|
|
189
|
+
HasPow = 1,
|
|
172
190
|
HasSqrt = 1,
|
|
191
|
+
HasCbrt = 1,
|
|
173
192
|
#if !EIGEN_COMP_CLANG
|
|
174
193
|
HasRsqrt = 1,
|
|
175
194
|
#else
|
|
176
195
|
HasRsqrt = 0,
|
|
177
196
|
#endif
|
|
197
|
+
HasTanh = EIGEN_FAST_MATH,
|
|
198
|
+
HasErf = EIGEN_FAST_MATH,
|
|
199
|
+
HasErfc = EIGEN_FAST_MATH,
|
|
178
200
|
#else
|
|
179
201
|
HasSqrt = 0,
|
|
180
202
|
HasRsqrt = 0,
|
|
181
|
-
HasTanh =
|
|
182
|
-
HasErf =
|
|
203
|
+
HasTanh = 0,
|
|
204
|
+
HasErf = 0,
|
|
183
205
|
#endif
|
|
184
|
-
HasRound = 1,
|
|
185
|
-
HasFloor = 1,
|
|
186
|
-
HasCeil = 1,
|
|
187
|
-
HasRint = 1,
|
|
188
206
|
HasNegate = 1,
|
|
189
207
|
HasBlend = 1
|
|
190
208
|
};
|
|
@@ -197,7 +215,6 @@ struct packet_traits<bfloat16> : default_packet_traits {
|
|
|
197
215
|
Vectorizable = 1,
|
|
198
216
|
AlignedOnScalar = 1,
|
|
199
217
|
size = 8,
|
|
200
|
-
HasHalfPacket = 0,
|
|
201
218
|
|
|
202
219
|
HasAdd = 1,
|
|
203
220
|
HasSub = 1,
|
|
@@ -210,7 +227,7 @@ struct packet_traits<bfloat16> : default_packet_traits {
|
|
|
210
227
|
HasCos = EIGEN_FAST_MATH,
|
|
211
228
|
HasLog = 1,
|
|
212
229
|
HasExp = 1,
|
|
213
|
-
#ifdef
|
|
230
|
+
#ifdef EIGEN_VECTORIZE_VSX
|
|
214
231
|
HasSqrt = 1,
|
|
215
232
|
#if !EIGEN_COMP_CLANG
|
|
216
233
|
HasRsqrt = 1,
|
|
@@ -220,13 +237,9 @@ struct packet_traits<bfloat16> : default_packet_traits {
|
|
|
220
237
|
#else
|
|
221
238
|
HasSqrt = 0,
|
|
222
239
|
HasRsqrt = 0,
|
|
223
|
-
HasTanh = EIGEN_FAST_MATH,
|
|
224
|
-
HasErf = EIGEN_FAST_MATH,
|
|
225
240
|
#endif
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
HasCeil = 1,
|
|
229
|
-
HasRint = 1,
|
|
241
|
+
HasTanh = 0,
|
|
242
|
+
HasErf = 0,
|
|
230
243
|
HasNegate = 1,
|
|
231
244
|
HasBlend = 1
|
|
232
245
|
};
|
|
@@ -240,14 +253,18 @@ struct packet_traits<int> : default_packet_traits {
|
|
|
240
253
|
Vectorizable = 1,
|
|
241
254
|
AlignedOnScalar = 1,
|
|
242
255
|
size = 4,
|
|
243
|
-
HasHalfPacket = 0,
|
|
244
256
|
|
|
245
|
-
HasAdd
|
|
246
|
-
HasSub
|
|
257
|
+
HasAdd = 1,
|
|
258
|
+
HasSub = 1,
|
|
247
259
|
HasShift = 1,
|
|
248
|
-
HasMul
|
|
249
|
-
|
|
250
|
-
|
|
260
|
+
HasMul = 1,
|
|
261
|
+
#if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
|
|
262
|
+
HasDiv = 1,
|
|
263
|
+
#else
|
|
264
|
+
HasDiv = 0,
|
|
265
|
+
#endif
|
|
266
|
+
HasBlend = 1,
|
|
267
|
+
HasCmp = 1
|
|
251
268
|
};
|
|
252
269
|
};
|
|
253
270
|
|
|
@@ -259,13 +276,13 @@ struct packet_traits<short int> : default_packet_traits {
|
|
|
259
276
|
Vectorizable = 1,
|
|
260
277
|
AlignedOnScalar = 1,
|
|
261
278
|
size = 8,
|
|
262
|
-
HasHalfPacket = 0,
|
|
263
279
|
|
|
264
|
-
HasAdd
|
|
265
|
-
HasSub
|
|
266
|
-
HasMul
|
|
267
|
-
HasDiv
|
|
268
|
-
HasBlend = 1
|
|
280
|
+
HasAdd = 1,
|
|
281
|
+
HasSub = 1,
|
|
282
|
+
HasMul = 1,
|
|
283
|
+
HasDiv = 0,
|
|
284
|
+
HasBlend = 1,
|
|
285
|
+
HasCmp = 1
|
|
269
286
|
};
|
|
270
287
|
};
|
|
271
288
|
|
|
@@ -277,13 +294,13 @@ struct packet_traits<unsigned short int> : default_packet_traits {
|
|
|
277
294
|
Vectorizable = 1,
|
|
278
295
|
AlignedOnScalar = 1,
|
|
279
296
|
size = 8,
|
|
280
|
-
HasHalfPacket = 0,
|
|
281
297
|
|
|
282
|
-
HasAdd
|
|
283
|
-
HasSub
|
|
284
|
-
HasMul
|
|
285
|
-
HasDiv
|
|
286
|
-
HasBlend = 1
|
|
298
|
+
HasAdd = 1,
|
|
299
|
+
HasSub = 1,
|
|
300
|
+
HasMul = 1,
|
|
301
|
+
HasDiv = 0,
|
|
302
|
+
HasBlend = 1,
|
|
303
|
+
HasCmp = 1
|
|
287
304
|
};
|
|
288
305
|
};
|
|
289
306
|
|
|
@@ -295,13 +312,13 @@ struct packet_traits<signed char> : default_packet_traits {
|
|
|
295
312
|
Vectorizable = 1,
|
|
296
313
|
AlignedOnScalar = 1,
|
|
297
314
|
size = 16,
|
|
298
|
-
HasHalfPacket = 0,
|
|
299
315
|
|
|
300
|
-
HasAdd
|
|
301
|
-
HasSub
|
|
302
|
-
HasMul
|
|
303
|
-
HasDiv
|
|
304
|
-
HasBlend = 1
|
|
316
|
+
HasAdd = 1,
|
|
317
|
+
HasSub = 1,
|
|
318
|
+
HasMul = 1,
|
|
319
|
+
HasDiv = 0,
|
|
320
|
+
HasBlend = 1,
|
|
321
|
+
HasCmp = 1
|
|
305
322
|
};
|
|
306
323
|
};
|
|
307
324
|
|
|
@@ -313,273 +330,442 @@ struct packet_traits<unsigned char> : default_packet_traits {
|
|
|
313
330
|
Vectorizable = 1,
|
|
314
331
|
AlignedOnScalar = 1,
|
|
315
332
|
size = 16,
|
|
316
|
-
HasHalfPacket = 0,
|
|
317
333
|
|
|
318
|
-
HasAdd
|
|
319
|
-
HasSub
|
|
320
|
-
HasMul
|
|
321
|
-
HasDiv
|
|
322
|
-
HasBlend = 1
|
|
334
|
+
HasAdd = 1,
|
|
335
|
+
HasSub = 1,
|
|
336
|
+
HasMul = 1,
|
|
337
|
+
HasDiv = 0,
|
|
338
|
+
HasBlend = 1,
|
|
339
|
+
HasCmp = 1
|
|
323
340
|
};
|
|
324
341
|
};
|
|
325
342
|
|
|
326
|
-
template<>
|
|
327
|
-
{
|
|
328
|
-
typedef float
|
|
329
|
-
typedef Packet4f
|
|
330
|
-
typedef Packet4i
|
|
331
|
-
enum {
|
|
343
|
+
template <>
|
|
344
|
+
struct unpacket_traits<Packet4f> {
|
|
345
|
+
typedef float type;
|
|
346
|
+
typedef Packet4f half;
|
|
347
|
+
typedef Packet4i integer_packet;
|
|
348
|
+
enum {
|
|
349
|
+
size = 4,
|
|
350
|
+
alignment = Aligned16,
|
|
351
|
+
vectorizable = true,
|
|
352
|
+
masked_load_available = false,
|
|
353
|
+
masked_store_available = false
|
|
354
|
+
};
|
|
332
355
|
};
|
|
333
|
-
template<>
|
|
334
|
-
{
|
|
335
|
-
typedef int
|
|
336
|
-
typedef Packet4i
|
|
337
|
-
enum {
|
|
356
|
+
template <>
|
|
357
|
+
struct unpacket_traits<Packet4i> {
|
|
358
|
+
typedef int type;
|
|
359
|
+
typedef Packet4i half;
|
|
360
|
+
enum {
|
|
361
|
+
size = 4,
|
|
362
|
+
alignment = Aligned16,
|
|
363
|
+
vectorizable = true,
|
|
364
|
+
masked_load_available = false,
|
|
365
|
+
masked_store_available = false
|
|
366
|
+
};
|
|
338
367
|
};
|
|
339
|
-
template<>
|
|
340
|
-
{
|
|
368
|
+
template <>
|
|
369
|
+
struct unpacket_traits<Packet8s> {
|
|
341
370
|
typedef short int type;
|
|
342
|
-
typedef Packet8s
|
|
343
|
-
enum {
|
|
371
|
+
typedef Packet8s half;
|
|
372
|
+
enum {
|
|
373
|
+
size = 8,
|
|
374
|
+
alignment = Aligned16,
|
|
375
|
+
vectorizable = true,
|
|
376
|
+
masked_load_available = false,
|
|
377
|
+
masked_store_available = false
|
|
378
|
+
};
|
|
344
379
|
};
|
|
345
|
-
template<>
|
|
346
|
-
{
|
|
380
|
+
template <>
|
|
381
|
+
struct unpacket_traits<Packet8us> {
|
|
347
382
|
typedef unsigned short int type;
|
|
348
|
-
typedef Packet8us
|
|
349
|
-
enum {
|
|
383
|
+
typedef Packet8us half;
|
|
384
|
+
enum {
|
|
385
|
+
size = 8,
|
|
386
|
+
alignment = Aligned16,
|
|
387
|
+
vectorizable = true,
|
|
388
|
+
masked_load_available = false,
|
|
389
|
+
masked_store_available = false
|
|
390
|
+
};
|
|
350
391
|
};
|
|
351
392
|
|
|
352
|
-
template<>
|
|
353
|
-
{
|
|
393
|
+
template <>
|
|
394
|
+
struct unpacket_traits<Packet16c> {
|
|
354
395
|
typedef signed char type;
|
|
355
|
-
typedef Packet16c
|
|
356
|
-
enum {
|
|
396
|
+
typedef Packet16c half;
|
|
397
|
+
enum {
|
|
398
|
+
size = 16,
|
|
399
|
+
alignment = Aligned16,
|
|
400
|
+
vectorizable = true,
|
|
401
|
+
masked_load_available = false,
|
|
402
|
+
masked_store_available = false
|
|
403
|
+
};
|
|
357
404
|
};
|
|
358
|
-
template<>
|
|
359
|
-
{
|
|
405
|
+
template <>
|
|
406
|
+
struct unpacket_traits<Packet16uc> {
|
|
360
407
|
typedef unsigned char type;
|
|
361
|
-
typedef Packet16uc
|
|
362
|
-
enum {
|
|
408
|
+
typedef Packet16uc half;
|
|
409
|
+
enum {
|
|
410
|
+
size = 16,
|
|
411
|
+
alignment = Aligned16,
|
|
412
|
+
vectorizable = true,
|
|
413
|
+
masked_load_available = false,
|
|
414
|
+
masked_store_available = false
|
|
415
|
+
};
|
|
363
416
|
};
|
|
364
417
|
|
|
365
|
-
template<>
|
|
366
|
-
{
|
|
418
|
+
template <>
|
|
419
|
+
struct unpacket_traits<Packet8bf> {
|
|
367
420
|
typedef bfloat16 type;
|
|
368
|
-
typedef Packet8bf
|
|
369
|
-
enum {
|
|
421
|
+
typedef Packet8bf half;
|
|
422
|
+
enum {
|
|
423
|
+
size = 8,
|
|
424
|
+
alignment = Aligned16,
|
|
425
|
+
vectorizable = true,
|
|
426
|
+
masked_load_available = false,
|
|
427
|
+
masked_store_available = false
|
|
428
|
+
};
|
|
370
429
|
};
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
430
|
+
|
|
431
|
+
template <typename Packet>
|
|
432
|
+
EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet) * from) {
|
|
433
|
+
// some versions of GCC throw "unused-but-set-parameter".
|
|
434
|
+
// ignoring these warnings for now.
|
|
435
|
+
EIGEN_UNUSED_VARIABLE(from);
|
|
436
|
+
EIGEN_DEBUG_ALIGNED_LOAD
|
|
437
|
+
#ifdef EIGEN_VECTORIZE_VSX
|
|
438
|
+
return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
|
|
439
|
+
#else
|
|
440
|
+
return vec_ld(0, from);
|
|
441
|
+
#endif
|
|
381
442
|
}
|
|
382
443
|
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
unsigned char n[16];
|
|
388
|
-
} vt;
|
|
389
|
-
vt.v = v;
|
|
390
|
-
for (int i=0; i< 16; i++)
|
|
391
|
-
s << vt.n[i] << ", ";
|
|
392
|
-
return s;
|
|
444
|
+
// Need to define them first or we get specialization after instantiation errors
|
|
445
|
+
template <>
|
|
446
|
+
EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
|
|
447
|
+
return pload_common<Packet4f>(from);
|
|
393
448
|
}
|
|
394
449
|
|
|
395
|
-
|
|
396
|
-
{
|
|
397
|
-
|
|
398
|
-
Packet4f v;
|
|
399
|
-
float n[4];
|
|
400
|
-
} vt;
|
|
401
|
-
vt.v = v;
|
|
402
|
-
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
|
|
403
|
-
return s;
|
|
450
|
+
template <>
|
|
451
|
+
EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
|
|
452
|
+
return pload_common<Packet4i>(from);
|
|
404
453
|
}
|
|
405
454
|
|
|
406
|
-
|
|
407
|
-
{
|
|
408
|
-
|
|
409
|
-
Packet4i v;
|
|
410
|
-
int n[4];
|
|
411
|
-
} vt;
|
|
412
|
-
vt.v = v;
|
|
413
|
-
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
|
|
414
|
-
return s;
|
|
455
|
+
template <>
|
|
456
|
+
EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from) {
|
|
457
|
+
return pload_common<Packet8s>(from);
|
|
415
458
|
}
|
|
416
459
|
|
|
417
|
-
|
|
418
|
-
{
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
460
|
+
template <>
|
|
461
|
+
EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from) {
|
|
462
|
+
return pload_common<Packet8us>(from);
|
|
463
|
+
}
|
|
464
|
+
|
|
465
|
+
template <>
|
|
466
|
+
EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const signed char* from) {
|
|
467
|
+
return pload_common<Packet16c>(from);
|
|
468
|
+
}
|
|
469
|
+
|
|
470
|
+
template <>
|
|
471
|
+
EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const unsigned char* from) {
|
|
472
|
+
return pload_common<Packet16uc>(from);
|
|
473
|
+
}
|
|
474
|
+
|
|
475
|
+
template <>
|
|
476
|
+
EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) {
|
|
477
|
+
return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
|
|
426
478
|
}
|
|
427
479
|
|
|
428
480
|
template <typename Packet>
|
|
429
|
-
|
|
430
|
-
{
|
|
481
|
+
EIGEN_ALWAYS_INLINE Packet pload_ignore(const __UNPACK_TYPE__(Packet) * from) {
|
|
431
482
|
// some versions of GCC throw "unused-but-set-parameter".
|
|
432
483
|
// ignoring these warnings for now.
|
|
433
484
|
EIGEN_UNUSED_VARIABLE(from);
|
|
434
485
|
EIGEN_DEBUG_ALIGNED_LOAD
|
|
435
|
-
|
|
486
|
+
// Ignore partial input memory initialized
|
|
487
|
+
#if !EIGEN_COMP_LLVM
|
|
488
|
+
#pragma GCC diagnostic push
|
|
489
|
+
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
|
|
490
|
+
#endif
|
|
491
|
+
#ifdef EIGEN_VECTORIZE_VSX
|
|
436
492
|
return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
|
|
437
493
|
#else
|
|
438
494
|
return vec_ld(0, from);
|
|
439
495
|
#endif
|
|
496
|
+
#if !EIGEN_COMP_LLVM
|
|
497
|
+
#pragma GCC diagnostic pop
|
|
498
|
+
#endif
|
|
440
499
|
}
|
|
441
500
|
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
return pload_common<Packet4f>(from);
|
|
501
|
+
template <>
|
|
502
|
+
EIGEN_ALWAYS_INLINE Packet8bf pload_ignore<Packet8bf>(const bfloat16* from) {
|
|
503
|
+
return pload_ignore<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
|
|
446
504
|
}
|
|
447
505
|
|
|
448
|
-
template
|
|
449
|
-
|
|
450
|
-
|
|
506
|
+
template <typename Packet>
|
|
507
|
+
EIGEN_ALWAYS_INLINE Packet pload_partial_common(const __UNPACK_TYPE__(Packet) * from, const Index n,
|
|
508
|
+
const Index offset) {
|
|
509
|
+
// some versions of GCC throw "unused-but-set-parameter".
|
|
510
|
+
// ignoring these warnings for now.
|
|
511
|
+
const Index packet_size = unpacket_traits<Packet>::size;
|
|
512
|
+
eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
|
|
513
|
+
const Index size = sizeof(__UNPACK_TYPE__(Packet));
|
|
514
|
+
#ifdef _ARCH_PWR9
|
|
515
|
+
EIGEN_UNUSED_VARIABLE(packet_size);
|
|
516
|
+
EIGEN_DEBUG_ALIGNED_LOAD
|
|
517
|
+
EIGEN_UNUSED_VARIABLE(from);
|
|
518
|
+
Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
|
|
519
|
+
if (offset) {
|
|
520
|
+
Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
|
|
521
|
+
#ifdef _BIG_ENDIAN
|
|
522
|
+
load = Packet(vec_sro(Packet16uc(load), shift));
|
|
523
|
+
#else
|
|
524
|
+
load = Packet(vec_slo(Packet16uc(load), shift));
|
|
525
|
+
#endif
|
|
526
|
+
}
|
|
527
|
+
return load;
|
|
528
|
+
#else
|
|
529
|
+
if (n) {
|
|
530
|
+
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
|
|
531
|
+
unsigned char* load2 = reinterpret_cast<unsigned char*>(load + offset);
|
|
532
|
+
unsigned char* from2 = reinterpret_cast<unsigned char*>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
|
|
533
|
+
Index n2 = n * size;
|
|
534
|
+
if (16 <= n2) {
|
|
535
|
+
pstoreu(load2, ploadu<Packet16uc>(from2));
|
|
536
|
+
} else {
|
|
537
|
+
memcpy((void*)load2, (void*)from2, n2);
|
|
538
|
+
}
|
|
539
|
+
return pload_ignore<Packet>(load);
|
|
540
|
+
} else {
|
|
541
|
+
return Packet(pset1<Packet16uc>(0));
|
|
542
|
+
}
|
|
543
|
+
#endif
|
|
451
544
|
}
|
|
452
545
|
|
|
453
|
-
template<>
|
|
454
|
-
{
|
|
455
|
-
return
|
|
546
|
+
template <>
|
|
547
|
+
EIGEN_ALWAYS_INLINE Packet4f pload_partial<Packet4f>(const float* from, const Index n, const Index offset) {
|
|
548
|
+
return pload_partial_common<Packet4f>(from, n, offset);
|
|
456
549
|
}
|
|
457
550
|
|
|
458
|
-
template<>
|
|
459
|
-
{
|
|
460
|
-
return
|
|
551
|
+
template <>
|
|
552
|
+
EIGEN_ALWAYS_INLINE Packet4i pload_partial<Packet4i>(const int* from, const Index n, const Index offset) {
|
|
553
|
+
return pload_partial_common<Packet4i>(from, n, offset);
|
|
461
554
|
}
|
|
462
555
|
|
|
463
|
-
template<>
|
|
464
|
-
{
|
|
465
|
-
return
|
|
556
|
+
template <>
|
|
557
|
+
EIGEN_ALWAYS_INLINE Packet8s pload_partial<Packet8s>(const short int* from, const Index n, const Index offset) {
|
|
558
|
+
return pload_partial_common<Packet8s>(from, n, offset);
|
|
466
559
|
}
|
|
467
560
|
|
|
468
|
-
template<>
|
|
469
|
-
|
|
470
|
-
|
|
561
|
+
template <>
|
|
562
|
+
EIGEN_ALWAYS_INLINE Packet8us pload_partial<Packet8us>(const unsigned short int* from, const Index n,
|
|
563
|
+
const Index offset) {
|
|
564
|
+
return pload_partial_common<Packet8us>(from, n, offset);
|
|
471
565
|
}
|
|
472
566
|
|
|
473
|
-
template<>
|
|
474
|
-
{
|
|
475
|
-
return
|
|
567
|
+
template <>
|
|
568
|
+
EIGEN_ALWAYS_INLINE Packet8bf pload_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset) {
|
|
569
|
+
return pload_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
|
|
570
|
+
}
|
|
571
|
+
|
|
572
|
+
template <>
|
|
573
|
+
EIGEN_ALWAYS_INLINE Packet16c pload_partial<Packet16c>(const signed char* from, const Index n, const Index offset) {
|
|
574
|
+
return pload_partial_common<Packet16c>(from, n, offset);
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
template <>
|
|
578
|
+
EIGEN_ALWAYS_INLINE Packet16uc pload_partial<Packet16uc>(const unsigned char* from, const Index n, const Index offset) {
|
|
579
|
+
return pload_partial_common<Packet16uc>(from, n, offset);
|
|
476
580
|
}
|
|
477
581
|
|
|
478
582
|
template <typename Packet>
|
|
479
|
-
EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){
|
|
583
|
+
EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet) * to, const Packet& from) {
|
|
480
584
|
// some versions of GCC throw "unused-but-set-parameter" (float *to).
|
|
481
585
|
// ignoring these warnings for now.
|
|
482
586
|
EIGEN_UNUSED_VARIABLE(to);
|
|
483
587
|
EIGEN_DEBUG_ALIGNED_STORE
|
|
484
|
-
#ifdef
|
|
588
|
+
#ifdef EIGEN_VECTORIZE_VSX
|
|
485
589
|
vec_xst(from, 0, to);
|
|
486
590
|
#else
|
|
487
591
|
vec_st(from, 0, to);
|
|
488
592
|
#endif
|
|
489
593
|
}
|
|
490
594
|
|
|
491
|
-
template<>
|
|
492
|
-
{
|
|
595
|
+
template <>
|
|
596
|
+
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
|
|
493
597
|
pstore_common<Packet4f>(to, from);
|
|
494
598
|
}
|
|
495
599
|
|
|
496
|
-
template<>
|
|
497
|
-
{
|
|
600
|
+
template <>
|
|
601
|
+
EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
|
|
498
602
|
pstore_common<Packet4i>(to, from);
|
|
499
603
|
}
|
|
500
604
|
|
|
501
|
-
template<>
|
|
502
|
-
{
|
|
605
|
+
template <>
|
|
606
|
+
EIGEN_STRONG_INLINE void pstore<short int>(short int* to, const Packet8s& from) {
|
|
503
607
|
pstore_common<Packet8s>(to, from);
|
|
504
608
|
}
|
|
505
609
|
|
|
506
|
-
template<>
|
|
507
|
-
{
|
|
610
|
+
template <>
|
|
611
|
+
EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int* to, const Packet8us& from) {
|
|
508
612
|
pstore_common<Packet8us>(to, from);
|
|
509
613
|
}
|
|
510
614
|
|
|
511
|
-
template<>
|
|
512
|
-
{
|
|
513
|
-
pstore_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from);
|
|
615
|
+
template <>
|
|
616
|
+
EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) {
|
|
617
|
+
pstore_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val);
|
|
514
618
|
}
|
|
515
619
|
|
|
516
|
-
template<>
|
|
517
|
-
{
|
|
620
|
+
template <>
|
|
621
|
+
EIGEN_STRONG_INLINE void pstore<signed char>(signed char* to, const Packet16c& from) {
|
|
518
622
|
pstore_common<Packet16c>(to, from);
|
|
519
623
|
}
|
|
520
624
|
|
|
521
|
-
template<>
|
|
522
|
-
{
|
|
625
|
+
template <>
|
|
626
|
+
EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char* to, const Packet16uc& from) {
|
|
523
627
|
pstore_common<Packet16uc>(to, from);
|
|
524
628
|
}
|
|
525
629
|
|
|
526
|
-
template<typename Packet>
|
|
527
|
-
|
|
528
|
-
{
|
|
630
|
+
template <typename Packet>
|
|
631
|
+
EIGEN_ALWAYS_INLINE void pstore_partial_common(__UNPACK_TYPE__(Packet) * to, const Packet& from, const Index n,
|
|
632
|
+
const Index offset) {
|
|
633
|
+
// some versions of GCC throw "unused-but-set-parameter" (float *to).
|
|
634
|
+
// ignoring these warnings for now.
|
|
635
|
+
const Index packet_size = unpacket_traits<Packet>::size;
|
|
636
|
+
eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
|
|
637
|
+
const Index size = sizeof(__UNPACK_TYPE__(Packet));
|
|
638
|
+
#ifdef _ARCH_PWR9
|
|
639
|
+
EIGEN_UNUSED_VARIABLE(packet_size);
|
|
640
|
+
EIGEN_UNUSED_VARIABLE(to);
|
|
641
|
+
EIGEN_DEBUG_ALIGNED_STORE
|
|
642
|
+
Packet store = from;
|
|
643
|
+
if (offset) {
|
|
644
|
+
Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
|
|
645
|
+
#ifdef _BIG_ENDIAN
|
|
646
|
+
store = Packet(vec_slo(Packet16uc(store), shift));
|
|
647
|
+
#else
|
|
648
|
+
store = Packet(vec_sro(Packet16uc(store), shift));
|
|
649
|
+
#endif
|
|
650
|
+
}
|
|
651
|
+
vec_xst_len(store, to, n * size);
|
|
652
|
+
#else
|
|
653
|
+
if (n) {
|
|
654
|
+
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
|
|
655
|
+
pstore(store, from);
|
|
656
|
+
unsigned char* store2 = reinterpret_cast<unsigned char*>(store + offset);
|
|
657
|
+
unsigned char* to2 = reinterpret_cast<unsigned char*>(to);
|
|
658
|
+
Index n2 = n * size;
|
|
659
|
+
if (16 <= n2) {
|
|
660
|
+
pstore(to2, ploadu<Packet16uc>(store2));
|
|
661
|
+
} else {
|
|
662
|
+
memcpy((void*)to2, (void*)store2, n2);
|
|
663
|
+
}
|
|
664
|
+
}
|
|
665
|
+
#endif
|
|
666
|
+
}
|
|
667
|
+
|
|
668
|
+
template <>
|
|
669
|
+
EIGEN_ALWAYS_INLINE void pstore_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset) {
|
|
670
|
+
pstore_partial_common<Packet4f>(to, from, n, offset);
|
|
671
|
+
}
|
|
672
|
+
|
|
673
|
+
template <>
|
|
674
|
+
EIGEN_ALWAYS_INLINE void pstore_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset) {
|
|
675
|
+
pstore_partial_common<Packet4i>(to, from, n, offset);
|
|
676
|
+
}
|
|
677
|
+
|
|
678
|
+
template <>
|
|
679
|
+
EIGEN_ALWAYS_INLINE void pstore_partial<short int>(short int* to, const Packet8s& from, const Index n,
|
|
680
|
+
const Index offset) {
|
|
681
|
+
pstore_partial_common<Packet8s>(to, from, n, offset);
|
|
682
|
+
}
|
|
683
|
+
|
|
684
|
+
template <>
|
|
685
|
+
EIGEN_ALWAYS_INLINE void pstore_partial<unsigned short int>(unsigned short int* to, const Packet8us& from,
|
|
686
|
+
const Index n, const Index offset) {
|
|
687
|
+
pstore_partial_common<Packet8us>(to, from, n, offset);
|
|
688
|
+
}
|
|
689
|
+
|
|
690
|
+
template <>
|
|
691
|
+
EIGEN_ALWAYS_INLINE void pstore_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n,
|
|
692
|
+
const Index offset) {
|
|
693
|
+
pstore_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val, n, offset);
|
|
694
|
+
}
|
|
695
|
+
|
|
696
|
+
template <>
|
|
697
|
+
EIGEN_ALWAYS_INLINE void pstore_partial<signed char>(signed char* to, const Packet16c& from, const Index n,
|
|
698
|
+
const Index offset) {
|
|
699
|
+
pstore_partial_common<Packet16c>(to, from, n, offset);
|
|
700
|
+
}
|
|
701
|
+
|
|
702
|
+
template <>
|
|
703
|
+
EIGEN_ALWAYS_INLINE void pstore_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n,
|
|
704
|
+
const Index offset) {
|
|
705
|
+
pstore_partial_common<Packet16uc>(to, from, n, offset);
|
|
706
|
+
}
|
|
707
|
+
|
|
708
|
+
template <typename Packet>
|
|
709
|
+
EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet) & from) {
|
|
529
710
|
Packet v = {from, from, from, from};
|
|
530
711
|
return v;
|
|
531
712
|
}
|
|
532
713
|
|
|
533
|
-
template<typename Packet>
|
|
534
|
-
EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet)& from)
|
|
535
|
-
{
|
|
714
|
+
template <typename Packet>
|
|
715
|
+
EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet) & from) {
|
|
536
716
|
Packet v = {from, from, from, from, from, from, from, from};
|
|
537
717
|
return v;
|
|
538
718
|
}
|
|
539
719
|
|
|
540
|
-
template<typename Packet>
|
|
541
|
-
EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet)& from)
|
|
542
|
-
{
|
|
720
|
+
template <typename Packet>
|
|
721
|
+
EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet) & from) {
|
|
543
722
|
Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from};
|
|
544
723
|
return v;
|
|
545
724
|
}
|
|
546
725
|
|
|
547
|
-
template<>
|
|
726
|
+
template <>
|
|
727
|
+
EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
|
|
548
728
|
return pset1_size4<Packet4f>(from);
|
|
549
729
|
}
|
|
550
730
|
|
|
551
|
-
template<>
|
|
731
|
+
template <>
|
|
732
|
+
EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
|
|
552
733
|
return pset1_size4<Packet4i>(from);
|
|
553
734
|
}
|
|
554
735
|
|
|
555
|
-
template<>
|
|
736
|
+
template <>
|
|
737
|
+
EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int& from) {
|
|
556
738
|
return pset1_size8<Packet8s>(from);
|
|
557
739
|
}
|
|
558
740
|
|
|
559
|
-
template<>
|
|
741
|
+
template <>
|
|
742
|
+
EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int& from) {
|
|
560
743
|
return pset1_size8<Packet8us>(from);
|
|
561
744
|
}
|
|
562
745
|
|
|
563
|
-
template<>
|
|
746
|
+
template <>
|
|
747
|
+
EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const signed char& from) {
|
|
564
748
|
return pset1_size16<Packet16c>(from);
|
|
565
749
|
}
|
|
566
750
|
|
|
567
|
-
template<>
|
|
751
|
+
template <>
|
|
752
|
+
EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const unsigned char& from) {
|
|
568
753
|
return pset1_size16<Packet16uc>(from);
|
|
569
754
|
}
|
|
570
755
|
|
|
571
|
-
template<>
|
|
756
|
+
template <>
|
|
757
|
+
EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
|
|
572
758
|
return reinterpret_cast<Packet4f>(pset1<Packet4i>(from));
|
|
573
759
|
}
|
|
574
760
|
|
|
575
|
-
template<>
|
|
761
|
+
template <>
|
|
762
|
+
EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
|
|
576
763
|
return pset1_size8<Packet8us>(reinterpret_cast<const unsigned short int&>(from));
|
|
577
764
|
}
|
|
578
765
|
|
|
579
|
-
template<typename Packet>
|
|
580
|
-
pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a,
|
|
581
|
-
|
|
582
|
-
{
|
|
766
|
+
template <typename Packet>
|
|
767
|
+
EIGEN_STRONG_INLINE void pbroadcast4_common(const __UNPACK_TYPE__(Packet) * a, Packet& a0, Packet& a1, Packet& a2,
|
|
768
|
+
Packet& a3) {
|
|
583
769
|
a3 = pload<Packet>(a);
|
|
584
770
|
a0 = vec_splat(a3, 0);
|
|
585
771
|
a1 = vec_splat(a3, 1);
|
|
@@ -587,781 +773,1514 @@ pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a,
|
|
|
587
773
|
a3 = vec_splat(a3, 3);
|
|
588
774
|
}
|
|
589
775
|
|
|
590
|
-
template<>
|
|
591
|
-
pbroadcast4<Packet4f>(const float
|
|
592
|
-
Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
|
|
593
|
-
{
|
|
776
|
+
template <>
|
|
777
|
+
EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
|
|
594
778
|
pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
|
|
595
779
|
}
|
|
596
|
-
template<>
|
|
597
|
-
pbroadcast4<Packet4i>(const int
|
|
598
|
-
Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
|
|
599
|
-
{
|
|
780
|
+
template <>
|
|
781
|
+
EIGEN_STRONG_INLINE void pbroadcast4<Packet4i>(const int* a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) {
|
|
600
782
|
pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
|
|
601
783
|
}
|
|
602
784
|
|
|
603
|
-
template<typename Packet>
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
785
|
+
template <typename Packet>
|
|
786
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_common(const __UNPACK_TYPE__(Packet) * from, Index stride,
|
|
787
|
+
const Index n = unpacket_traits<Packet>::size) {
|
|
788
|
+
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
|
|
789
|
+
eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
|
|
790
|
+
if (stride == 1) {
|
|
791
|
+
if (n == unpacket_traits<Packet>::size) {
|
|
792
|
+
return ploadu<Packet>(from);
|
|
793
|
+
} else {
|
|
794
|
+
return ploadu_partial<Packet>(from, n);
|
|
795
|
+
}
|
|
796
|
+
} else {
|
|
797
|
+
LOAD_STORE_UNROLL_16
|
|
798
|
+
for (Index i = 0; i < n; i++) {
|
|
799
|
+
a[i] = from[i * stride];
|
|
800
|
+
}
|
|
801
|
+
// Leave rest of the array uninitialized
|
|
802
|
+
return pload_ignore<Packet>(a);
|
|
803
|
+
}
|
|
611
804
|
}
|
|
612
805
|
|
|
613
|
-
template<>
|
|
614
|
-
{
|
|
806
|
+
template <>
|
|
807
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
|
|
615
808
|
return pgather_common<Packet4f>(from, stride);
|
|
616
809
|
}
|
|
617
810
|
|
|
618
|
-
template<>
|
|
619
|
-
{
|
|
811
|
+
template <>
|
|
812
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
|
|
620
813
|
return pgather_common<Packet4i>(from, stride);
|
|
621
814
|
}
|
|
622
815
|
|
|
623
|
-
template
|
|
624
|
-
{
|
|
625
|
-
|
|
626
|
-
a[0] = from[0*stride];
|
|
627
|
-
a[1] = from[1*stride];
|
|
628
|
-
a[2] = from[2*stride];
|
|
629
|
-
a[3] = from[3*stride];
|
|
630
|
-
a[4] = from[4*stride];
|
|
631
|
-
a[5] = from[5*stride];
|
|
632
|
-
a[6] = from[6*stride];
|
|
633
|
-
a[7] = from[7*stride];
|
|
634
|
-
return pload<Packet>(a);
|
|
816
|
+
template <>
|
|
817
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather<short int, Packet8s>(const short int* from, Index stride) {
|
|
818
|
+
return pgather_common<Packet8s>(from, stride);
|
|
635
819
|
}
|
|
636
820
|
|
|
637
|
-
template<>
|
|
638
|
-
|
|
639
|
-
|
|
821
|
+
template <>
|
|
822
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from,
|
|
823
|
+
Index stride) {
|
|
824
|
+
return pgather_common<Packet8us>(from, stride);
|
|
640
825
|
}
|
|
641
826
|
|
|
642
|
-
template<>
|
|
643
|
-
{
|
|
644
|
-
return
|
|
827
|
+
template <>
|
|
828
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride) {
|
|
829
|
+
return pgather_common<Packet8bf>(from, stride);
|
|
645
830
|
}
|
|
646
831
|
|
|
647
|
-
template<>
|
|
648
|
-
{
|
|
649
|
-
return
|
|
832
|
+
template <>
|
|
833
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride) {
|
|
834
|
+
return pgather_common<Packet16c>(from, stride);
|
|
650
835
|
}
|
|
651
836
|
|
|
652
|
-
template
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
a[1] = from[1*stride];
|
|
657
|
-
a[2] = from[2*stride];
|
|
658
|
-
a[3] = from[3*stride];
|
|
659
|
-
a[4] = from[4*stride];
|
|
660
|
-
a[5] = from[5*stride];
|
|
661
|
-
a[6] = from[6*stride];
|
|
662
|
-
a[7] = from[7*stride];
|
|
663
|
-
a[8] = from[8*stride];
|
|
664
|
-
a[9] = from[9*stride];
|
|
665
|
-
a[10] = from[10*stride];
|
|
666
|
-
a[11] = from[11*stride];
|
|
667
|
-
a[12] = from[12*stride];
|
|
668
|
-
a[13] = from[13*stride];
|
|
669
|
-
a[14] = from[14*stride];
|
|
670
|
-
a[15] = from[15*stride];
|
|
671
|
-
return pload<Packet>(a);
|
|
837
|
+
template <>
|
|
838
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from,
|
|
839
|
+
Index stride) {
|
|
840
|
+
return pgather_common<Packet16uc>(from, stride);
|
|
672
841
|
}
|
|
673
842
|
|
|
843
|
+
template <>
|
|
844
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather_partial<float, Packet4f>(const float* from, Index stride,
|
|
845
|
+
const Index n) {
|
|
846
|
+
return pgather_common<Packet4f>(from, stride, n);
|
|
847
|
+
}
|
|
674
848
|
|
|
675
|
-
template<>
|
|
676
|
-
|
|
677
|
-
|
|
849
|
+
template <>
|
|
850
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather_partial<int, Packet4i>(const int* from, Index stride,
|
|
851
|
+
const Index n) {
|
|
852
|
+
return pgather_common<Packet4i>(from, stride, n);
|
|
678
853
|
}
|
|
679
854
|
|
|
680
|
-
template<>
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
|
|
686
|
-
{
|
|
687
|
-
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
|
|
688
|
-
pstore<__UNPACK_TYPE__(Packet)>(a, from);
|
|
689
|
-
to[0*stride] = a[0];
|
|
690
|
-
to[1*stride] = a[1];
|
|
691
|
-
to[2*stride] = a[2];
|
|
692
|
-
to[3*stride] = a[3];
|
|
855
|
+
template <>
|
|
856
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather_partial<short int, Packet8s>(const short int* from, Index stride,
|
|
857
|
+
const Index n) {
|
|
858
|
+
return pgather_common<Packet8s>(from, stride, n);
|
|
693
859
|
}
|
|
694
860
|
|
|
695
|
-
template<>
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
|
|
701
|
-
{
|
|
702
|
-
pscatter_size4<Packet4i>(to, from, stride);
|
|
861
|
+
template <>
|
|
862
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us
|
|
863
|
+
pgather_partial<unsigned short int, Packet8us>(const unsigned short int* from, Index stride, const Index n) {
|
|
864
|
+
return pgather_common<Packet8us>(from, stride, n);
|
|
703
865
|
}
|
|
704
|
-
|
|
705
|
-
template
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
to[0*stride] = a[0];
|
|
710
|
-
to[1*stride] = a[1];
|
|
711
|
-
to[2*stride] = a[2];
|
|
712
|
-
to[3*stride] = a[3];
|
|
713
|
-
to[4*stride] = a[4];
|
|
714
|
-
to[5*stride] = a[5];
|
|
715
|
-
to[6*stride] = a[6];
|
|
716
|
-
to[7*stride] = a[7];
|
|
717
|
-
}
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
template<> EIGEN_DEVICE_FUNC inline void pscatter<short int, Packet8s>(short int* to, const Packet8s& from, Index stride)
|
|
721
|
-
{
|
|
722
|
-
pscatter_size8<Packet8s>(to, from, stride);
|
|
723
|
-
}
|
|
724
|
-
|
|
725
|
-
template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride)
|
|
726
|
-
{
|
|
727
|
-
pscatter_size8<Packet8us>(to, from, stride);
|
|
728
|
-
}
|
|
729
|
-
|
|
730
|
-
template<> EIGEN_DEVICE_FUNC inline void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
|
|
731
|
-
{
|
|
732
|
-
pscatter_size8<Packet8bf>(to, from, stride);
|
|
866
|
+
|
|
867
|
+
template <>
|
|
868
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather_partial<bfloat16, Packet8bf>(const bfloat16* from, Index stride,
|
|
869
|
+
const Index n) {
|
|
870
|
+
return pgather_common<Packet8bf>(from, stride, n);
|
|
733
871
|
}
|
|
734
872
|
|
|
735
|
-
template
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
to[0*stride] = a[0];
|
|
740
|
-
to[1*stride] = a[1];
|
|
741
|
-
to[2*stride] = a[2];
|
|
742
|
-
to[3*stride] = a[3];
|
|
743
|
-
to[4*stride] = a[4];
|
|
744
|
-
to[5*stride] = a[5];
|
|
745
|
-
to[6*stride] = a[6];
|
|
746
|
-
to[7*stride] = a[7];
|
|
747
|
-
to[8*stride] = a[8];
|
|
748
|
-
to[9*stride] = a[9];
|
|
749
|
-
to[10*stride] = a[10];
|
|
750
|
-
to[11*stride] = a[11];
|
|
751
|
-
to[12*stride] = a[12];
|
|
752
|
-
to[13*stride] = a[13];
|
|
753
|
-
to[14*stride] = a[14];
|
|
754
|
-
to[15*stride] = a[15];
|
|
755
|
-
}
|
|
756
|
-
|
|
757
|
-
template<> EIGEN_DEVICE_FUNC inline void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride)
|
|
758
|
-
{
|
|
759
|
-
pscatter_size16<Packet16c>(to, from, stride);
|
|
760
|
-
}
|
|
761
|
-
|
|
762
|
-
template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride)
|
|
763
|
-
{
|
|
764
|
-
pscatter_size16<Packet16uc>(to, from, stride);
|
|
873
|
+
template <>
|
|
874
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather_partial<signed char, Packet16c>(const signed char* from,
|
|
875
|
+
Index stride, const Index n) {
|
|
876
|
+
return pgather_common<Packet16c>(from, stride, n);
|
|
765
877
|
}
|
|
766
878
|
|
|
767
|
-
template<>
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
879
|
+
template <>
|
|
880
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather_partial<unsigned char, Packet16uc>(const unsigned char* from,
|
|
881
|
+
Index stride,
|
|
882
|
+
const Index n) {
|
|
883
|
+
return pgather_common<Packet16uc>(from, stride, n);
|
|
884
|
+
}
|
|
773
885
|
|
|
774
|
-
template
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
886
|
+
template <typename Packet>
|
|
887
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_common(__UNPACK_TYPE__(Packet) * to, const Packet& from,
|
|
888
|
+
Index stride,
|
|
889
|
+
const Index n = unpacket_traits<Packet>::size) {
|
|
890
|
+
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
|
|
891
|
+
eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
|
|
892
|
+
if (stride == 1) {
|
|
893
|
+
if (n == unpacket_traits<Packet>::size) {
|
|
894
|
+
return pstoreu(to, from);
|
|
895
|
+
} else {
|
|
896
|
+
return pstoreu_partial(to, from, n);
|
|
897
|
+
}
|
|
898
|
+
} else {
|
|
899
|
+
pstore<__UNPACK_TYPE__(Packet)>(a, from);
|
|
900
|
+
LOAD_STORE_UNROLL_16
|
|
901
|
+
for (Index i = 0; i < n; i++) {
|
|
902
|
+
to[i * stride] = a[i];
|
|
903
|
+
}
|
|
904
|
+
}
|
|
905
|
+
}
|
|
791
906
|
|
|
792
|
-
template<>
|
|
793
|
-
|
|
907
|
+
template <>
|
|
908
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
|
|
909
|
+
pscatter_common<Packet4f>(to, from, stride);
|
|
910
|
+
}
|
|
794
911
|
|
|
795
|
-
template<>
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
template<> EIGEN_STRONG_INLINE Packet16c pmul<Packet16c> (const Packet16c& a, const Packet16c& b) { return vec_mul(a,b); }
|
|
800
|
-
template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_mul(a,b); }
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
|
|
804
|
-
{
|
|
805
|
-
#ifndef __VSX__ // VSX actually provides a div instruction
|
|
806
|
-
Packet4f t, y_0, y_1;
|
|
912
|
+
template <>
|
|
913
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
|
|
914
|
+
pscatter_common<Packet4i>(to, from, stride);
|
|
915
|
+
}
|
|
807
916
|
|
|
808
|
-
|
|
809
|
-
|
|
917
|
+
template <>
|
|
918
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<short int, Packet8s>(short int* to, const Packet8s& from,
|
|
919
|
+
Index stride) {
|
|
920
|
+
pscatter_common<Packet8s>(to, from, stride);
|
|
921
|
+
}
|
|
810
922
|
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
923
|
+
template <>
|
|
924
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned short int, Packet8us>(unsigned short int* to,
|
|
925
|
+
const Packet8us& from,
|
|
926
|
+
Index stride) {
|
|
927
|
+
pscatter_common<Packet8us>(to, from, stride);
|
|
928
|
+
}
|
|
814
929
|
|
|
815
|
-
|
|
930
|
+
template <>
|
|
931
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from,
|
|
932
|
+
Index stride) {
|
|
933
|
+
pscatter_common<Packet8bf>(to, from, stride);
|
|
934
|
+
}
|
|
935
|
+
|
|
936
|
+
template <>
|
|
937
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from,
|
|
938
|
+
Index stride) {
|
|
939
|
+
pscatter_common<Packet16c>(to, from, stride);
|
|
940
|
+
}
|
|
941
|
+
|
|
942
|
+
template <>
|
|
943
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned char, Packet16uc>(unsigned char* to,
|
|
944
|
+
const Packet16uc& from, Index stride) {
|
|
945
|
+
pscatter_common<Packet16uc>(to, from, stride);
|
|
946
|
+
}
|
|
947
|
+
|
|
948
|
+
template <>
|
|
949
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<float, Packet4f>(float* to, const Packet4f& from,
|
|
950
|
+
Index stride, const Index n) {
|
|
951
|
+
pscatter_common<Packet4f>(to, from, stride, n);
|
|
952
|
+
}
|
|
953
|
+
|
|
954
|
+
template <>
|
|
955
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<int, Packet4i>(int* to, const Packet4i& from, Index stride,
|
|
956
|
+
const Index n) {
|
|
957
|
+
pscatter_common<Packet4i>(to, from, stride, n);
|
|
958
|
+
}
|
|
959
|
+
|
|
960
|
+
template <>
|
|
961
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<short int, Packet8s>(short int* to, const Packet8s& from,
|
|
962
|
+
Index stride, const Index n) {
|
|
963
|
+
pscatter_common<Packet8s>(to, from, stride, n);
|
|
964
|
+
}
|
|
965
|
+
|
|
966
|
+
template <>
|
|
967
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned short int, Packet8us>(unsigned short int* to,
|
|
968
|
+
const Packet8us& from,
|
|
969
|
+
Index stride,
|
|
970
|
+
const Index n) {
|
|
971
|
+
pscatter_common<Packet8us>(to, from, stride, n);
|
|
972
|
+
}
|
|
973
|
+
|
|
974
|
+
template <>
|
|
975
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from,
|
|
976
|
+
Index stride, const Index n) {
|
|
977
|
+
pscatter_common<Packet8bf>(to, from, stride, n);
|
|
978
|
+
}
|
|
979
|
+
|
|
980
|
+
template <>
|
|
981
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<signed char, Packet16c>(signed char* to,
|
|
982
|
+
const Packet16c& from, Index stride,
|
|
983
|
+
const Index n) {
|
|
984
|
+
pscatter_common<Packet16c>(to, from, stride, n);
|
|
985
|
+
}
|
|
986
|
+
|
|
987
|
+
template <>
|
|
988
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned char, Packet16uc>(unsigned char* to,
|
|
989
|
+
const Packet16uc& from,
|
|
990
|
+
Index stride, const Index n) {
|
|
991
|
+
pscatter_common<Packet16uc>(to, from, stride, n);
|
|
992
|
+
}
|
|
993
|
+
|
|
994
|
+
template <>
|
|
995
|
+
EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
|
|
996
|
+
return pset1<Packet4f>(a) + p4f_COUNTDOWN;
|
|
997
|
+
}
|
|
998
|
+
template <>
|
|
999
|
+
EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
|
|
1000
|
+
return pset1<Packet4i>(a) + p4i_COUNTDOWN;
|
|
1001
|
+
}
|
|
1002
|
+
template <>
|
|
1003
|
+
EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const short int& a) {
|
|
1004
|
+
return pset1<Packet8s>(a) + p8s_COUNTDOWN;
|
|
1005
|
+
}
|
|
1006
|
+
template <>
|
|
1007
|
+
EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const unsigned short int& a) {
|
|
1008
|
+
return pset1<Packet8us>(a) + p8us_COUNTDOWN;
|
|
1009
|
+
}
|
|
1010
|
+
template <>
|
|
1011
|
+
EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const signed char& a) {
|
|
1012
|
+
return pset1<Packet16c>(a) + p16c_COUNTDOWN;
|
|
1013
|
+
}
|
|
1014
|
+
template <>
|
|
1015
|
+
EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const unsigned char& a) {
|
|
1016
|
+
return pset1<Packet16uc>(a) + p16uc_COUNTDOWN;
|
|
1017
|
+
}
|
|
1018
|
+
|
|
1019
|
+
template <>
|
|
1020
|
+
EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1021
|
+
return a + b;
|
|
1022
|
+
}
|
|
1023
|
+
template <>
|
|
1024
|
+
EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
1025
|
+
return a + b;
|
|
1026
|
+
}
|
|
1027
|
+
template <>
|
|
1028
|
+
EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
|
|
1029
|
+
return a + b;
|
|
1030
|
+
}
|
|
1031
|
+
template <>
|
|
1032
|
+
EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) {
|
|
1033
|
+
return a + b;
|
|
1034
|
+
}
|
|
1035
|
+
template <>
|
|
1036
|
+
EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) {
|
|
1037
|
+
return a + b;
|
|
1038
|
+
}
|
|
1039
|
+
template <>
|
|
1040
|
+
EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) {
|
|
1041
|
+
return a + b;
|
|
1042
|
+
}
|
|
1043
|
+
template <>
|
|
1044
|
+
EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
|
|
1045
|
+
return a + b;
|
|
1046
|
+
}
|
|
1047
|
+
|
|
1048
|
+
template <>
|
|
1049
|
+
EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1050
|
+
return a - b;
|
|
1051
|
+
}
|
|
1052
|
+
template <>
|
|
1053
|
+
EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
1054
|
+
return a - b;
|
|
1055
|
+
}
|
|
1056
|
+
template <>
|
|
1057
|
+
EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) {
|
|
1058
|
+
return a - b;
|
|
1059
|
+
}
|
|
1060
|
+
template <>
|
|
1061
|
+
EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) {
|
|
1062
|
+
return a - b;
|
|
1063
|
+
}
|
|
1064
|
+
template <>
|
|
1065
|
+
EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) {
|
|
1066
|
+
return a - b;
|
|
1067
|
+
}
|
|
1068
|
+
template <>
|
|
1069
|
+
EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
|
|
1070
|
+
return a - b;
|
|
1071
|
+
}
|
|
1072
|
+
|
|
1073
|
+
template <>
|
|
1074
|
+
EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
|
|
1075
|
+
#ifdef __POWER8_VECTOR__
|
|
1076
|
+
return vec_neg(a);
|
|
1077
|
+
#else
|
|
1078
|
+
return vec_xor(a, p4f_MZERO);
|
|
1079
|
+
#endif
|
|
1080
|
+
}
|
|
1081
|
+
template <>
|
|
1082
|
+
EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) {
|
|
1083
|
+
#ifdef __POWER8_VECTOR__
|
|
1084
|
+
return vec_neg(a);
|
|
1085
|
+
#else
|
|
1086
|
+
return reinterpret_cast<Packet16c>(p4i_ZERO) - a;
|
|
1087
|
+
#endif
|
|
1088
|
+
}
|
|
1089
|
+
template <>
|
|
1090
|
+
EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
|
|
1091
|
+
#ifdef __POWER8_VECTOR__
|
|
1092
|
+
return vec_neg(a);
|
|
1093
|
+
#else
|
|
1094
|
+
return reinterpret_cast<Packet8s>(p4i_ZERO) - a;
|
|
1095
|
+
#endif
|
|
1096
|
+
}
|
|
1097
|
+
template <>
|
|
1098
|
+
EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
|
|
1099
|
+
#ifdef __POWER8_VECTOR__
|
|
1100
|
+
return vec_neg(a);
|
|
1101
|
+
#else
|
|
1102
|
+
return p4i_ZERO - a;
|
|
1103
|
+
#endif
|
|
1104
|
+
}
|
|
1105
|
+
|
|
1106
|
+
template <>
|
|
1107
|
+
EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
|
|
1108
|
+
return a;
|
|
1109
|
+
}
|
|
1110
|
+
template <>
|
|
1111
|
+
EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
|
|
1112
|
+
return a;
|
|
1113
|
+
}
|
|
1114
|
+
|
|
1115
|
+
template <>
|
|
1116
|
+
EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1117
|
+
return vec_madd(a, b, p4f_MZERO);
|
|
1118
|
+
}
|
|
1119
|
+
template <>
|
|
1120
|
+
EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
1121
|
+
return a * b;
|
|
1122
|
+
}
|
|
1123
|
+
template <>
|
|
1124
|
+
EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) {
|
|
1125
|
+
return vec_mul(a, b);
|
|
1126
|
+
}
|
|
1127
|
+
template <>
|
|
1128
|
+
EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) {
|
|
1129
|
+
return vec_mul(a, b);
|
|
1130
|
+
}
|
|
1131
|
+
template <>
|
|
1132
|
+
EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) {
|
|
1133
|
+
return vec_mul(a, b);
|
|
1134
|
+
}
|
|
1135
|
+
template <>
|
|
1136
|
+
EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
|
|
1137
|
+
return vec_mul(a, b);
|
|
1138
|
+
}
|
|
1139
|
+
|
|
1140
|
+
template <>
|
|
1141
|
+
EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1142
|
+
#ifndef __VSX__ // VSX actually provides a div instruction
|
|
1143
|
+
Packet4f t, y_0, y_1;
|
|
1144
|
+
|
|
1145
|
+
// Altivec does not offer a divide instruction, we have to do a reciprocal approximation
|
|
1146
|
+
y_0 = vec_re(b);
|
|
1147
|
+
|
|
1148
|
+
// Do one Newton-Raphson iteration to get the needed accuracy
|
|
1149
|
+
t = vec_nmsub(y_0, b, p4f_ONE);
|
|
1150
|
+
y_1 = vec_madd(y_0, t, y_0);
|
|
1151
|
+
|
|
1152
|
+
return vec_madd(a, y_1, p4f_MZERO);
|
|
816
1153
|
#else
|
|
817
1154
|
return vec_div(a, b);
|
|
818
1155
|
#endif
|
|
819
1156
|
}
|
|
820
1157
|
|
|
821
|
-
template<>
|
|
822
|
-
|
|
1158
|
+
template <>
|
|
1159
|
+
EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
1160
|
+
#if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
|
|
1161
|
+
return vec_div(a, b);
|
|
1162
|
+
#else
|
|
1163
|
+
EIGEN_UNUSED_VARIABLE(a);
|
|
1164
|
+
EIGEN_UNUSED_VARIABLE(b);
|
|
1165
|
+
eigen_assert(false && "packet integer division are not supported by AltiVec");
|
|
823
1166
|
return pset1<Packet4i>(0);
|
|
1167
|
+
#endif
|
|
824
1168
|
}
|
|
825
1169
|
|
|
826
1170
|
// for some weird raisons, it has to be overloaded for packet of integers
|
|
827
|
-
template<>
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
1171
|
+
template <>
|
|
1172
|
+
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
|
|
1173
|
+
return vec_madd(a, b, c);
|
|
1174
|
+
}
|
|
1175
|
+
template <>
|
|
1176
|
+
EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
|
|
1177
|
+
return a * b + c;
|
|
1178
|
+
}
|
|
1179
|
+
template <>
|
|
1180
|
+
EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
|
|
1181
|
+
return vec_madd(a, b, c);
|
|
1182
|
+
}
|
|
1183
|
+
template <>
|
|
1184
|
+
EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) {
|
|
1185
|
+
return vec_madd(a, b, c);
|
|
1186
|
+
}
|
|
1187
|
+
|
|
1188
|
+
#ifdef EIGEN_VECTORIZE_VSX
|
|
1189
|
+
template <>
|
|
1190
|
+
EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
|
|
1191
|
+
return vec_msub(a, b, c);
|
|
1192
|
+
}
|
|
1193
|
+
template <>
|
|
1194
|
+
EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
|
|
1195
|
+
return vec_nmsub(a, b, c);
|
|
1196
|
+
}
|
|
1197
|
+
template <>
|
|
1198
|
+
EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
|
|
1199
|
+
return vec_nmadd(a, b, c);
|
|
1200
|
+
}
|
|
1201
|
+
#endif
|
|
1202
|
+
|
|
1203
|
+
template <>
|
|
1204
|
+
EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1205
|
+
#ifdef EIGEN_VECTORIZE_VSX
|
|
835
1206
|
// NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
|
|
836
1207
|
Packet4f ret;
|
|
837
|
-
__asm__
|
|
1208
|
+
__asm__("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
|
|
838
1209
|
return ret;
|
|
839
|
-
|
|
1210
|
+
#else
|
|
1211
|
+
return vec_min(a, b);
|
|
1212
|
+
#endif
|
|
1213
|
+
}
|
|
1214
|
+
template <>
|
|
1215
|
+
EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
1216
|
+
return vec_min(a, b);
|
|
1217
|
+
}
|
|
1218
|
+
template <>
|
|
1219
|
+
EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) {
|
|
1220
|
+
return vec_min(a, b);
|
|
1221
|
+
}
|
|
1222
|
+
template <>
|
|
1223
|
+
EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) {
|
|
1224
|
+
return vec_min(a, b);
|
|
1225
|
+
}
|
|
1226
|
+
template <>
|
|
1227
|
+
EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) {
|
|
1228
|
+
return vec_min(a, b);
|
|
1229
|
+
}
|
|
1230
|
+
template <>
|
|
1231
|
+
EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
|
|
840
1232
|
return vec_min(a, b);
|
|
841
|
-
#endif
|
|
842
1233
|
}
|
|
843
|
-
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
|
|
844
|
-
template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_min(a, b); }
|
|
845
|
-
template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_min(a, b); }
|
|
846
|
-
template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_min(a, b); }
|
|
847
|
-
template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_min(a, b); }
|
|
848
|
-
|
|
849
1234
|
|
|
850
|
-
template<>
|
|
851
|
-
{
|
|
852
|
-
|
|
1235
|
+
template <>
|
|
1236
|
+
EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1237
|
+
#ifdef EIGEN_VECTORIZE_VSX
|
|
853
1238
|
// NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
|
|
854
1239
|
Packet4f ret;
|
|
855
|
-
__asm__
|
|
1240
|
+
__asm__("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
|
|
856
1241
|
return ret;
|
|
857
|
-
|
|
1242
|
+
#else
|
|
858
1243
|
return vec_max(a, b);
|
|
859
|
-
|
|
860
|
-
}
|
|
861
|
-
template<>
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
template<>
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
template<>
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
template<>
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
template<>
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
template<>
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
template<>
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
1244
|
+
#endif
|
|
1245
|
+
}
|
|
1246
|
+
template <>
|
|
1247
|
+
EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
1248
|
+
return vec_max(a, b);
|
|
1249
|
+
}
|
|
1250
|
+
template <>
|
|
1251
|
+
EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) {
|
|
1252
|
+
return vec_max(a, b);
|
|
1253
|
+
}
|
|
1254
|
+
template <>
|
|
1255
|
+
EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) {
|
|
1256
|
+
return vec_max(a, b);
|
|
1257
|
+
}
|
|
1258
|
+
template <>
|
|
1259
|
+
EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) {
|
|
1260
|
+
return vec_max(a, b);
|
|
1261
|
+
}
|
|
1262
|
+
template <>
|
|
1263
|
+
EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
|
|
1264
|
+
return vec_max(a, b);
|
|
1265
|
+
}
|
|
1266
|
+
|
|
1267
|
+
template <>
|
|
1268
|
+
EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
|
|
1269
|
+
return reinterpret_cast<Packet4f>(vec_cmple(a, b));
|
|
1270
|
+
}
|
|
1271
|
+
// To fix bug with vec_cmplt on older versions
|
|
1272
|
+
#ifdef EIGEN_VECTORIZE_VSX
|
|
1273
|
+
template <>
|
|
1274
|
+
EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) {
|
|
1275
|
+
return reinterpret_cast<Packet4f>(vec_cmplt(a, b));
|
|
1276
|
+
}
|
|
1277
|
+
#endif
|
|
1278
|
+
template <>
|
|
1279
|
+
EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) {
|
|
1280
|
+
return reinterpret_cast<Packet4f>(vec_cmpeq(a, b));
|
|
1281
|
+
}
|
|
1282
|
+
template <>
|
|
1283
|
+
EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
|
|
1284
|
+
Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a, b));
|
|
1285
|
+
return vec_nor(c, c);
|
|
1286
|
+
}
|
|
1287
|
+
|
|
1288
|
+
#ifdef EIGEN_VECTORIZE_VSX
|
|
1289
|
+
template <>
|
|
1290
|
+
EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
|
|
1291
|
+
return reinterpret_cast<Packet4i>(vec_cmple(a, b));
|
|
1292
|
+
}
|
|
1293
|
+
#endif
|
|
1294
|
+
template <>
|
|
1295
|
+
EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) {
|
|
1296
|
+
return reinterpret_cast<Packet4i>(vec_cmplt(a, b));
|
|
1297
|
+
}
|
|
1298
|
+
template <>
|
|
1299
|
+
EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) {
|
|
1300
|
+
return reinterpret_cast<Packet4i>(vec_cmpeq(a, b));
|
|
1301
|
+
}
|
|
1302
|
+
#ifdef EIGEN_VECTORIZE_VSX
|
|
1303
|
+
template <>
|
|
1304
|
+
EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) {
|
|
1305
|
+
return reinterpret_cast<Packet8s>(vec_cmple(a, b));
|
|
1306
|
+
}
|
|
1307
|
+
#endif
|
|
1308
|
+
template <>
|
|
1309
|
+
EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) {
|
|
1310
|
+
return reinterpret_cast<Packet8s>(vec_cmplt(a, b));
|
|
1311
|
+
}
|
|
1312
|
+
template <>
|
|
1313
|
+
EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) {
|
|
1314
|
+
return reinterpret_cast<Packet8s>(vec_cmpeq(a, b));
|
|
1315
|
+
}
|
|
1316
|
+
#ifdef EIGEN_VECTORIZE_VSX
|
|
1317
|
+
template <>
|
|
1318
|
+
EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) {
|
|
1319
|
+
return reinterpret_cast<Packet8us>(vec_cmple(a, b));
|
|
1320
|
+
}
|
|
1321
|
+
#endif
|
|
1322
|
+
template <>
|
|
1323
|
+
EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) {
|
|
1324
|
+
return reinterpret_cast<Packet8us>(vec_cmplt(a, b));
|
|
1325
|
+
}
|
|
1326
|
+
template <>
|
|
1327
|
+
EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) {
|
|
1328
|
+
return reinterpret_cast<Packet8us>(vec_cmpeq(a, b));
|
|
1329
|
+
}
|
|
1330
|
+
#ifdef EIGEN_VECTORIZE_VSX
|
|
1331
|
+
template <>
|
|
1332
|
+
EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) {
|
|
1333
|
+
return reinterpret_cast<Packet16c>(vec_cmple(a, b));
|
|
1334
|
+
}
|
|
1335
|
+
#endif
|
|
1336
|
+
template <>
|
|
1337
|
+
EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) {
|
|
1338
|
+
return reinterpret_cast<Packet16c>(vec_cmplt(a, b));
|
|
1339
|
+
}
|
|
1340
|
+
template <>
|
|
1341
|
+
EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) {
|
|
1342
|
+
return reinterpret_cast<Packet16c>(vec_cmpeq(a, b));
|
|
1343
|
+
}
|
|
1344
|
+
#ifdef EIGEN_VECTORIZE_VSX
|
|
1345
|
+
template <>
|
|
1346
|
+
EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) {
|
|
1347
|
+
return reinterpret_cast<Packet16uc>(vec_cmple(a, b));
|
|
1348
|
+
}
|
|
1349
|
+
#endif
|
|
1350
|
+
template <>
|
|
1351
|
+
EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) {
|
|
1352
|
+
return reinterpret_cast<Packet16uc>(vec_cmplt(a, b));
|
|
1353
|
+
}
|
|
1354
|
+
template <>
|
|
1355
|
+
EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) {
|
|
1356
|
+
return reinterpret_cast<Packet16uc>(vec_cmpeq(a, b));
|
|
897
1357
|
}
|
|
898
1358
|
|
|
1359
|
+
template <>
|
|
1360
|
+
EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1361
|
+
return vec_and(a, b);
|
|
1362
|
+
}
|
|
1363
|
+
template <>
|
|
1364
|
+
EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
1365
|
+
return vec_and(a, b);
|
|
1366
|
+
}
|
|
1367
|
+
template <>
|
|
1368
|
+
EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
|
|
1369
|
+
return vec_and(a, b);
|
|
1370
|
+
}
|
|
1371
|
+
template <>
|
|
1372
|
+
EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) {
|
|
1373
|
+
return vec_and(a, b);
|
|
1374
|
+
}
|
|
1375
|
+
template <>
|
|
1376
|
+
EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
|
|
1377
|
+
return pand<Packet8us>(a, b);
|
|
1378
|
+
}
|
|
899
1379
|
|
|
900
|
-
template<>
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
template<>
|
|
1380
|
+
template <>
|
|
1381
|
+
EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1382
|
+
return vec_or(a, b);
|
|
1383
|
+
}
|
|
1384
|
+
template <>
|
|
1385
|
+
EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
1386
|
+
return vec_or(a, b);
|
|
1387
|
+
}
|
|
1388
|
+
template <>
|
|
1389
|
+
EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) {
|
|
1390
|
+
return vec_or(a, b);
|
|
1391
|
+
}
|
|
1392
|
+
template <>
|
|
1393
|
+
EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) {
|
|
1394
|
+
return vec_or(a, b);
|
|
1395
|
+
}
|
|
1396
|
+
template <>
|
|
1397
|
+
EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
|
|
905
1398
|
return por<Packet8us>(a, b);
|
|
906
1399
|
}
|
|
907
1400
|
|
|
908
|
-
template<>
|
|
909
|
-
|
|
910
|
-
|
|
1401
|
+
template <>
|
|
1402
|
+
EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1403
|
+
return vec_xor(a, b);
|
|
1404
|
+
}
|
|
1405
|
+
template <>
|
|
1406
|
+
EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
1407
|
+
return vec_xor(a, b);
|
|
1408
|
+
}
|
|
1409
|
+
template <>
|
|
1410
|
+
EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) {
|
|
1411
|
+
return vec_xor(a, b);
|
|
1412
|
+
}
|
|
1413
|
+
template <>
|
|
1414
|
+
EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
|
|
911
1415
|
return pxor<Packet8us>(a, b);
|
|
912
1416
|
}
|
|
913
1417
|
|
|
914
|
-
template<>
|
|
915
|
-
|
|
1418
|
+
template <>
|
|
1419
|
+
EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1420
|
+
return vec_andc(a, b);
|
|
1421
|
+
}
|
|
1422
|
+
template <>
|
|
1423
|
+
EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
1424
|
+
return vec_andc(a, b);
|
|
1425
|
+
}
|
|
916
1426
|
|
|
917
|
-
template<>
|
|
1427
|
+
template <>
|
|
1428
|
+
EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
|
|
918
1429
|
return vec_sel(b, a, reinterpret_cast<Packet4ui>(mask));
|
|
919
1430
|
}
|
|
920
1431
|
|
|
921
|
-
template<>
|
|
922
|
-
{
|
|
923
|
-
|
|
924
|
-
|
|
1432
|
+
template <>
|
|
1433
|
+
EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
|
|
1434
|
+
Packet4f t = vec_add(
|
|
1435
|
+
reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
|
|
1436
|
+
Packet4f res;
|
|
925
1437
|
|
|
926
|
-
#ifdef
|
|
927
|
-
|
|
928
|
-
: "=&wa" (res)
|
|
929
|
-
: "wa" (t));
|
|
1438
|
+
#ifdef EIGEN_VECTORIZE_VSX
|
|
1439
|
+
__asm__("xvrspiz %x0, %x1\n\t" : "=&wa"(res) : "wa"(t));
|
|
930
1440
|
#else
|
|
931
|
-
|
|
932
|
-
: "=v" (res)
|
|
933
|
-
: "v" (t));
|
|
1441
|
+
__asm__("vrfiz %0, %1\n\t" : "=v"(res) : "v"(t));
|
|
934
1442
|
#endif
|
|
935
1443
|
|
|
936
|
-
|
|
1444
|
+
return res;
|
|
1445
|
+
}
|
|
1446
|
+
template <>
|
|
1447
|
+
EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
|
|
1448
|
+
return vec_ceil(a);
|
|
1449
|
+
}
|
|
1450
|
+
template <>
|
|
1451
|
+
EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
|
|
1452
|
+
return vec_floor(a);
|
|
1453
|
+
}
|
|
1454
|
+
template <>
|
|
1455
|
+
EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
|
|
1456
|
+
return vec_trunc(a);
|
|
1457
|
+
}
|
|
1458
|
+
#ifdef EIGEN_VECTORIZE_VSX
|
|
1459
|
+
template <>
|
|
1460
|
+
EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
|
|
1461
|
+
Packet4f res;
|
|
1462
|
+
|
|
1463
|
+
__asm__("xvrspic %x0, %x1\n\t" : "=&wa"(res) : "wa"(a));
|
|
1464
|
+
|
|
1465
|
+
return res;
|
|
937
1466
|
}
|
|
938
|
-
|
|
939
|
-
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
|
|
940
|
-
template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
|
|
941
|
-
{
|
|
942
|
-
Packet4f res;
|
|
1467
|
+
#endif
|
|
943
1468
|
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
1469
|
+
template <typename Packet>
|
|
1470
|
+
EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet) * from) {
|
|
1471
|
+
EIGEN_DEBUG_UNALIGNED_LOAD
|
|
1472
|
+
#if defined(EIGEN_VECTORIZE_VSX)
|
|
1473
|
+
return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
|
|
1474
|
+
#else
|
|
1475
|
+
Packet16uc MSQ = vec_ld(0, (unsigned char*)from); // most significant quadword
|
|
1476
|
+
Packet16uc LSQ = vec_ld(15, (unsigned char*)from); // least significant quadword
|
|
1477
|
+
Packet16uc mask = vec_lvsl(0, from); // create the permute mask
|
|
1478
|
+
// TODO: Add static_cast here
|
|
1479
|
+
return (Packet)vec_perm(MSQ, LSQ, mask); // align the data
|
|
1480
|
+
#endif
|
|
1481
|
+
}
|
|
947
1482
|
|
|
948
|
-
|
|
1483
|
+
template <>
|
|
1484
|
+
EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
|
|
1485
|
+
return ploadu_common<Packet4f>(from);
|
|
1486
|
+
}
|
|
1487
|
+
template <>
|
|
1488
|
+
EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
|
|
1489
|
+
return ploadu_common<Packet4i>(from);
|
|
1490
|
+
}
|
|
1491
|
+
template <>
|
|
1492
|
+
EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from) {
|
|
1493
|
+
return ploadu_common<Packet8s>(from);
|
|
1494
|
+
}
|
|
1495
|
+
template <>
|
|
1496
|
+
EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from) {
|
|
1497
|
+
return ploadu_common<Packet8us>(from);
|
|
1498
|
+
}
|
|
1499
|
+
template <>
|
|
1500
|
+
EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) {
|
|
1501
|
+
return ploadu_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
|
|
1502
|
+
}
|
|
1503
|
+
template <>
|
|
1504
|
+
EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const signed char* from) {
|
|
1505
|
+
return ploadu_common<Packet16c>(from);
|
|
1506
|
+
}
|
|
1507
|
+
template <>
|
|
1508
|
+
EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from) {
|
|
1509
|
+
return ploadu_common<Packet16uc>(from);
|
|
949
1510
|
}
|
|
950
1511
|
|
|
951
|
-
template<typename Packet>
|
|
952
|
-
|
|
1512
|
+
template <typename Packet>
|
|
1513
|
+
EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet) * from, const Index n,
|
|
1514
|
+
const Index offset) {
|
|
1515
|
+
const Index packet_size = unpacket_traits<Packet>::size;
|
|
1516
|
+
eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
|
|
1517
|
+
const Index size = sizeof(__UNPACK_TYPE__(Packet));
|
|
1518
|
+
#ifdef _ARCH_PWR9
|
|
1519
|
+
EIGEN_UNUSED_VARIABLE(packet_size);
|
|
953
1520
|
EIGEN_DEBUG_ALIGNED_LOAD
|
|
1521
|
+
EIGEN_DEBUG_UNALIGNED_LOAD
|
|
1522
|
+
Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
|
|
1523
|
+
if (offset) {
|
|
1524
|
+
Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
|
|
954
1525
|
#ifdef _BIG_ENDIAN
|
|
955
|
-
|
|
956
|
-
Packet16uc mask;
|
|
957
|
-
MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
|
|
958
|
-
LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
|
|
959
|
-
mask = vec_lvsl(0, from); // create the permute mask
|
|
960
|
-
//TODO: Add static_cast here
|
|
961
|
-
return (Packet) vec_perm(MSQ, LSQ, mask); // align the data
|
|
1526
|
+
load = Packet(vec_sro(Packet16uc(load), shift));
|
|
962
1527
|
#else
|
|
963
|
-
|
|
964
|
-
|
|
1528
|
+
load = Packet(vec_slo(Packet16uc(load), shift));
|
|
1529
|
+
#endif
|
|
1530
|
+
}
|
|
1531
|
+
return load;
|
|
1532
|
+
#else
|
|
1533
|
+
if (n) {
|
|
1534
|
+
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
|
|
1535
|
+
unsigned char* load2 = reinterpret_cast<unsigned char*>(load + offset);
|
|
1536
|
+
unsigned char* from2 = reinterpret_cast<unsigned char*>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
|
|
1537
|
+
Index n2 = n * size;
|
|
1538
|
+
if (16 <= n2) {
|
|
1539
|
+
pstoreu(load2, ploadu<Packet16uc>(from2));
|
|
1540
|
+
} else {
|
|
1541
|
+
memcpy((void*)load2, (void*)from2, n2);
|
|
1542
|
+
}
|
|
1543
|
+
return pload_ignore<Packet>(load);
|
|
1544
|
+
} else {
|
|
1545
|
+
return Packet(pset1<Packet16uc>(0));
|
|
1546
|
+
}
|
|
965
1547
|
#endif
|
|
966
1548
|
}
|
|
967
1549
|
|
|
968
|
-
template<>
|
|
969
|
-
{
|
|
970
|
-
return
|
|
1550
|
+
template <>
|
|
1551
|
+
EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n, const Index offset) {
|
|
1552
|
+
return ploadu_partial_common<Packet4f>(from, n, offset);
|
|
971
1553
|
}
|
|
972
|
-
template<>
|
|
973
|
-
{
|
|
974
|
-
return
|
|
1554
|
+
template <>
|
|
1555
|
+
EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n, const Index offset) {
|
|
1556
|
+
return ploadu_partial_common<Packet4i>(from, n, offset);
|
|
975
1557
|
}
|
|
976
|
-
template<>
|
|
977
|
-
{
|
|
978
|
-
return
|
|
1558
|
+
template <>
|
|
1559
|
+
EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n, const Index offset) {
|
|
1560
|
+
return ploadu_partial_common<Packet8s>(from, n, offset);
|
|
979
1561
|
}
|
|
980
|
-
template<>
|
|
981
|
-
|
|
982
|
-
|
|
1562
|
+
template <>
|
|
1563
|
+
EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n,
|
|
1564
|
+
const Index offset) {
|
|
1565
|
+
return ploadu_partial_common<Packet8us>(from, n, offset);
|
|
983
1566
|
}
|
|
984
|
-
template<>
|
|
985
|
-
{
|
|
986
|
-
return
|
|
1567
|
+
template <>
|
|
1568
|
+
EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset) {
|
|
1569
|
+
return ploadu_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
|
|
987
1570
|
}
|
|
988
|
-
template<>
|
|
989
|
-
{
|
|
990
|
-
return
|
|
1571
|
+
template <>
|
|
1572
|
+
EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n, const Index offset) {
|
|
1573
|
+
return ploadu_partial_common<Packet16c>(from, n, offset);
|
|
991
1574
|
}
|
|
992
|
-
template<>
|
|
993
|
-
|
|
994
|
-
|
|
1575
|
+
template <>
|
|
1576
|
+
EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n,
|
|
1577
|
+
const Index offset) {
|
|
1578
|
+
return ploadu_partial_common<Packet16uc>(from, n, offset);
|
|
995
1579
|
}
|
|
996
1580
|
|
|
997
|
-
template<typename Packet>
|
|
998
|
-
{
|
|
1581
|
+
template <typename Packet>
|
|
1582
|
+
EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet) * from) {
|
|
999
1583
|
Packet p;
|
|
1000
|
-
if((std::ptrdiff_t(from) % 16) == 0)
|
|
1001
|
-
|
|
1002
|
-
|
|
1584
|
+
if ((std::ptrdiff_t(from) % 16) == 0)
|
|
1585
|
+
p = pload<Packet>(from);
|
|
1586
|
+
else
|
|
1587
|
+
p = ploadu<Packet>(from);
|
|
1588
|
+
return vec_mergeh(p, p);
|
|
1003
1589
|
}
|
|
1004
|
-
template<>
|
|
1005
|
-
{
|
|
1590
|
+
template <>
|
|
1591
|
+
EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
|
|
1006
1592
|
return ploaddup_common<Packet4f>(from);
|
|
1007
1593
|
}
|
|
1008
|
-
template<>
|
|
1009
|
-
{
|
|
1594
|
+
template <>
|
|
1595
|
+
EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
|
|
1010
1596
|
return ploaddup_common<Packet4i>(from);
|
|
1011
1597
|
}
|
|
1012
1598
|
|
|
1013
|
-
template<>
|
|
1014
|
-
{
|
|
1599
|
+
template <>
|
|
1600
|
+
EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int* from) {
|
|
1015
1601
|
Packet8s p;
|
|
1016
|
-
if((std::ptrdiff_t(from) % 16) == 0)
|
|
1017
|
-
|
|
1018
|
-
|
|
1602
|
+
if ((std::ptrdiff_t(from) % 16) == 0)
|
|
1603
|
+
p = pload<Packet8s>(from);
|
|
1604
|
+
else
|
|
1605
|
+
p = ploadu<Packet8s>(from);
|
|
1606
|
+
return vec_mergeh(p, p);
|
|
1019
1607
|
}
|
|
1020
1608
|
|
|
1021
|
-
template<>
|
|
1022
|
-
{
|
|
1609
|
+
template <>
|
|
1610
|
+
EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int* from) {
|
|
1023
1611
|
Packet8us p;
|
|
1024
|
-
if((std::ptrdiff_t(from) % 16) == 0)
|
|
1025
|
-
|
|
1026
|
-
|
|
1612
|
+
if ((std::ptrdiff_t(from) % 16) == 0)
|
|
1613
|
+
p = pload<Packet8us>(from);
|
|
1614
|
+
else
|
|
1615
|
+
p = ploadu<Packet8us>(from);
|
|
1616
|
+
return vec_mergeh(p, p);
|
|
1027
1617
|
}
|
|
1028
1618
|
|
|
1029
|
-
template<>
|
|
1030
|
-
{
|
|
1619
|
+
template <>
|
|
1620
|
+
EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int* from) {
|
|
1031
1621
|
Packet8s p;
|
|
1032
|
-
if((std::ptrdiff_t(from) % 16) == 0)
|
|
1033
|
-
|
|
1622
|
+
if ((std::ptrdiff_t(from) % 16) == 0)
|
|
1623
|
+
p = pload<Packet8s>(from);
|
|
1624
|
+
else
|
|
1625
|
+
p = ploadu<Packet8s>(from);
|
|
1034
1626
|
return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
|
|
1035
1627
|
}
|
|
1036
1628
|
|
|
1037
|
-
template<>
|
|
1038
|
-
{
|
|
1629
|
+
template <>
|
|
1630
|
+
EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int* from) {
|
|
1039
1631
|
Packet8us p;
|
|
1040
|
-
if((std::ptrdiff_t(from) % 16) == 0)
|
|
1041
|
-
|
|
1632
|
+
if ((std::ptrdiff_t(from) % 16) == 0)
|
|
1633
|
+
p = pload<Packet8us>(from);
|
|
1634
|
+
else
|
|
1635
|
+
p = ploadu<Packet8us>(from);
|
|
1042
1636
|
return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
|
|
1043
1637
|
}
|
|
1044
1638
|
|
|
1045
|
-
template<>
|
|
1046
|
-
{
|
|
1639
|
+
template <>
|
|
1640
|
+
EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from) {
|
|
1047
1641
|
return ploadquad<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
|
|
1048
1642
|
}
|
|
1049
1643
|
|
|
1050
|
-
template<>
|
|
1051
|
-
{
|
|
1644
|
+
template <>
|
|
1645
|
+
EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char* from) {
|
|
1646
|
+
Packet16c p;
|
|
1647
|
+
if ((std::ptrdiff_t(from) % 16) == 0)
|
|
1648
|
+
p = pload<Packet16c>(from);
|
|
1649
|
+
else
|
|
1650
|
+
p = ploadu<Packet16c>(from);
|
|
1651
|
+
return vec_mergeh(p, p);
|
|
1652
|
+
}
|
|
1653
|
+
|
|
1654
|
+
template <>
|
|
1655
|
+
EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char* from) {
|
|
1656
|
+
Packet16uc p;
|
|
1657
|
+
if ((std::ptrdiff_t(from) % 16) == 0)
|
|
1658
|
+
p = pload<Packet16uc>(from);
|
|
1659
|
+
else
|
|
1660
|
+
p = ploadu<Packet16uc>(from);
|
|
1661
|
+
return vec_mergeh(p, p);
|
|
1662
|
+
}
|
|
1663
|
+
|
|
1664
|
+
template <>
|
|
1665
|
+
EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const signed char* from) {
|
|
1052
1666
|
Packet16c p;
|
|
1053
|
-
if((std::ptrdiff_t(from) % 16) == 0)
|
|
1054
|
-
|
|
1055
|
-
|
|
1667
|
+
if ((std::ptrdiff_t(from) % 16) == 0)
|
|
1668
|
+
p = pload<Packet16c>(from);
|
|
1669
|
+
else
|
|
1670
|
+
p = ploadu<Packet16c>(from);
|
|
1671
|
+
return vec_perm(p, p, p16uc_QUADRUPLICATE16);
|
|
1056
1672
|
}
|
|
1057
1673
|
|
|
1058
|
-
template<>
|
|
1059
|
-
{
|
|
1674
|
+
template <>
|
|
1675
|
+
EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const unsigned char* from) {
|
|
1060
1676
|
Packet16uc p;
|
|
1061
|
-
if((std::ptrdiff_t(from) % 16) == 0)
|
|
1062
|
-
|
|
1063
|
-
|
|
1677
|
+
if ((std::ptrdiff_t(from) % 16) == 0)
|
|
1678
|
+
p = pload<Packet16uc>(from);
|
|
1679
|
+
else
|
|
1680
|
+
p = ploadu<Packet16uc>(from);
|
|
1681
|
+
return vec_perm(p, p, p16uc_QUADRUPLICATE16);
|
|
1064
1682
|
}
|
|
1065
1683
|
|
|
1066
|
-
template<typename Packet>
|
|
1067
|
-
{
|
|
1684
|
+
template <typename Packet>
|
|
1685
|
+
EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet) * to, const Packet& from) {
|
|
1068
1686
|
EIGEN_DEBUG_UNALIGNED_STORE
|
|
1069
|
-
#
|
|
1687
|
+
#if defined(EIGEN_VECTORIZE_VSX)
|
|
1688
|
+
vec_xst(from, 0, to);
|
|
1689
|
+
#else
|
|
1070
1690
|
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
|
|
1071
1691
|
// Warning: not thread safe!
|
|
1072
1692
|
Packet16uc MSQ, LSQ, edges;
|
|
1073
1693
|
Packet16uc edgeAlign, align;
|
|
1074
1694
|
|
|
1075
|
-
MSQ = vec_ld(0, (unsigned char
|
|
1076
|
-
LSQ = vec_ld(15, (unsigned char
|
|
1077
|
-
edgeAlign = vec_lvsl(0, to);
|
|
1078
|
-
edges=vec_perm(LSQ,MSQ,edgeAlign);
|
|
1079
|
-
align = vec_lvsr(
|
|
1080
|
-
MSQ = vec_perm(edges,(Packet16uc)from,align);
|
|
1081
|
-
LSQ = vec_perm((Packet16uc)from,edges,align);
|
|
1082
|
-
vec_st(
|
|
1083
|
-
vec_st(
|
|
1084
|
-
#else
|
|
1085
|
-
vec_xst(from, 0, to);
|
|
1695
|
+
MSQ = vec_ld(0, (unsigned char*)to); // most significant quadword
|
|
1696
|
+
LSQ = vec_ld(15, (unsigned char*)to); // least significant quadword
|
|
1697
|
+
edgeAlign = vec_lvsl(0, to); // permute map to extract edges
|
|
1698
|
+
edges = vec_perm(LSQ, MSQ, edgeAlign); // extract the edges
|
|
1699
|
+
align = vec_lvsr(0, to); // permute map to misalign data
|
|
1700
|
+
MSQ = vec_perm(edges, (Packet16uc)from, align); // misalign the data (MSQ)
|
|
1701
|
+
LSQ = vec_perm((Packet16uc)from, edges, align); // misalign the data (LSQ)
|
|
1702
|
+
vec_st(LSQ, 15, (unsigned char*)to); // Store the LSQ part first
|
|
1703
|
+
vec_st(MSQ, 0, (unsigned char*)to); // Store the MSQ part second
|
|
1086
1704
|
#endif
|
|
1087
1705
|
}
|
|
1088
|
-
template<>
|
|
1089
|
-
{
|
|
1706
|
+
template <>
|
|
1707
|
+
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
|
|
1090
1708
|
pstoreu_common<Packet4f>(to, from);
|
|
1091
1709
|
}
|
|
1092
|
-
template<>
|
|
1093
|
-
{
|
|
1710
|
+
template <>
|
|
1711
|
+
EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
|
|
1094
1712
|
pstoreu_common<Packet4i>(to, from);
|
|
1095
1713
|
}
|
|
1096
|
-
template<>
|
|
1097
|
-
{
|
|
1714
|
+
template <>
|
|
1715
|
+
EIGEN_STRONG_INLINE void pstoreu<short int>(short int* to, const Packet8s& from) {
|
|
1098
1716
|
pstoreu_common<Packet8s>(to, from);
|
|
1099
1717
|
}
|
|
1100
|
-
template<>
|
|
1101
|
-
{
|
|
1718
|
+
template <>
|
|
1719
|
+
EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int* to, const Packet8us& from) {
|
|
1102
1720
|
pstoreu_common<Packet8us>(to, from);
|
|
1103
1721
|
}
|
|
1104
|
-
template<>
|
|
1105
|
-
{
|
|
1106
|
-
pstoreu_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from);
|
|
1722
|
+
template <>
|
|
1723
|
+
EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) {
|
|
1724
|
+
pstoreu_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val);
|
|
1107
1725
|
}
|
|
1108
|
-
template<>
|
|
1109
|
-
{
|
|
1726
|
+
template <>
|
|
1727
|
+
EIGEN_STRONG_INLINE void pstoreu<signed char>(signed char* to, const Packet16c& from) {
|
|
1110
1728
|
pstoreu_common<Packet16c>(to, from);
|
|
1111
1729
|
}
|
|
1112
|
-
template<>
|
|
1113
|
-
{
|
|
1730
|
+
template <>
|
|
1731
|
+
EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* to, const Packet16uc& from) {
|
|
1114
1732
|
pstoreu_common<Packet16uc>(to, from);
|
|
1115
1733
|
}
|
|
1116
1734
|
|
|
1117
|
-
template
|
|
1118
|
-
|
|
1735
|
+
template <typename Packet>
|
|
1736
|
+
EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet) * to, const Packet& from, const Index n,
|
|
1737
|
+
const Index offset) {
|
|
1738
|
+
const Index packet_size = unpacket_traits<Packet>::size;
|
|
1739
|
+
eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
|
|
1740
|
+
const Index size = sizeof(__UNPACK_TYPE__(Packet));
|
|
1741
|
+
#ifdef _ARCH_PWR9
|
|
1742
|
+
EIGEN_UNUSED_VARIABLE(packet_size);
|
|
1743
|
+
EIGEN_DEBUG_UNALIGNED_STORE
|
|
1744
|
+
Packet store = from;
|
|
1745
|
+
if (offset) {
|
|
1746
|
+
Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
|
|
1747
|
+
#ifdef _BIG_ENDIAN
|
|
1748
|
+
store = Packet(vec_slo(Packet16uc(store), shift));
|
|
1749
|
+
#else
|
|
1750
|
+
store = Packet(vec_sro(Packet16uc(store), shift));
|
|
1751
|
+
#endif
|
|
1752
|
+
}
|
|
1753
|
+
vec_xst_len(store, to, n * size);
|
|
1754
|
+
#else
|
|
1755
|
+
if (n) {
|
|
1756
|
+
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
|
|
1757
|
+
pstore(store, from);
|
|
1758
|
+
unsigned char* store2 = reinterpret_cast<unsigned char*>(store + offset);
|
|
1759
|
+
unsigned char* to2 = reinterpret_cast<unsigned char*>(to);
|
|
1760
|
+
Index n2 = n * size;
|
|
1761
|
+
if (16 <= n2) {
|
|
1762
|
+
pstoreu(to2, ploadu<Packet16uc>(store2));
|
|
1763
|
+
} else {
|
|
1764
|
+
memcpy((void*)to2, (void*)store2, n2);
|
|
1765
|
+
}
|
|
1766
|
+
}
|
|
1767
|
+
#endif
|
|
1768
|
+
}
|
|
1769
|
+
|
|
1770
|
+
template <>
|
|
1771
|
+
EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset) {
|
|
1772
|
+
pstoreu_partial_common<Packet4f>(to, from, n, offset);
|
|
1773
|
+
}
|
|
1774
|
+
template <>
|
|
1775
|
+
EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset) {
|
|
1776
|
+
pstoreu_partial_common<Packet4i>(to, from, n, offset);
|
|
1777
|
+
}
|
|
1778
|
+
template <>
|
|
1779
|
+
EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int* to, const Packet8s& from, const Index n,
|
|
1780
|
+
const Index offset) {
|
|
1781
|
+
pstoreu_partial_common<Packet8s>(to, from, n, offset);
|
|
1782
|
+
}
|
|
1783
|
+
template <>
|
|
1784
|
+
EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int* to, const Packet8us& from,
|
|
1785
|
+
const Index n, const Index offset) {
|
|
1786
|
+
pstoreu_partial_common<Packet8us>(to, from, n, offset);
|
|
1787
|
+
}
|
|
1788
|
+
template <>
|
|
1789
|
+
EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n,
|
|
1790
|
+
const Index offset) {
|
|
1791
|
+
pstoreu_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n, offset);
|
|
1792
|
+
}
|
|
1793
|
+
template <>
|
|
1794
|
+
EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char* to, const Packet16c& from, const Index n,
|
|
1795
|
+
const Index offset) {
|
|
1796
|
+
pstoreu_partial_common<Packet16c>(to, from, n, offset);
|
|
1797
|
+
}
|
|
1798
|
+
template <>
|
|
1799
|
+
EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n,
|
|
1800
|
+
const Index offset) {
|
|
1801
|
+
pstoreu_partial_common<Packet16uc>(to, from, n, offset);
|
|
1802
|
+
}
|
|
1803
|
+
|
|
1804
|
+
template <>
|
|
1805
|
+
EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
|
|
1806
|
+
EIGEN_PPC_PREFETCH(addr);
|
|
1807
|
+
}
|
|
1808
|
+
template <>
|
|
1809
|
+
EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
|
|
1810
|
+
EIGEN_PPC_PREFETCH(addr);
|
|
1811
|
+
}
|
|
1119
1812
|
|
|
1120
|
-
template<>
|
|
1121
|
-
|
|
1813
|
+
template <>
|
|
1814
|
+
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
|
|
1815
|
+
EIGEN_ALIGN16 float x;
|
|
1816
|
+
vec_ste(a, 0, &x);
|
|
1817
|
+
return x;
|
|
1818
|
+
}
|
|
1819
|
+
template <>
|
|
1820
|
+
EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
|
|
1821
|
+
EIGEN_ALIGN16 int x;
|
|
1822
|
+
vec_ste(a, 0, &x);
|
|
1823
|
+
return x;
|
|
1824
|
+
}
|
|
1122
1825
|
|
|
1123
|
-
template<typename Packet>
|
|
1826
|
+
template <typename Packet>
|
|
1827
|
+
EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) {
|
|
1124
1828
|
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x;
|
|
1125
1829
|
vec_ste(a, 0, &x);
|
|
1126
1830
|
return x;
|
|
1127
1831
|
}
|
|
1128
1832
|
|
|
1129
|
-
template<>
|
|
1833
|
+
template <>
|
|
1834
|
+
EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) {
|
|
1130
1835
|
return pfirst_common<Packet8s>(a);
|
|
1131
1836
|
}
|
|
1132
1837
|
|
|
1133
|
-
template<>
|
|
1838
|
+
template <>
|
|
1839
|
+
EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) {
|
|
1134
1840
|
return pfirst_common<Packet8us>(a);
|
|
1135
1841
|
}
|
|
1136
1842
|
|
|
1137
|
-
template<>
|
|
1138
|
-
{
|
|
1843
|
+
template <>
|
|
1844
|
+
EIGEN_STRONG_INLINE signed char pfirst<Packet16c>(const Packet16c& a) {
|
|
1139
1845
|
return pfirst_common<Packet16c>(a);
|
|
1140
1846
|
}
|
|
1141
1847
|
|
|
1142
|
-
template<>
|
|
1143
|
-
{
|
|
1848
|
+
template <>
|
|
1849
|
+
EIGEN_STRONG_INLINE unsigned char pfirst<Packet16uc>(const Packet16uc& a) {
|
|
1144
1850
|
return pfirst_common<Packet16uc>(a);
|
|
1145
1851
|
}
|
|
1146
1852
|
|
|
1147
|
-
template<>
|
|
1148
|
-
{
|
|
1149
|
-
return reinterpret_cast<Packet4f>(
|
|
1853
|
+
template <>
|
|
1854
|
+
EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
|
|
1855
|
+
return reinterpret_cast<Packet4f>(
|
|
1856
|
+
vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
|
|
1150
1857
|
}
|
|
1151
|
-
template<>
|
|
1152
|
-
{
|
|
1153
|
-
return reinterpret_cast<Packet4i>(
|
|
1858
|
+
template <>
|
|
1859
|
+
EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
|
|
1860
|
+
return reinterpret_cast<Packet4i>(
|
|
1861
|
+
vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
|
|
1154
1862
|
}
|
|
1155
|
-
template<>
|
|
1156
|
-
{
|
|
1157
|
-
return reinterpret_cast<Packet8s>(
|
|
1863
|
+
template <>
|
|
1864
|
+
EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) {
|
|
1865
|
+
return reinterpret_cast<Packet8s>(
|
|
1866
|
+
vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
|
|
1158
1867
|
}
|
|
1159
|
-
template<>
|
|
1160
|
-
{
|
|
1161
|
-
return reinterpret_cast<Packet8us>(
|
|
1868
|
+
template <>
|
|
1869
|
+
EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
|
|
1870
|
+
return reinterpret_cast<Packet8us>(
|
|
1871
|
+
vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
|
|
1162
1872
|
}
|
|
1163
|
-
template<>
|
|
1164
|
-
{
|
|
1873
|
+
template <>
|
|
1874
|
+
EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
|
|
1165
1875
|
return vec_perm(a, a, p16uc_REVERSE8);
|
|
1166
1876
|
}
|
|
1167
|
-
template<>
|
|
1168
|
-
{
|
|
1877
|
+
template <>
|
|
1878
|
+
EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
|
|
1169
1879
|
return vec_perm(a, a, p16uc_REVERSE8);
|
|
1170
1880
|
}
|
|
1171
|
-
template<>
|
|
1172
|
-
{
|
|
1881
|
+
template <>
|
|
1882
|
+
EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) {
|
|
1173
1883
|
return preverse<Packet8us>(a);
|
|
1174
1884
|
}
|
|
1175
1885
|
|
|
1176
|
-
template<>
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
|
|
1180
|
-
template<>
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1886
|
+
template <>
|
|
1887
|
+
EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
|
|
1888
|
+
return vec_abs(a);
|
|
1889
|
+
}
|
|
1890
|
+
template <>
|
|
1891
|
+
EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
|
|
1892
|
+
return vec_abs(a);
|
|
1893
|
+
}
|
|
1894
|
+
template <>
|
|
1895
|
+
EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) {
|
|
1896
|
+
return vec_abs(a);
|
|
1897
|
+
}
|
|
1898
|
+
template <>
|
|
1899
|
+
EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) {
|
|
1900
|
+
return a;
|
|
1901
|
+
}
|
|
1902
|
+
template <>
|
|
1903
|
+
EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) {
|
|
1904
|
+
return vec_abs(a);
|
|
1905
|
+
}
|
|
1906
|
+
template <>
|
|
1907
|
+
EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) {
|
|
1908
|
+
return a;
|
|
1909
|
+
}
|
|
1910
|
+
template <>
|
|
1911
|
+
EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
|
|
1912
|
+
EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask, 0x7FFF);
|
|
1184
1913
|
return pand<Packet8us>(p8us_abs_mask, a);
|
|
1185
1914
|
}
|
|
1186
1915
|
|
|
1187
|
-
template
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
template
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1916
|
+
template <>
|
|
1917
|
+
EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) {
|
|
1918
|
+
return vec_sra(a.m_val, vec_splat_u16(15));
|
|
1919
|
+
}
|
|
1920
|
+
template <>
|
|
1921
|
+
EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
|
|
1922
|
+
return (Packet4f)vec_sra((Packet4i)a, vec_splats((unsigned int)(31)));
|
|
1923
|
+
}
|
|
1924
|
+
|
|
1925
|
+
template <int N>
|
|
1926
|
+
EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
|
|
1927
|
+
return vec_sra(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
|
|
1928
|
+
}
|
|
1929
|
+
template <int N>
|
|
1930
|
+
EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
|
|
1931
|
+
return vec_sr(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
|
|
1932
|
+
}
|
|
1933
|
+
template <int N>
|
|
1934
|
+
EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
|
|
1935
|
+
return vec_sl(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
|
|
1936
|
+
}
|
|
1937
|
+
template <int N>
|
|
1938
|
+
EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a) {
|
|
1939
|
+
const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
|
|
1196
1940
|
Packet4ui r = vec_sl(reinterpret_cast<Packet4ui>(a), p4ui_mask);
|
|
1197
1941
|
return reinterpret_cast<Packet4f>(r);
|
|
1198
1942
|
}
|
|
1199
1943
|
|
|
1200
|
-
template<int N>
|
|
1201
|
-
{
|
|
1202
|
-
const
|
|
1944
|
+
template <int N>
|
|
1945
|
+
EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a) {
|
|
1946
|
+
const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
|
|
1203
1947
|
Packet4ui r = vec_sr(reinterpret_cast<Packet4ui>(a), p4ui_mask);
|
|
1204
1948
|
return reinterpret_cast<Packet4f>(r);
|
|
1205
1949
|
}
|
|
1206
1950
|
|
|
1207
|
-
template<int N>
|
|
1208
|
-
{
|
|
1209
|
-
const
|
|
1951
|
+
template <int N>
|
|
1952
|
+
EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
|
|
1953
|
+
const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
|
|
1210
1954
|
return vec_sr(a, p4ui_mask);
|
|
1211
1955
|
}
|
|
1212
1956
|
|
|
1213
|
-
template<int N>
|
|
1214
|
-
{
|
|
1215
|
-
const
|
|
1957
|
+
template <int N>
|
|
1958
|
+
EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
|
|
1959
|
+
const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
|
|
1216
1960
|
return vec_sl(a, p4ui_mask);
|
|
1217
1961
|
}
|
|
1218
1962
|
|
|
1219
|
-
template<int N>
|
|
1220
|
-
{
|
|
1221
|
-
const
|
|
1963
|
+
template <int N>
|
|
1964
|
+
EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a) {
|
|
1965
|
+
const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
|
|
1222
1966
|
return vec_sl(a, p8us_mask);
|
|
1223
1967
|
}
|
|
1224
|
-
template<int N>
|
|
1225
|
-
{
|
|
1226
|
-
const
|
|
1968
|
+
template <int N>
|
|
1969
|
+
EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a) {
|
|
1970
|
+
const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
|
|
1227
1971
|
return vec_sr(a, p8us_mask);
|
|
1228
1972
|
}
|
|
1229
1973
|
|
|
1230
|
-
EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf){
|
|
1974
|
+
EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf) {
|
|
1231
1975
|
return plogical_shift_left<16>(reinterpret_cast<Packet4f>(bf.m_val));
|
|
1232
1976
|
}
|
|
1233
1977
|
|
|
1234
|
-
EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf){
|
|
1235
|
-
const
|
|
1236
|
-
return pand<Packet4f>(
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1978
|
+
EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf) {
|
|
1979
|
+
const EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
|
|
1980
|
+
return pand<Packet4f>(reinterpret_cast<Packet4f>(bf.m_val), reinterpret_cast<Packet4f>(p4ui_high_mask));
|
|
1981
|
+
}
|
|
1982
|
+
|
|
1983
|
+
EIGEN_ALWAYS_INLINE Packet8us pmerge(Packet4ui even, Packet4ui odd) {
|
|
1984
|
+
#ifdef _BIG_ENDIAN
|
|
1985
|
+
return vec_perm(reinterpret_cast<Packet8us>(odd), reinterpret_cast<Packet8us>(even), p16uc_MERGEO16);
|
|
1986
|
+
#else
|
|
1987
|
+
return vec_perm(reinterpret_cast<Packet8us>(even), reinterpret_cast<Packet8us>(odd), p16uc_MERGEE16);
|
|
1988
|
+
#endif
|
|
1240
1989
|
}
|
|
1241
1990
|
|
|
1242
1991
|
// Simple interleaving of bool masks, prevents true values from being
|
|
1243
1992
|
// converted to NaNs.
|
|
1244
1993
|
EIGEN_STRONG_INLINE Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd) {
|
|
1245
|
-
|
|
1246
|
-
Packet4f bf_odd, bf_even;
|
|
1247
|
-
bf_odd = pand(reinterpret_cast<Packet4f>(p4ui_high_mask), odd);
|
|
1248
|
-
bf_even = plogical_shift_right<16>(even);
|
|
1249
|
-
return reinterpret_cast<Packet8us>(por<Packet4f>(bf_even, bf_odd));
|
|
1994
|
+
return pmerge(reinterpret_cast<Packet4ui>(even), reinterpret_cast<Packet4ui>(odd));
|
|
1250
1995
|
}
|
|
1251
1996
|
|
|
1252
|
-
|
|
1997
|
+
// #define SUPPORT_BF16_SUBNORMALS
|
|
1998
|
+
|
|
1999
|
+
#ifndef __VEC_CLASS_FP_NAN
|
|
2000
|
+
#define __VEC_CLASS_FP_NAN (1 << 6)
|
|
2001
|
+
#endif
|
|
2002
|
+
|
|
2003
|
+
#if defined(SUPPORT_BF16_SUBNORMALS) && !defined(__VEC_CLASS_FP_SUBNORMAL)
|
|
2004
|
+
#define __VEC_CLASS_FP_SUBNORMAL_P (1 << 1)
|
|
2005
|
+
#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 0)
|
|
2006
|
+
|
|
2007
|
+
#define __VEC_CLASS_FP_SUBNORMAL (__VEC_CLASS_FP_SUBNORMAL_P | __VEC_CLASS_FP_SUBNORMAL_N)
|
|
2008
|
+
#endif
|
|
2009
|
+
|
|
2010
|
+
EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f) {
|
|
2011
|
+
#ifdef _ARCH_PWR10
|
|
2012
|
+
return reinterpret_cast<Packet8us>(__builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(p4f)));
|
|
2013
|
+
#else
|
|
1253
2014
|
Packet4ui input = reinterpret_cast<Packet4ui>(p4f);
|
|
1254
2015
|
Packet4ui lsb = plogical_shift_right<16>(input);
|
|
1255
2016
|
lsb = pand<Packet4ui>(lsb, reinterpret_cast<Packet4ui>(p4i_ONE));
|
|
1256
2017
|
|
|
1257
|
-
|
|
2018
|
+
EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS, 0x7FFFu);
|
|
1258
2019
|
Packet4ui rounding_bias = padd<Packet4ui>(lsb, p4ui_BIAS);
|
|
1259
2020
|
input = padd<Packet4ui>(input, rounding_bias);
|
|
1260
2021
|
|
|
1261
|
-
|
|
1262
|
-
|
|
2022
|
+
const EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000);
|
|
2023
|
+
#if defined(_ARCH_PWR9) && defined(EIGEN_VECTORIZE_VSX)
|
|
2024
|
+
Packet4bi nan_selector = vec_test_data_class(p4f, __VEC_CLASS_FP_NAN);
|
|
2025
|
+
input = vec_sel(input, p4ui_nan, nan_selector);
|
|
2026
|
+
|
|
2027
|
+
#ifdef SUPPORT_BF16_SUBNORMALS
|
|
2028
|
+
Packet4bi subnormal_selector = vec_test_data_class(p4f, __VEC_CLASS_FP_SUBNORMAL);
|
|
2029
|
+
input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
|
|
2030
|
+
#endif
|
|
2031
|
+
#else
|
|
2032
|
+
#ifdef SUPPORT_BF16_SUBNORMALS
|
|
2033
|
+
// Test NaN and Subnormal
|
|
2034
|
+
const EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000);
|
|
1263
2035
|
Packet4ui exp = pand<Packet4ui>(p4ui_exp_mask, reinterpret_cast<Packet4ui>(p4f));
|
|
1264
2036
|
|
|
1265
|
-
const
|
|
2037
|
+
const EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF);
|
|
1266
2038
|
Packet4ui mantissa = pand<Packet4ui>(p4ui_mantissa_mask, reinterpret_cast<Packet4ui>(p4f));
|
|
1267
2039
|
|
|
1268
|
-
|
|
1269
|
-
Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_max_exp);
|
|
1270
|
-
Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet4ui>(p4i_ZERO));
|
|
1271
|
-
|
|
2040
|
+
Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_exp_mask);
|
|
1272
2041
|
Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet4ui>(p4i_ZERO));
|
|
1273
|
-
Packet4ui nan_selector = pandnot<Packet4ui>(
|
|
1274
|
-
reinterpret_cast<Packet4ui>(is_max_exp),
|
|
1275
|
-
reinterpret_cast<Packet4ui>(is_mant_zero)
|
|
1276
|
-
);
|
|
1277
2042
|
|
|
1278
|
-
Packet4ui
|
|
1279
|
-
reinterpret_cast<Packet4ui>(
|
|
1280
|
-
|
|
1281
|
-
);
|
|
2043
|
+
Packet4ui nan_selector =
|
|
2044
|
+
pandnot<Packet4ui>(reinterpret_cast<Packet4ui>(is_max_exp), reinterpret_cast<Packet4ui>(is_mant_zero));
|
|
2045
|
+
|
|
2046
|
+
Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet4ui>(p4i_ZERO));
|
|
2047
|
+
|
|
2048
|
+
Packet4ui subnormal_selector =
|
|
2049
|
+
pandnot<Packet4ui>(reinterpret_cast<Packet4ui>(is_zero_exp), reinterpret_cast<Packet4ui>(is_mant_zero));
|
|
1282
2050
|
|
|
1283
|
-
const _EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000);
|
|
1284
2051
|
input = vec_sel(input, p4ui_nan, nan_selector);
|
|
1285
2052
|
input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
|
|
1286
|
-
|
|
2053
|
+
#else
|
|
2054
|
+
// Test only NaN
|
|
2055
|
+
Packet4bi nan_selector = vec_cmpeq(p4f, p4f);
|
|
2056
|
+
|
|
2057
|
+
input = vec_sel(p4ui_nan, input, nan_selector);
|
|
2058
|
+
#endif
|
|
2059
|
+
#endif
|
|
1287
2060
|
|
|
1288
2061
|
input = plogical_shift_right<16>(input);
|
|
1289
2062
|
return reinterpret_cast<Packet8us>(input);
|
|
2063
|
+
#endif
|
|
2064
|
+
}
|
|
2065
|
+
|
|
2066
|
+
#ifdef _BIG_ENDIAN
|
|
2067
|
+
/**
|
|
2068
|
+
* Pack the high portion of two float Packets into one bfloat16 Packet
|
|
2069
|
+
*
|
|
2070
|
+
* @tparam lohi to expect either a low & high OR odd & even order
|
|
2071
|
+
*/
|
|
2072
|
+
template <bool lohi>
|
|
2073
|
+
EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f lo, Packet4f hi) {
|
|
2074
|
+
if (lohi) {
|
|
2075
|
+
return vec_perm(reinterpret_cast<Packet8us>(lo), reinterpret_cast<Packet8us>(hi), p16uc_MERGEH16);
|
|
2076
|
+
} else {
|
|
2077
|
+
return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEE16);
|
|
2078
|
+
}
|
|
2079
|
+
}
|
|
2080
|
+
|
|
2081
|
+
/**
|
|
2082
|
+
* Pack the low portion of two float Packets into one bfloat16 Packet
|
|
2083
|
+
*
|
|
2084
|
+
* @param lohi to expect either a low & high OR odd & even order
|
|
2085
|
+
*/
|
|
2086
|
+
template <bool lohi>
|
|
2087
|
+
EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f lo, Packet4f hi) {
|
|
2088
|
+
if (lohi) {
|
|
2089
|
+
return vec_pack(reinterpret_cast<Packet4ui>(lo), reinterpret_cast<Packet4ui>(hi));
|
|
2090
|
+
} else {
|
|
2091
|
+
return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEO16);
|
|
2092
|
+
}
|
|
2093
|
+
}
|
|
2094
|
+
#else
|
|
2095
|
+
template <bool lohi>
|
|
2096
|
+
EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f hi, Packet4f lo) {
|
|
2097
|
+
if (lohi) {
|
|
2098
|
+
return vec_pack(reinterpret_cast<Packet4ui>(hi), reinterpret_cast<Packet4ui>(lo));
|
|
2099
|
+
} else {
|
|
2100
|
+
return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEE16);
|
|
2101
|
+
}
|
|
2102
|
+
}
|
|
2103
|
+
|
|
2104
|
+
template <bool lohi>
|
|
2105
|
+
EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f hi, Packet4f lo) {
|
|
2106
|
+
if (lohi) {
|
|
2107
|
+
return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEL16);
|
|
2108
|
+
} else {
|
|
2109
|
+
return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEO16);
|
|
2110
|
+
}
|
|
2111
|
+
}
|
|
2112
|
+
#endif
|
|
2113
|
+
|
|
2114
|
+
/**
|
|
2115
|
+
* Convert and pack two float Packets into one bfloat16 Packet
|
|
2116
|
+
*
|
|
2117
|
+
* @tparam lohi to expect either a low & high OR odd & even order
|
|
2118
|
+
*/
|
|
2119
|
+
template <bool lohi = true>
|
|
2120
|
+
EIGEN_ALWAYS_INLINE Packet8bf F32ToBf16Two(Packet4f lo, Packet4f hi) {
|
|
2121
|
+
Packet8us p4f = Bf16PackHigh<lohi>(lo, hi);
|
|
2122
|
+
Packet8us p4f2 = Bf16PackLow<lohi>(lo, hi);
|
|
2123
|
+
|
|
2124
|
+
Packet8us lsb = pand<Packet8us>(p4f, p8us_ONE);
|
|
2125
|
+
EIGEN_DECLARE_CONST_FAST_Packet8us(BIAS, 0x7FFFu);
|
|
2126
|
+
lsb = padd<Packet8us>(lsb, p8us_BIAS);
|
|
2127
|
+
lsb = padd<Packet8us>(lsb, p4f2);
|
|
2128
|
+
|
|
2129
|
+
Packet8bi rounding_bias = vec_cmplt(lsb, p4f2);
|
|
2130
|
+
Packet8us input = psub<Packet8us>(p4f, reinterpret_cast<Packet8us>(rounding_bias));
|
|
2131
|
+
|
|
2132
|
+
#if defined(_ARCH_PWR9) && defined(EIGEN_VECTORIZE_VSX)
|
|
2133
|
+
Packet4bi nan_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_NAN);
|
|
2134
|
+
Packet4bi nan_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_NAN);
|
|
2135
|
+
Packet8us nan_selector =
|
|
2136
|
+
Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));
|
|
2137
|
+
|
|
2138
|
+
input = vec_sel(input, p8us_BIAS, nan_selector);
|
|
2139
|
+
|
|
2140
|
+
#ifdef SUPPORT_BF16_SUBNORMALS
|
|
2141
|
+
Packet4bi subnormal_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_SUBNORMAL);
|
|
2142
|
+
Packet4bi subnormal_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_SUBNORMAL);
|
|
2143
|
+
Packet8us subnormal_selector = Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(subnormal_selector_lo),
|
|
2144
|
+
reinterpret_cast<Packet4f>(subnormal_selector_hi));
|
|
2145
|
+
|
|
2146
|
+
input = vec_sel(input, reinterpret_cast<Packet8us>(p4f), subnormal_selector);
|
|
2147
|
+
#endif
|
|
2148
|
+
#else
|
|
2149
|
+
#ifdef SUPPORT_BF16_SUBNORMALS
|
|
2150
|
+
// Test NaN and Subnormal
|
|
2151
|
+
const EIGEN_DECLARE_CONST_FAST_Packet8us(exp_mask, 0x7F80);
|
|
2152
|
+
Packet8us exp = pand<Packet8us>(p8us_exp_mask, p4f);
|
|
2153
|
+
|
|
2154
|
+
const EIGEN_DECLARE_CONST_FAST_Packet8us(mantissa_mask, 0x7Fu);
|
|
2155
|
+
Packet8us mantissa = pand<Packet8us>(p8us_mantissa_mask, p4f);
|
|
2156
|
+
|
|
2157
|
+
Packet8bi is_max_exp = vec_cmpeq(exp, p8us_exp_mask);
|
|
2158
|
+
Packet8bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet8us>(p4i_ZERO));
|
|
2159
|
+
|
|
2160
|
+
Packet8us nan_selector =
|
|
2161
|
+
pandnot<Packet8us>(reinterpret_cast<Packet8us>(is_max_exp), reinterpret_cast<Packet8us>(is_mant_zero));
|
|
2162
|
+
|
|
2163
|
+
Packet8bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet8us>(p4i_ZERO));
|
|
2164
|
+
|
|
2165
|
+
Packet8us subnormal_selector =
|
|
2166
|
+
pandnot<Packet8us>(reinterpret_cast<Packet8us>(is_zero_exp), reinterpret_cast<Packet8us>(is_mant_zero));
|
|
2167
|
+
|
|
2168
|
+
// Using BIAS as NaN (since any or all of the last 7 bits can be set)
|
|
2169
|
+
input = vec_sel(input, p8us_BIAS, nan_selector);
|
|
2170
|
+
input = vec_sel(input, reinterpret_cast<Packet8us>(p4f), subnormal_selector);
|
|
2171
|
+
#else
|
|
2172
|
+
// Test only NaN
|
|
2173
|
+
Packet4bi nan_selector_lo = vec_cmpeq(lo, lo);
|
|
2174
|
+
Packet4bi nan_selector_hi = vec_cmpeq(hi, hi);
|
|
2175
|
+
Packet8us nan_selector =
|
|
2176
|
+
Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));
|
|
2177
|
+
|
|
2178
|
+
input = vec_sel(p8us_BIAS, input, nan_selector);
|
|
2179
|
+
#endif
|
|
2180
|
+
#endif
|
|
2181
|
+
|
|
2182
|
+
return input;
|
|
2183
|
+
}
|
|
2184
|
+
|
|
2185
|
+
/**
|
|
2186
|
+
* Convert and pack two float Packets into one bfloat16 Packet - low & high order
|
|
2187
|
+
*/
|
|
2188
|
+
EIGEN_STRONG_INLINE Packet8bf F32ToBf16Both(Packet4f lo, Packet4f hi) {
|
|
2189
|
+
#ifdef _ARCH_PWR10
|
|
2190
|
+
Packet8bf fp16_0 = F32ToBf16(lo);
|
|
2191
|
+
Packet8bf fp16_1 = F32ToBf16(hi);
|
|
2192
|
+
return vec_pack(reinterpret_cast<Packet4ui>(fp16_0.m_val), reinterpret_cast<Packet4ui>(fp16_1.m_val));
|
|
2193
|
+
#else
|
|
2194
|
+
return F32ToBf16Two(lo, hi);
|
|
2195
|
+
#endif
|
|
1290
2196
|
}
|
|
1291
2197
|
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
return reinterpret_cast<
|
|
2198
|
+
/**
|
|
2199
|
+
* Convert and pack two float Packets into one bfloat16 Packet - odd & even order
|
|
2200
|
+
*/
|
|
2201
|
+
EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd) {
|
|
2202
|
+
#ifdef _ARCH_PWR10
|
|
2203
|
+
return pmerge(reinterpret_cast<Packet4ui>(F32ToBf16(even).m_val), reinterpret_cast<Packet4ui>(F32ToBf16(odd).m_val));
|
|
2204
|
+
#else
|
|
2205
|
+
return F32ToBf16Two<false>(even, odd);
|
|
2206
|
+
#endif
|
|
1298
2207
|
}
|
|
1299
2208
|
#define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A) \
|
|
1300
|
-
Packet4f a_even = Bf16ToF32Even(A)
|
|
1301
|
-
Packet4f a_odd = Bf16ToF32Odd(A)
|
|
1302
|
-
Packet4f op_even = OP(a_even)
|
|
1303
|
-
Packet4f op_odd = OP(a_odd)
|
|
1304
|
-
return F32ToBf16(op_even, op_odd)
|
|
2209
|
+
Packet4f a_even = Bf16ToF32Even(A); \
|
|
2210
|
+
Packet4f a_odd = Bf16ToF32Odd(A); \
|
|
2211
|
+
Packet4f op_even = OP(a_even); \
|
|
2212
|
+
Packet4f op_odd = OP(a_odd); \
|
|
2213
|
+
return F32ToBf16(op_even, op_odd);
|
|
1305
2214
|
|
|
1306
2215
|
#define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \
|
|
1307
|
-
Packet4f a_even = Bf16ToF32Even(A)
|
|
1308
|
-
Packet4f a_odd = Bf16ToF32Odd(A)
|
|
1309
|
-
Packet4f b_even = Bf16ToF32Even(B)
|
|
1310
|
-
Packet4f b_odd = Bf16ToF32Odd(B)
|
|
1311
|
-
Packet4f op_even = OP(a_even, b_even)
|
|
1312
|
-
Packet4f op_odd = OP(a_odd, b_odd)
|
|
1313
|
-
return F32ToBf16(op_even, op_odd)
|
|
2216
|
+
Packet4f a_even = Bf16ToF32Even(A); \
|
|
2217
|
+
Packet4f a_odd = Bf16ToF32Odd(A); \
|
|
2218
|
+
Packet4f b_even = Bf16ToF32Even(B); \
|
|
2219
|
+
Packet4f b_odd = Bf16ToF32Odd(B); \
|
|
2220
|
+
Packet4f op_even = OP(a_even, b_even); \
|
|
2221
|
+
Packet4f op_odd = OP(a_odd, b_odd); \
|
|
2222
|
+
return F32ToBf16(op_even, op_odd);
|
|
1314
2223
|
|
|
1315
2224
|
#define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \
|
|
1316
|
-
Packet4f a_even = Bf16ToF32Even(A)
|
|
1317
|
-
Packet4f a_odd = Bf16ToF32Odd(A)
|
|
1318
|
-
Packet4f b_even = Bf16ToF32Even(B)
|
|
1319
|
-
Packet4f b_odd = Bf16ToF32Odd(B)
|
|
1320
|
-
Packet4f op_even = OP(a_even, b_even)
|
|
1321
|
-
Packet4f op_odd = OP(a_odd, b_odd)
|
|
1322
|
-
return F32ToBf16Bool(op_even, op_odd)
|
|
1323
|
-
|
|
1324
|
-
template<>
|
|
2225
|
+
Packet4f a_even = Bf16ToF32Even(A); \
|
|
2226
|
+
Packet4f a_odd = Bf16ToF32Odd(A); \
|
|
2227
|
+
Packet4f b_even = Bf16ToF32Even(B); \
|
|
2228
|
+
Packet4f b_odd = Bf16ToF32Odd(B); \
|
|
2229
|
+
Packet4f op_even = OP(a_even, b_even); \
|
|
2230
|
+
Packet4f op_odd = OP(a_odd, b_odd); \
|
|
2231
|
+
return F32ToBf16Bool(op_even, op_odd);
|
|
2232
|
+
|
|
2233
|
+
template <>
|
|
2234
|
+
EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
|
|
1325
2235
|
BF16_TO_F32_BINARY_OP_WRAPPER(padd<Packet4f>, a, b);
|
|
1326
2236
|
}
|
|
1327
2237
|
|
|
1328
|
-
template<>
|
|
2238
|
+
template <>
|
|
2239
|
+
EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
|
|
1329
2240
|
BF16_TO_F32_BINARY_OP_WRAPPER(pmul<Packet4f>, a, b);
|
|
1330
2241
|
}
|
|
1331
2242
|
|
|
1332
|
-
template<>
|
|
2243
|
+
template <>
|
|
2244
|
+
EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
|
|
1333
2245
|
BF16_TO_F32_BINARY_OP_WRAPPER(pdiv<Packet4f>, a, b);
|
|
1334
2246
|
}
|
|
1335
2247
|
|
|
1336
|
-
template<>
|
|
1337
|
-
|
|
2248
|
+
template <>
|
|
2249
|
+
EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(const Packet8bf& a) {
|
|
2250
|
+
EIGEN_DECLARE_CONST_FAST_Packet8us(neg_mask, 0x8000);
|
|
2251
|
+
return pxor<Packet8us>(p8us_neg_mask, a);
|
|
1338
2252
|
}
|
|
1339
2253
|
|
|
1340
|
-
template<>
|
|
2254
|
+
template <>
|
|
2255
|
+
EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
|
|
1341
2256
|
BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b);
|
|
1342
2257
|
}
|
|
1343
2258
|
|
|
1344
|
-
template<>
|
|
1345
|
-
|
|
1346
|
-
}
|
|
1347
|
-
template<> EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf> (const Packet8bf& a){
|
|
1348
|
-
BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt<Packet4f>, a);
|
|
1349
|
-
}
|
|
1350
|
-
template<> EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf> (const Packet8bf& a){
|
|
2259
|
+
template <>
|
|
2260
|
+
EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf>(const Packet8bf& a) {
|
|
1351
2261
|
BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
|
|
1352
2262
|
}
|
|
1353
2263
|
|
|
1354
|
-
template<>
|
|
1355
|
-
|
|
2264
|
+
template <>
|
|
2265
|
+
EIGEN_STRONG_INLINE Packet8bf pexp2<Packet8bf>(const Packet8bf& a) {
|
|
2266
|
+
BF16_TO_F32_UNARY_OP_WRAPPER(generic_exp2, a);
|
|
2267
|
+
}
|
|
2268
|
+
|
|
2269
|
+
template <>
|
|
2270
|
+
EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
|
|
2271
|
+
return pldexp_generic(a, exponent);
|
|
1356
2272
|
}
|
|
1357
|
-
template<>
|
|
2273
|
+
template <>
|
|
2274
|
+
EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf>(const Packet8bf& a, const Packet8bf& exponent) {
|
|
1358
2275
|
BF16_TO_F32_BINARY_OP_WRAPPER(pldexp<Packet4f>, a, exponent);
|
|
1359
2276
|
}
|
|
1360
2277
|
|
|
1361
|
-
template<>
|
|
1362
|
-
|
|
2278
|
+
template <>
|
|
2279
|
+
EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
|
|
2280
|
+
return pfrexp_generic(a, exponent);
|
|
1363
2281
|
}
|
|
1364
|
-
template<>
|
|
2282
|
+
template <>
|
|
2283
|
+
EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf>(const Packet8bf& a, Packet8bf& e) {
|
|
1365
2284
|
Packet4f a_even = Bf16ToF32Even(a);
|
|
1366
2285
|
Packet4f a_odd = Bf16ToF32Odd(a);
|
|
1367
2286
|
Packet4f e_even;
|
|
@@ -1372,28 +2291,42 @@ template<> EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf> (const Packet8bf& a,
|
|
|
1372
2291
|
return F32ToBf16(op_even, op_odd);
|
|
1373
2292
|
}
|
|
1374
2293
|
|
|
1375
|
-
template<>
|
|
2294
|
+
template <>
|
|
2295
|
+
EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf>(const Packet8bf& a) {
|
|
1376
2296
|
BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a);
|
|
1377
2297
|
}
|
|
1378
|
-
template<>
|
|
2298
|
+
template <>
|
|
2299
|
+
EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf>(const Packet8bf& a) {
|
|
1379
2300
|
BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a);
|
|
1380
2301
|
}
|
|
1381
|
-
template<>
|
|
2302
|
+
template <>
|
|
2303
|
+
EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf>(const Packet8bf& a) {
|
|
1382
2304
|
BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a);
|
|
1383
2305
|
}
|
|
1384
|
-
template<>
|
|
2306
|
+
template <>
|
|
2307
|
+
EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf>(const Packet8bf& a) {
|
|
1385
2308
|
BF16_TO_F32_UNARY_OP_WRAPPER(pfloor<Packet4f>, a);
|
|
1386
2309
|
}
|
|
1387
|
-
template<>
|
|
2310
|
+
template <>
|
|
2311
|
+
EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf>(const Packet8bf& a) {
|
|
1388
2312
|
BF16_TO_F32_UNARY_OP_WRAPPER(pceil<Packet4f>, a);
|
|
1389
2313
|
}
|
|
1390
|
-
template<>
|
|
2314
|
+
template <>
|
|
2315
|
+
EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf>(const Packet8bf& a) {
|
|
1391
2316
|
BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
|
|
1392
2317
|
}
|
|
1393
|
-
template<>
|
|
2318
|
+
template <>
|
|
2319
|
+
EIGEN_STRONG_INLINE Packet8bf ptrunc<Packet8bf>(const Packet8bf& a) {
|
|
2320
|
+
BF16_TO_F32_UNARY_OP_WRAPPER(ptrunc<Packet4f>, a);
|
|
2321
|
+
}
|
|
2322
|
+
#ifdef EIGEN_VECTORIZE_VSX
|
|
2323
|
+
template <>
|
|
2324
|
+
EIGEN_STRONG_INLINE Packet8bf print<Packet8bf>(const Packet8bf& a) {
|
|
1394
2325
|
BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
|
|
1395
2326
|
}
|
|
1396
|
-
|
|
2327
|
+
#endif
|
|
2328
|
+
template <>
|
|
2329
|
+
EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
|
|
1397
2330
|
Packet4f a_even = Bf16ToF32Even(a);
|
|
1398
2331
|
Packet4f a_odd = Bf16ToF32Odd(a);
|
|
1399
2332
|
Packet4f b_even = Bf16ToF32Even(b);
|
|
@@ -1405,147 +2338,191 @@ template<> EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8
|
|
|
1405
2338
|
return F32ToBf16(pmadd_even, pmadd_odd);
|
|
1406
2339
|
}
|
|
1407
2340
|
|
|
1408
|
-
template<>
|
|
2341
|
+
template <>
|
|
2342
|
+
EIGEN_STRONG_INLINE Packet8bf pmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
|
|
2343
|
+
Packet4f a_even = Bf16ToF32Even(a);
|
|
2344
|
+
Packet4f a_odd = Bf16ToF32Odd(a);
|
|
2345
|
+
Packet4f b_even = Bf16ToF32Even(b);
|
|
2346
|
+
Packet4f b_odd = Bf16ToF32Odd(b);
|
|
2347
|
+
Packet4f c_even = Bf16ToF32Even(c);
|
|
2348
|
+
Packet4f c_odd = Bf16ToF32Odd(c);
|
|
2349
|
+
Packet4f pmadd_even = pmsub<Packet4f>(a_even, b_even, c_even);
|
|
2350
|
+
Packet4f pmadd_odd = pmsub<Packet4f>(a_odd, b_odd, c_odd);
|
|
2351
|
+
return F32ToBf16(pmadd_even, pmadd_odd);
|
|
2352
|
+
}
|
|
2353
|
+
template <>
|
|
2354
|
+
EIGEN_STRONG_INLINE Packet8bf pnmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
|
|
2355
|
+
Packet4f a_even = Bf16ToF32Even(a);
|
|
2356
|
+
Packet4f a_odd = Bf16ToF32Odd(a);
|
|
2357
|
+
Packet4f b_even = Bf16ToF32Even(b);
|
|
2358
|
+
Packet4f b_odd = Bf16ToF32Odd(b);
|
|
2359
|
+
Packet4f c_even = Bf16ToF32Even(c);
|
|
2360
|
+
Packet4f c_odd = Bf16ToF32Odd(c);
|
|
2361
|
+
Packet4f pmadd_even = pnmadd<Packet4f>(a_even, b_even, c_even);
|
|
2362
|
+
Packet4f pmadd_odd = pnmadd<Packet4f>(a_odd, b_odd, c_odd);
|
|
2363
|
+
return F32ToBf16(pmadd_even, pmadd_odd);
|
|
2364
|
+
}
|
|
2365
|
+
|
|
2366
|
+
template <>
|
|
2367
|
+
EIGEN_STRONG_INLINE Packet8bf pnmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
|
|
2368
|
+
Packet4f a_even = Bf16ToF32Even(a);
|
|
2369
|
+
Packet4f a_odd = Bf16ToF32Odd(a);
|
|
2370
|
+
Packet4f b_even = Bf16ToF32Even(b);
|
|
2371
|
+
Packet4f b_odd = Bf16ToF32Odd(b);
|
|
2372
|
+
Packet4f c_even = Bf16ToF32Even(c);
|
|
2373
|
+
Packet4f c_odd = Bf16ToF32Odd(c);
|
|
2374
|
+
Packet4f pmadd_even = pnmsub<Packet4f>(a_even, b_even, c_even);
|
|
2375
|
+
Packet4f pmadd_odd = pnmsub<Packet4f>(a_odd, b_odd, c_odd);
|
|
2376
|
+
return F32ToBf16(pmadd_even, pmadd_odd);
|
|
2377
|
+
}
|
|
2378
|
+
|
|
2379
|
+
template <>
|
|
2380
|
+
EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
|
|
1409
2381
|
BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b);
|
|
1410
2382
|
}
|
|
1411
2383
|
|
|
1412
|
-
template<>
|
|
2384
|
+
template <>
|
|
2385
|
+
EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
|
|
1413
2386
|
BF16_TO_F32_BINARY_OP_WRAPPER(pmax<Packet4f>, a, b);
|
|
1414
2387
|
}
|
|
1415
2388
|
|
|
1416
|
-
template<>
|
|
2389
|
+
template <>
|
|
2390
|
+
EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
|
|
1417
2391
|
BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt<Packet4f>, a, b);
|
|
1418
2392
|
}
|
|
1419
|
-
template<>
|
|
2393
|
+
template <>
|
|
2394
|
+
EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
|
|
1420
2395
|
BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan<Packet4f>, a, b);
|
|
1421
2396
|
}
|
|
1422
|
-
template<>
|
|
2397
|
+
template <>
|
|
2398
|
+
EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
|
|
1423
2399
|
BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le<Packet4f>, a, b);
|
|
1424
2400
|
}
|
|
1425
|
-
template<>
|
|
2401
|
+
template <>
|
|
2402
|
+
EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
|
|
1426
2403
|
BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_eq<Packet4f>, a, b);
|
|
1427
2404
|
}
|
|
1428
2405
|
|
|
1429
|
-
template<>
|
|
2406
|
+
template <>
|
|
2407
|
+
EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) {
|
|
1430
2408
|
return Eigen::bfloat16_impl::raw_uint16_to_bfloat16((pfirst<Packet8us>(a)));
|
|
1431
2409
|
}
|
|
1432
2410
|
|
|
1433
|
-
template<>
|
|
1434
|
-
{
|
|
2411
|
+
template <>
|
|
2412
|
+
EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from) {
|
|
1435
2413
|
return ploaddup<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
|
|
1436
2414
|
}
|
|
1437
2415
|
|
|
1438
|
-
template<>
|
|
1439
|
-
|
|
1440
|
-
|
|
2416
|
+
template <>
|
|
2417
|
+
EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
|
|
2418
|
+
bfloat16 countdown[8] = {bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3),
|
|
2419
|
+
bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7)};
|
|
1441
2420
|
return padd<Packet8bf>(pset1<Packet8bf>(a), pload<Packet8bf>(countdown));
|
|
1442
2421
|
}
|
|
1443
2422
|
|
|
1444
|
-
template<>
|
|
1445
|
-
{
|
|
2423
|
+
template <>
|
|
2424
|
+
EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
|
|
1446
2425
|
Packet4f b, sum;
|
|
1447
|
-
b
|
|
2426
|
+
b = vec_sld(a, a, 8);
|
|
1448
2427
|
sum = a + b;
|
|
1449
|
-
b
|
|
2428
|
+
b = vec_sld(sum, sum, 4);
|
|
1450
2429
|
sum += b;
|
|
1451
2430
|
return pfirst(sum);
|
|
1452
2431
|
}
|
|
1453
2432
|
|
|
1454
|
-
template<>
|
|
1455
|
-
{
|
|
1456
|
-
Packet4i sum;
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
sum = vec_sld(p4i_ZERO, sum, 4);
|
|
1462
|
-
#endif
|
|
2433
|
+
template <>
|
|
2434
|
+
EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
|
|
2435
|
+
Packet4i b, sum;
|
|
2436
|
+
b = vec_sld(a, a, 8);
|
|
2437
|
+
sum = a + b;
|
|
2438
|
+
b = vec_sld(sum, sum, 4);
|
|
2439
|
+
sum += b;
|
|
1463
2440
|
return pfirst(sum);
|
|
1464
2441
|
}
|
|
1465
2442
|
|
|
1466
|
-
template<>
|
|
1467
|
-
{
|
|
2443
|
+
template <>
|
|
2444
|
+
EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) {
|
|
1468
2445
|
float redux_even = predux<Packet4f>(Bf16ToF32Even(a));
|
|
1469
|
-
float redux_odd
|
|
2446
|
+
float redux_odd = predux<Packet4f>(Bf16ToF32Odd(a));
|
|
1470
2447
|
float f32_result = redux_even + redux_odd;
|
|
1471
2448
|
return bfloat16(f32_result);
|
|
1472
2449
|
}
|
|
1473
|
-
template<typename Packet>
|
|
1474
|
-
{
|
|
1475
|
-
union{
|
|
2450
|
+
template <typename Packet>
|
|
2451
|
+
EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a) {
|
|
2452
|
+
union {
|
|
1476
2453
|
Packet v;
|
|
1477
2454
|
__UNPACK_TYPE__(Packet) n[8];
|
|
1478
2455
|
} vt;
|
|
1479
2456
|
vt.v = a;
|
|
1480
2457
|
|
|
1481
|
-
EIGEN_ALIGN16 int first_loader[4] = {
|
|
1482
|
-
EIGEN_ALIGN16 int second_loader[4] = {
|
|
1483
|
-
Packet4i first_half
|
|
2458
|
+
EIGEN_ALIGN16 int first_loader[4] = {vt.n[0], vt.n[1], vt.n[2], vt.n[3]};
|
|
2459
|
+
EIGEN_ALIGN16 int second_loader[4] = {vt.n[4], vt.n[5], vt.n[6], vt.n[7]};
|
|
2460
|
+
Packet4i first_half = pload<Packet4i>(first_loader);
|
|
1484
2461
|
Packet4i second_half = pload<Packet4i>(second_loader);
|
|
1485
2462
|
|
|
1486
2463
|
return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_half) + predux(second_half));
|
|
1487
2464
|
}
|
|
1488
2465
|
|
|
1489
|
-
template<>
|
|
1490
|
-
{
|
|
2466
|
+
template <>
|
|
2467
|
+
EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a) {
|
|
1491
2468
|
return predux_size8<Packet8s>(a);
|
|
1492
2469
|
}
|
|
1493
2470
|
|
|
1494
|
-
template<>
|
|
1495
|
-
{
|
|
2471
|
+
template <>
|
|
2472
|
+
EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a) {
|
|
1496
2473
|
return predux_size8<Packet8us>(a);
|
|
1497
2474
|
}
|
|
1498
2475
|
|
|
1499
|
-
template<typename Packet>
|
|
1500
|
-
{
|
|
1501
|
-
union{
|
|
2476
|
+
template <typename Packet>
|
|
2477
|
+
EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a) {
|
|
2478
|
+
union {
|
|
1502
2479
|
Packet v;
|
|
1503
2480
|
__UNPACK_TYPE__(Packet) n[16];
|
|
1504
2481
|
} vt;
|
|
1505
2482
|
vt.v = a;
|
|
1506
2483
|
|
|
1507
|
-
EIGEN_ALIGN16 int first_loader[4] = {
|
|
1508
|
-
EIGEN_ALIGN16 int second_loader[4] = {
|
|
1509
|
-
EIGEN_ALIGN16 int third_loader[4] = {
|
|
1510
|
-
EIGEN_ALIGN16 int fourth_loader[4] = {
|
|
2484
|
+
EIGEN_ALIGN16 int first_loader[4] = {vt.n[0], vt.n[1], vt.n[2], vt.n[3]};
|
|
2485
|
+
EIGEN_ALIGN16 int second_loader[4] = {vt.n[4], vt.n[5], vt.n[6], vt.n[7]};
|
|
2486
|
+
EIGEN_ALIGN16 int third_loader[4] = {vt.n[8], vt.n[9], vt.n[10], vt.n[11]};
|
|
2487
|
+
EIGEN_ALIGN16 int fourth_loader[4] = {vt.n[12], vt.n[13], vt.n[14], vt.n[15]};
|
|
1511
2488
|
|
|
1512
2489
|
Packet4i first_quarter = pload<Packet4i>(first_loader);
|
|
1513
2490
|
Packet4i second_quarter = pload<Packet4i>(second_loader);
|
|
1514
2491
|
Packet4i third_quarter = pload<Packet4i>(third_loader);
|
|
1515
2492
|
Packet4i fourth_quarter = pload<Packet4i>(fourth_loader);
|
|
1516
2493
|
|
|
1517
|
-
return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter)
|
|
1518
|
-
|
|
2494
|
+
return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter) + predux(third_quarter) +
|
|
2495
|
+
predux(fourth_quarter));
|
|
1519
2496
|
}
|
|
1520
2497
|
|
|
1521
|
-
template<>
|
|
1522
|
-
{
|
|
2498
|
+
template <>
|
|
2499
|
+
EIGEN_STRONG_INLINE signed char predux<Packet16c>(const Packet16c& a) {
|
|
1523
2500
|
return predux_size16<Packet16c>(a);
|
|
1524
2501
|
}
|
|
1525
2502
|
|
|
1526
|
-
template<>
|
|
1527
|
-
{
|
|
2503
|
+
template <>
|
|
2504
|
+
EIGEN_STRONG_INLINE unsigned char predux<Packet16uc>(const Packet16uc& a) {
|
|
1528
2505
|
return predux_size16<Packet16uc>(a);
|
|
1529
2506
|
}
|
|
1530
2507
|
|
|
1531
2508
|
// Other reduction functions:
|
|
1532
2509
|
// mul
|
|
1533
|
-
template<>
|
|
1534
|
-
{
|
|
2510
|
+
template <>
|
|
2511
|
+
EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
|
|
1535
2512
|
Packet4f prod;
|
|
1536
2513
|
prod = pmul(a, vec_sld(a, a, 8));
|
|
1537
2514
|
return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
|
|
1538
2515
|
}
|
|
1539
2516
|
|
|
1540
|
-
template<>
|
|
1541
|
-
{
|
|
2517
|
+
template <>
|
|
2518
|
+
EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
|
|
1542
2519
|
EIGEN_ALIGN16 int aux[4];
|
|
1543
2520
|
pstore(aux, a);
|
|
1544
2521
|
return aux[0] * aux[1] * aux[2] * aux[3];
|
|
1545
2522
|
}
|
|
1546
2523
|
|
|
1547
|
-
template<>
|
|
1548
|
-
{
|
|
2524
|
+
template <>
|
|
2525
|
+
EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a) {
|
|
1549
2526
|
Packet8s pair, quad, octo;
|
|
1550
2527
|
|
|
1551
2528
|
pair = vec_mul(a, vec_sld(a, a, 8));
|
|
@@ -1555,8 +2532,8 @@ template<> EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a)
|
|
|
1555
2532
|
return pfirst(octo);
|
|
1556
2533
|
}
|
|
1557
2534
|
|
|
1558
|
-
template<>
|
|
1559
|
-
{
|
|
2535
|
+
template <>
|
|
2536
|
+
EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a) {
|
|
1560
2537
|
Packet8us pair, quad, octo;
|
|
1561
2538
|
|
|
1562
2539
|
pair = vec_mul(a, vec_sld(a, a, 8));
|
|
@@ -1566,17 +2543,16 @@ template<> EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Pa
|
|
|
1566
2543
|
return pfirst(octo);
|
|
1567
2544
|
}
|
|
1568
2545
|
|
|
1569
|
-
template<>
|
|
1570
|
-
{
|
|
2546
|
+
template <>
|
|
2547
|
+
EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) {
|
|
1571
2548
|
float redux_even = predux_mul<Packet4f>(Bf16ToF32Even(a));
|
|
1572
|
-
float redux_odd
|
|
2549
|
+
float redux_odd = predux_mul<Packet4f>(Bf16ToF32Odd(a));
|
|
1573
2550
|
float f32_result = redux_even * redux_odd;
|
|
1574
2551
|
return bfloat16(f32_result);
|
|
1575
2552
|
}
|
|
1576
2553
|
|
|
1577
|
-
|
|
1578
|
-
|
|
1579
|
-
{
|
|
2554
|
+
template <>
|
|
2555
|
+
EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c& a) {
|
|
1580
2556
|
Packet16c pair, quad, octo, result;
|
|
1581
2557
|
|
|
1582
2558
|
pair = vec_mul(a, vec_sld(a, a, 8));
|
|
@@ -1587,8 +2563,8 @@ template<> EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c
|
|
|
1587
2563
|
return pfirst(result);
|
|
1588
2564
|
}
|
|
1589
2565
|
|
|
1590
|
-
template<>
|
|
1591
|
-
{
|
|
2566
|
+
template <>
|
|
2567
|
+
EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet16uc& a) {
|
|
1592
2568
|
Packet16uc pair, quad, octo, result;
|
|
1593
2569
|
|
|
1594
2570
|
pair = vec_mul(a, vec_sld(a, a, 8));
|
|
@@ -1600,66 +2576,64 @@ template<> EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet
|
|
|
1600
2576
|
}
|
|
1601
2577
|
|
|
1602
2578
|
// min
|
|
1603
|
-
template<typename Packet>
|
|
1604
|
-
__UNPACK_TYPE__(Packet) predux_min4(const Packet& a)
|
|
1605
|
-
{
|
|
2579
|
+
template <typename Packet>
|
|
2580
|
+
EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_min4(const Packet& a) {
|
|
1606
2581
|
Packet b, res;
|
|
1607
2582
|
b = vec_min(a, vec_sld(a, a, 8));
|
|
1608
2583
|
res = vec_min(b, vec_sld(b, b, 4));
|
|
1609
2584
|
return pfirst(res);
|
|
1610
2585
|
}
|
|
1611
2586
|
|
|
1612
|
-
|
|
1613
|
-
|
|
1614
|
-
{
|
|
2587
|
+
template <>
|
|
2588
|
+
EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
|
|
1615
2589
|
return predux_min4<Packet4f>(a);
|
|
1616
2590
|
}
|
|
1617
2591
|
|
|
1618
|
-
template<>
|
|
1619
|
-
{
|
|
2592
|
+
template <>
|
|
2593
|
+
EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
|
|
1620
2594
|
return predux_min4<Packet4i>(a);
|
|
1621
2595
|
}
|
|
1622
2596
|
|
|
1623
|
-
template<>
|
|
1624
|
-
{
|
|
2597
|
+
template <>
|
|
2598
|
+
EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) {
|
|
1625
2599
|
float redux_even = predux_min<Packet4f>(Bf16ToF32Even(a));
|
|
1626
|
-
float redux_odd
|
|
2600
|
+
float redux_odd = predux_min<Packet4f>(Bf16ToF32Odd(a));
|
|
1627
2601
|
float f32_result = (std::min)(redux_even, redux_odd);
|
|
1628
2602
|
return bfloat16(f32_result);
|
|
1629
2603
|
}
|
|
1630
2604
|
|
|
1631
|
-
template<>
|
|
1632
|
-
{
|
|
2605
|
+
template <>
|
|
2606
|
+
EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a) {
|
|
1633
2607
|
Packet8s pair, quad, octo;
|
|
1634
|
-
|
|
1635
|
-
//pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
|
|
1636
|
-
pair = vec_min(a, vec_sld(a, a, 8));
|
|
1637
2608
|
|
|
1638
|
-
//
|
|
2609
|
+
// pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
|
|
2610
|
+
pair = vec_min(a, vec_sld(a, a, 8));
|
|
2611
|
+
|
|
2612
|
+
// quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
|
|
1639
2613
|
quad = vec_min(pair, vec_sld(pair, pair, 4));
|
|
1640
2614
|
|
|
1641
|
-
//octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
|
|
2615
|
+
// octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
|
|
1642
2616
|
octo = vec_min(quad, vec_sld(quad, quad, 2));
|
|
1643
2617
|
return pfirst(octo);
|
|
1644
2618
|
}
|
|
1645
2619
|
|
|
1646
|
-
template<>
|
|
1647
|
-
{
|
|
2620
|
+
template <>
|
|
2621
|
+
EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a) {
|
|
1648
2622
|
Packet8us pair, quad, octo;
|
|
1649
|
-
|
|
1650
|
-
//pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
|
|
1651
|
-
pair = vec_min(a, vec_sld(a, a, 8));
|
|
1652
2623
|
|
|
1653
|
-
//
|
|
2624
|
+
// pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
|
|
2625
|
+
pair = vec_min(a, vec_sld(a, a, 8));
|
|
2626
|
+
|
|
2627
|
+
// quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
|
|
1654
2628
|
quad = vec_min(pair, vec_sld(pair, pair, 4));
|
|
1655
2629
|
|
|
1656
|
-
//octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
|
|
2630
|
+
// octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
|
|
1657
2631
|
octo = vec_min(quad, vec_sld(quad, quad, 2));
|
|
1658
2632
|
return pfirst(octo);
|
|
1659
2633
|
}
|
|
1660
2634
|
|
|
1661
|
-
template<>
|
|
1662
|
-
{
|
|
2635
|
+
template <>
|
|
2636
|
+
EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c& a) {
|
|
1663
2637
|
Packet16c pair, quad, octo, result;
|
|
1664
2638
|
|
|
1665
2639
|
pair = vec_min(a, vec_sld(a, a, 8));
|
|
@@ -1670,8 +2644,8 @@ template<> EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c
|
|
|
1670
2644
|
return pfirst(result);
|
|
1671
2645
|
}
|
|
1672
2646
|
|
|
1673
|
-
template<>
|
|
1674
|
-
{
|
|
2647
|
+
template <>
|
|
2648
|
+
EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet16uc& a) {
|
|
1675
2649
|
Packet16uc pair, quad, octo, result;
|
|
1676
2650
|
|
|
1677
2651
|
pair = vec_min(a, vec_sld(a, a, 8));
|
|
@@ -1682,64 +2656,64 @@ template<> EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet
|
|
|
1682
2656
|
return pfirst(result);
|
|
1683
2657
|
}
|
|
1684
2658
|
// max
|
|
1685
|
-
template<typename Packet>
|
|
1686
|
-
{
|
|
2659
|
+
template <typename Packet>
|
|
2660
|
+
EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a) {
|
|
1687
2661
|
Packet b, res;
|
|
1688
2662
|
b = vec_max(a, vec_sld(a, a, 8));
|
|
1689
2663
|
res = vec_max(b, vec_sld(b, b, 4));
|
|
1690
2664
|
return pfirst(res);
|
|
1691
2665
|
}
|
|
1692
2666
|
|
|
1693
|
-
template<>
|
|
1694
|
-
{
|
|
2667
|
+
template <>
|
|
2668
|
+
EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
|
|
1695
2669
|
return predux_max4<Packet4f>(a);
|
|
1696
2670
|
}
|
|
1697
2671
|
|
|
1698
|
-
template<>
|
|
1699
|
-
{
|
|
2672
|
+
template <>
|
|
2673
|
+
EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
|
|
1700
2674
|
return predux_max4<Packet4i>(a);
|
|
1701
2675
|
}
|
|
1702
2676
|
|
|
1703
|
-
template<>
|
|
1704
|
-
{
|
|
2677
|
+
template <>
|
|
2678
|
+
EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) {
|
|
1705
2679
|
float redux_even = predux_max<Packet4f>(Bf16ToF32Even(a));
|
|
1706
|
-
float redux_odd
|
|
2680
|
+
float redux_odd = predux_max<Packet4f>(Bf16ToF32Odd(a));
|
|
1707
2681
|
float f32_result = (std::max)(redux_even, redux_odd);
|
|
1708
2682
|
return bfloat16(f32_result);
|
|
1709
2683
|
}
|
|
1710
2684
|
|
|
1711
|
-
template<>
|
|
1712
|
-
{
|
|
2685
|
+
template <>
|
|
2686
|
+
EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a) {
|
|
1713
2687
|
Packet8s pair, quad, octo;
|
|
1714
|
-
|
|
1715
|
-
//pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
|
|
1716
|
-
pair = vec_max(a, vec_sld(a, a, 8));
|
|
1717
2688
|
|
|
1718
|
-
//
|
|
2689
|
+
// pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
|
|
2690
|
+
pair = vec_max(a, vec_sld(a, a, 8));
|
|
2691
|
+
|
|
2692
|
+
// quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
|
|
1719
2693
|
quad = vec_max(pair, vec_sld(pair, pair, 4));
|
|
1720
2694
|
|
|
1721
|
-
//octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
|
|
2695
|
+
// octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
|
|
1722
2696
|
octo = vec_max(quad, vec_sld(quad, quad, 2));
|
|
1723
2697
|
return pfirst(octo);
|
|
1724
2698
|
}
|
|
1725
2699
|
|
|
1726
|
-
template<>
|
|
1727
|
-
{
|
|
2700
|
+
template <>
|
|
2701
|
+
EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a) {
|
|
1728
2702
|
Packet8us pair, quad, octo;
|
|
1729
|
-
|
|
1730
|
-
//pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
|
|
1731
|
-
pair = vec_max(a, vec_sld(a, a, 8));
|
|
1732
2703
|
|
|
1733
|
-
//
|
|
2704
|
+
// pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
|
|
2705
|
+
pair = vec_max(a, vec_sld(a, a, 8));
|
|
2706
|
+
|
|
2707
|
+
// quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
|
|
1734
2708
|
quad = vec_max(pair, vec_sld(pair, pair, 4));
|
|
1735
2709
|
|
|
1736
|
-
//octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
|
|
2710
|
+
// octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
|
|
1737
2711
|
octo = vec_max(quad, vec_sld(quad, quad, 2));
|
|
1738
2712
|
return pfirst(octo);
|
|
1739
2713
|
}
|
|
1740
2714
|
|
|
1741
|
-
template<>
|
|
1742
|
-
{
|
|
2715
|
+
template <>
|
|
2716
|
+
EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c& a) {
|
|
1743
2717
|
Packet16c pair, quad, octo, result;
|
|
1744
2718
|
|
|
1745
2719
|
pair = vec_max(a, vec_sld(a, a, 8));
|
|
@@ -1750,8 +2724,8 @@ template<> EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c
|
|
|
1750
2724
|
return pfirst(result);
|
|
1751
2725
|
}
|
|
1752
2726
|
|
|
1753
|
-
template<>
|
|
1754
|
-
{
|
|
2727
|
+
template <>
|
|
2728
|
+
EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet16uc& a) {
|
|
1755
2729
|
Packet16uc pair, quad, octo, result;
|
|
1756
2730
|
|
|
1757
2731
|
pair = vec_max(a, vec_sld(a, a, 8));
|
|
@@ -1762,13 +2736,13 @@ template<> EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet
|
|
|
1762
2736
|
return pfirst(result);
|
|
1763
2737
|
}
|
|
1764
2738
|
|
|
1765
|
-
template<>
|
|
1766
|
-
{
|
|
2739
|
+
template <>
|
|
2740
|
+
EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
|
|
1767
2741
|
return vec_any_ne(x, pzero(x));
|
|
1768
2742
|
}
|
|
1769
2743
|
|
|
1770
|
-
template <typename T>
|
|
1771
|
-
ptranpose_common(PacketBlock<T,4>& kernel){
|
|
2744
|
+
template <typename T>
|
|
2745
|
+
EIGEN_DEVICE_FUNC inline void ptranpose_common(PacketBlock<T, 4>& kernel) {
|
|
1772
2746
|
T t0, t1, t2, t3;
|
|
1773
2747
|
t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
|
|
1774
2748
|
t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
|
|
@@ -1780,18 +2754,11 @@ ptranpose_common(PacketBlock<T,4>& kernel){
|
|
|
1780
2754
|
kernel.packet[3] = vec_mergel(t1, t3);
|
|
1781
2755
|
}
|
|
1782
2756
|
|
|
1783
|
-
EIGEN_DEVICE_FUNC inline void
|
|
1784
|
-
ptranspose(PacketBlock<Packet4f,4>& kernel) {
|
|
1785
|
-
ptranpose_common<Packet4f>(kernel);
|
|
1786
|
-
}
|
|
2757
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) { ptranpose_common<Packet4f>(kernel); }
|
|
1787
2758
|
|
|
1788
|
-
EIGEN_DEVICE_FUNC inline void
|
|
1789
|
-
ptranspose(PacketBlock<Packet4i,4>& kernel) {
|
|
1790
|
-
ptranpose_common<Packet4i>(kernel);
|
|
1791
|
-
}
|
|
2759
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) { ptranpose_common<Packet4i>(kernel); }
|
|
1792
2760
|
|
|
1793
|
-
EIGEN_DEVICE_FUNC inline void
|
|
1794
|
-
ptranspose(PacketBlock<Packet8s,4>& kernel) {
|
|
2761
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
|
|
1795
2762
|
Packet8s t0, t1, t2, t3;
|
|
1796
2763
|
t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
|
|
1797
2764
|
t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
|
|
@@ -1803,8 +2770,7 @@ ptranspose(PacketBlock<Packet8s,4>& kernel) {
|
|
|
1803
2770
|
kernel.packet[3] = vec_mergel(t1, t3);
|
|
1804
2771
|
}
|
|
1805
2772
|
|
|
1806
|
-
EIGEN_DEVICE_FUNC inline void
|
|
1807
|
-
ptranspose(PacketBlock<Packet8us,4>& kernel) {
|
|
2773
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
|
|
1808
2774
|
Packet8us t0, t1, t2, t3;
|
|
1809
2775
|
t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
|
|
1810
2776
|
t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
|
|
@@ -1816,9 +2782,7 @@ ptranspose(PacketBlock<Packet8us,4>& kernel) {
|
|
|
1816
2782
|
kernel.packet[3] = vec_mergel(t1, t3);
|
|
1817
2783
|
}
|
|
1818
2784
|
|
|
1819
|
-
|
|
1820
|
-
EIGEN_DEVICE_FUNC inline void
|
|
1821
|
-
ptranspose(PacketBlock<Packet8bf,4>& kernel) {
|
|
2785
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8bf, 4>& kernel) {
|
|
1822
2786
|
Packet8us t0, t1, t2, t3;
|
|
1823
2787
|
|
|
1824
2788
|
t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val);
|
|
@@ -1831,8 +2795,7 @@ ptranspose(PacketBlock<Packet8bf,4>& kernel) {
|
|
|
1831
2795
|
kernel.packet[3] = vec_mergel(t1, t3);
|
|
1832
2796
|
}
|
|
1833
2797
|
|
|
1834
|
-
EIGEN_DEVICE_FUNC inline void
|
|
1835
|
-
ptranspose(PacketBlock<Packet16c,4>& kernel) {
|
|
2798
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
|
|
1836
2799
|
Packet16c t0, t1, t2, t3;
|
|
1837
2800
|
t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
|
|
1838
2801
|
t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
|
|
@@ -1844,9 +2807,7 @@ ptranspose(PacketBlock<Packet16c,4>& kernel) {
|
|
|
1844
2807
|
kernel.packet[3] = vec_mergel(t1, t3);
|
|
1845
2808
|
}
|
|
1846
2809
|
|
|
1847
|
-
|
|
1848
|
-
EIGEN_DEVICE_FUNC inline void
|
|
1849
|
-
ptranspose(PacketBlock<Packet16uc,4>& kernel) {
|
|
2810
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
|
|
1850
2811
|
Packet16uc t0, t1, t2, t3;
|
|
1851
2812
|
t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
|
|
1852
2813
|
t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
|
|
@@ -1858,8 +2819,7 @@ ptranspose(PacketBlock<Packet16uc,4>& kernel) {
|
|
|
1858
2819
|
kernel.packet[3] = vec_mergel(t1, t3);
|
|
1859
2820
|
}
|
|
1860
2821
|
|
|
1861
|
-
EIGEN_DEVICE_FUNC inline void
|
|
1862
|
-
ptranspose(PacketBlock<Packet8s,8>& kernel) {
|
|
2822
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
|
|
1863
2823
|
Packet8s v[8], sum[8];
|
|
1864
2824
|
|
|
1865
2825
|
v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
|
|
@@ -1889,8 +2849,7 @@ ptranspose(PacketBlock<Packet8s,8>& kernel) {
|
|
|
1889
2849
|
kernel.packet[7] = vec_mergel(sum[3], sum[7]);
|
|
1890
2850
|
}
|
|
1891
2851
|
|
|
1892
|
-
EIGEN_DEVICE_FUNC inline void
|
|
1893
|
-
ptranspose(PacketBlock<Packet8us,8>& kernel) {
|
|
2852
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
|
|
1894
2853
|
Packet8us v[8], sum[8];
|
|
1895
2854
|
|
|
1896
2855
|
v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
|
|
@@ -1920,8 +2879,7 @@ ptranspose(PacketBlock<Packet8us,8>& kernel) {
|
|
|
1920
2879
|
kernel.packet[7] = vec_mergel(sum[3], sum[7]);
|
|
1921
2880
|
}
|
|
1922
2881
|
|
|
1923
|
-
EIGEN_DEVICE_FUNC inline void
|
|
1924
|
-
ptranspose(PacketBlock<Packet8bf,8>& kernel) {
|
|
2882
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8bf, 8>& kernel) {
|
|
1925
2883
|
Packet8bf v[8], sum[8];
|
|
1926
2884
|
|
|
1927
2885
|
v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val);
|
|
@@ -1951,8 +2909,7 @@ ptranspose(PacketBlock<Packet8bf,8>& kernel) {
|
|
|
1951
2909
|
kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val);
|
|
1952
2910
|
}
|
|
1953
2911
|
|
|
1954
|
-
EIGEN_DEVICE_FUNC inline void
|
|
1955
|
-
ptranspose(PacketBlock<Packet16c,16>& kernel) {
|
|
2912
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
|
|
1956
2913
|
Packet16c step1[16], step2[16], step3[16];
|
|
1957
2914
|
|
|
1958
2915
|
step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
|
|
@@ -1972,16 +2929,16 @@ ptranspose(PacketBlock<Packet16c,16>& kernel) {
|
|
|
1972
2929
|
step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
|
|
1973
2930
|
step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
|
|
1974
2931
|
|
|
1975
|
-
step2[0]
|
|
1976
|
-
step2[1]
|
|
1977
|
-
step2[2]
|
|
1978
|
-
step2[3]
|
|
1979
|
-
step2[4]
|
|
1980
|
-
step2[5]
|
|
1981
|
-
step2[6]
|
|
1982
|
-
step2[7]
|
|
1983
|
-
step2[8]
|
|
1984
|
-
step2[9]
|
|
2932
|
+
step2[0] = vec_mergeh(step1[0], step1[8]);
|
|
2933
|
+
step2[1] = vec_mergel(step1[0], step1[8]);
|
|
2934
|
+
step2[2] = vec_mergeh(step1[1], step1[9]);
|
|
2935
|
+
step2[3] = vec_mergel(step1[1], step1[9]);
|
|
2936
|
+
step2[4] = vec_mergeh(step1[2], step1[10]);
|
|
2937
|
+
step2[5] = vec_mergel(step1[2], step1[10]);
|
|
2938
|
+
step2[6] = vec_mergeh(step1[3], step1[11]);
|
|
2939
|
+
step2[7] = vec_mergel(step1[3], step1[11]);
|
|
2940
|
+
step2[8] = vec_mergeh(step1[4], step1[12]);
|
|
2941
|
+
step2[9] = vec_mergel(step1[4], step1[12]);
|
|
1985
2942
|
step2[10] = vec_mergeh(step1[5], step1[13]);
|
|
1986
2943
|
step2[11] = vec_mergel(step1[5], step1[13]);
|
|
1987
2944
|
step2[12] = vec_mergeh(step1[6], step1[14]);
|
|
@@ -1989,16 +2946,16 @@ ptranspose(PacketBlock<Packet16c,16>& kernel) {
|
|
|
1989
2946
|
step2[14] = vec_mergeh(step1[7], step1[15]);
|
|
1990
2947
|
step2[15] = vec_mergel(step1[7], step1[15]);
|
|
1991
2948
|
|
|
1992
|
-
step3[0]
|
|
1993
|
-
step3[1]
|
|
1994
|
-
step3[2]
|
|
1995
|
-
step3[3]
|
|
1996
|
-
step3[4]
|
|
1997
|
-
step3[5]
|
|
1998
|
-
step3[6]
|
|
1999
|
-
step3[7]
|
|
2000
|
-
step3[8]
|
|
2001
|
-
step3[9]
|
|
2949
|
+
step3[0] = vec_mergeh(step2[0], step2[8]);
|
|
2950
|
+
step3[1] = vec_mergel(step2[0], step2[8]);
|
|
2951
|
+
step3[2] = vec_mergeh(step2[1], step2[9]);
|
|
2952
|
+
step3[3] = vec_mergel(step2[1], step2[9]);
|
|
2953
|
+
step3[4] = vec_mergeh(step2[2], step2[10]);
|
|
2954
|
+
step3[5] = vec_mergel(step2[2], step2[10]);
|
|
2955
|
+
step3[6] = vec_mergeh(step2[3], step2[11]);
|
|
2956
|
+
step3[7] = vec_mergel(step2[3], step2[11]);
|
|
2957
|
+
step3[8] = vec_mergeh(step2[4], step2[12]);
|
|
2958
|
+
step3[9] = vec_mergel(step2[4], step2[12]);
|
|
2002
2959
|
step3[10] = vec_mergeh(step2[5], step2[13]);
|
|
2003
2960
|
step3[11] = vec_mergel(step2[5], step2[13]);
|
|
2004
2961
|
step3[12] = vec_mergeh(step2[6], step2[14]);
|
|
@@ -2006,16 +2963,16 @@ ptranspose(PacketBlock<Packet16c,16>& kernel) {
|
|
|
2006
2963
|
step3[14] = vec_mergeh(step2[7], step2[15]);
|
|
2007
2964
|
step3[15] = vec_mergel(step2[7], step2[15]);
|
|
2008
2965
|
|
|
2009
|
-
kernel.packet[0]
|
|
2010
|
-
kernel.packet[1]
|
|
2011
|
-
kernel.packet[2]
|
|
2012
|
-
kernel.packet[3]
|
|
2013
|
-
kernel.packet[4]
|
|
2014
|
-
kernel.packet[5]
|
|
2015
|
-
kernel.packet[6]
|
|
2016
|
-
kernel.packet[7]
|
|
2017
|
-
kernel.packet[8]
|
|
2018
|
-
kernel.packet[9]
|
|
2966
|
+
kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
|
|
2967
|
+
kernel.packet[1] = vec_mergel(step3[0], step3[8]);
|
|
2968
|
+
kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
|
|
2969
|
+
kernel.packet[3] = vec_mergel(step3[1], step3[9]);
|
|
2970
|
+
kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
|
|
2971
|
+
kernel.packet[5] = vec_mergel(step3[2], step3[10]);
|
|
2972
|
+
kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
|
|
2973
|
+
kernel.packet[7] = vec_mergel(step3[3], step3[11]);
|
|
2974
|
+
kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
|
|
2975
|
+
kernel.packet[9] = vec_mergel(step3[4], step3[12]);
|
|
2019
2976
|
kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
|
|
2020
2977
|
kernel.packet[11] = vec_mergel(step3[5], step3[13]);
|
|
2021
2978
|
kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
|
|
@@ -2024,8 +2981,7 @@ ptranspose(PacketBlock<Packet16c,16>& kernel) {
|
|
|
2024
2981
|
kernel.packet[15] = vec_mergel(step3[7], step3[15]);
|
|
2025
2982
|
}
|
|
2026
2983
|
|
|
2027
|
-
EIGEN_DEVICE_FUNC inline void
|
|
2028
|
-
ptranspose(PacketBlock<Packet16uc,16>& kernel) {
|
|
2984
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
|
|
2029
2985
|
Packet16uc step1[16], step2[16], step3[16];
|
|
2030
2986
|
|
|
2031
2987
|
step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
|
|
@@ -2045,16 +3001,16 @@ ptranspose(PacketBlock<Packet16uc,16>& kernel) {
|
|
|
2045
3001
|
step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
|
|
2046
3002
|
step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
|
|
2047
3003
|
|
|
2048
|
-
step2[0]
|
|
2049
|
-
step2[1]
|
|
2050
|
-
step2[2]
|
|
2051
|
-
step2[3]
|
|
2052
|
-
step2[4]
|
|
2053
|
-
step2[5]
|
|
2054
|
-
step2[6]
|
|
2055
|
-
step2[7]
|
|
2056
|
-
step2[8]
|
|
2057
|
-
step2[9]
|
|
3004
|
+
step2[0] = vec_mergeh(step1[0], step1[8]);
|
|
3005
|
+
step2[1] = vec_mergel(step1[0], step1[8]);
|
|
3006
|
+
step2[2] = vec_mergeh(step1[1], step1[9]);
|
|
3007
|
+
step2[3] = vec_mergel(step1[1], step1[9]);
|
|
3008
|
+
step2[4] = vec_mergeh(step1[2], step1[10]);
|
|
3009
|
+
step2[5] = vec_mergel(step1[2], step1[10]);
|
|
3010
|
+
step2[6] = vec_mergeh(step1[3], step1[11]);
|
|
3011
|
+
step2[7] = vec_mergel(step1[3], step1[11]);
|
|
3012
|
+
step2[8] = vec_mergeh(step1[4], step1[12]);
|
|
3013
|
+
step2[9] = vec_mergel(step1[4], step1[12]);
|
|
2058
3014
|
step2[10] = vec_mergeh(step1[5], step1[13]);
|
|
2059
3015
|
step2[11] = vec_mergel(step1[5], step1[13]);
|
|
2060
3016
|
step2[12] = vec_mergeh(step1[6], step1[14]);
|
|
@@ -2062,16 +3018,16 @@ ptranspose(PacketBlock<Packet16uc,16>& kernel) {
|
|
|
2062
3018
|
step2[14] = vec_mergeh(step1[7], step1[15]);
|
|
2063
3019
|
step2[15] = vec_mergel(step1[7], step1[15]);
|
|
2064
3020
|
|
|
2065
|
-
step3[0]
|
|
2066
|
-
step3[1]
|
|
2067
|
-
step3[2]
|
|
2068
|
-
step3[3]
|
|
2069
|
-
step3[4]
|
|
2070
|
-
step3[5]
|
|
2071
|
-
step3[6]
|
|
2072
|
-
step3[7]
|
|
2073
|
-
step3[8]
|
|
2074
|
-
step3[9]
|
|
3021
|
+
step3[0] = vec_mergeh(step2[0], step2[8]);
|
|
3022
|
+
step3[1] = vec_mergel(step2[0], step2[8]);
|
|
3023
|
+
step3[2] = vec_mergeh(step2[1], step2[9]);
|
|
3024
|
+
step3[3] = vec_mergel(step2[1], step2[9]);
|
|
3025
|
+
step3[4] = vec_mergeh(step2[2], step2[10]);
|
|
3026
|
+
step3[5] = vec_mergel(step2[2], step2[10]);
|
|
3027
|
+
step3[6] = vec_mergeh(step2[3], step2[11]);
|
|
3028
|
+
step3[7] = vec_mergel(step2[3], step2[11]);
|
|
3029
|
+
step3[8] = vec_mergeh(step2[4], step2[12]);
|
|
3030
|
+
step3[9] = vec_mergel(step2[4], step2[12]);
|
|
2075
3031
|
step3[10] = vec_mergeh(step2[5], step2[13]);
|
|
2076
3032
|
step3[11] = vec_mergel(step2[5], step2[13]);
|
|
2077
3033
|
step3[12] = vec_mergeh(step2[6], step2[14]);
|
|
@@ -2079,16 +3035,16 @@ ptranspose(PacketBlock<Packet16uc,16>& kernel) {
|
|
|
2079
3035
|
step3[14] = vec_mergeh(step2[7], step2[15]);
|
|
2080
3036
|
step3[15] = vec_mergel(step2[7], step2[15]);
|
|
2081
3037
|
|
|
2082
|
-
kernel.packet[0]
|
|
2083
|
-
kernel.packet[1]
|
|
2084
|
-
kernel.packet[2]
|
|
2085
|
-
kernel.packet[3]
|
|
2086
|
-
kernel.packet[4]
|
|
2087
|
-
kernel.packet[5]
|
|
2088
|
-
kernel.packet[6]
|
|
2089
|
-
kernel.packet[7]
|
|
2090
|
-
kernel.packet[8]
|
|
2091
|
-
kernel.packet[9]
|
|
3038
|
+
kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
|
|
3039
|
+
kernel.packet[1] = vec_mergel(step3[0], step3[8]);
|
|
3040
|
+
kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
|
|
3041
|
+
kernel.packet[3] = vec_mergel(step3[1], step3[9]);
|
|
3042
|
+
kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
|
|
3043
|
+
kernel.packet[5] = vec_mergel(step3[2], step3[10]);
|
|
3044
|
+
kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
|
|
3045
|
+
kernel.packet[7] = vec_mergel(step3[3], step3[11]);
|
|
3046
|
+
kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
|
|
3047
|
+
kernel.packet[9] = vec_mergel(step3[4], step3[12]);
|
|
2092
3048
|
kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
|
|
2093
3049
|
kernel.packet[11] = vec_mergel(step3[5], step3[13]);
|
|
2094
3050
|
kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
|
|
@@ -2097,229 +3053,173 @@ ptranspose(PacketBlock<Packet16uc,16>& kernel) {
|
|
|
2097
3053
|
kernel.packet[15] = vec_mergel(step3[7], step3[15]);
|
|
2098
3054
|
}
|
|
2099
3055
|
|
|
2100
|
-
template<typename Packet>
|
|
2101
|
-
Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
|
|
2102
|
-
Packet4ui select = {
|
|
2103
|
-
Packet4ui mask = reinterpret_cast<Packet4ui>(
|
|
3056
|
+
template <typename Packet>
|
|
3057
|
+
EIGEN_STRONG_INLINE Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
|
|
3058
|
+
Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
|
|
3059
|
+
Packet4ui mask = reinterpret_cast<Packet4ui>(pnegate(reinterpret_cast<Packet4i>(select)));
|
|
2104
3060
|
return vec_sel(elsePacket, thenPacket, mask);
|
|
2105
3061
|
}
|
|
2106
3062
|
|
|
2107
|
-
template<>
|
|
3063
|
+
template <>
|
|
3064
|
+
EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
|
|
3065
|
+
const Packet4i& elsePacket) {
|
|
2108
3066
|
return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket);
|
|
2109
3067
|
}
|
|
2110
3068
|
|
|
2111
|
-
template<>
|
|
3069
|
+
template <>
|
|
3070
|
+
EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
|
|
3071
|
+
const Packet4f& elsePacket) {
|
|
2112
3072
|
return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket);
|
|
2113
3073
|
}
|
|
2114
3074
|
|
|
2115
|
-
template<>
|
|
2116
|
-
|
|
2117
|
-
|
|
2118
|
-
Packet8us
|
|
3075
|
+
template <>
|
|
3076
|
+
EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket,
|
|
3077
|
+
const Packet8s& elsePacket) {
|
|
3078
|
+
Packet8us select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
|
|
3079
|
+
ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7]};
|
|
3080
|
+
Packet8us mask = reinterpret_cast<Packet8us>(pnegate(reinterpret_cast<Packet8s>(select)));
|
|
2119
3081
|
Packet8s result = vec_sel(elsePacket, thenPacket, mask);
|
|
2120
3082
|
return result;
|
|
2121
3083
|
}
|
|
2122
3084
|
|
|
2123
|
-
template<>
|
|
2124
|
-
|
|
2125
|
-
|
|
2126
|
-
Packet8us
|
|
3085
|
+
template <>
|
|
3086
|
+
EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket,
|
|
3087
|
+
const Packet8us& elsePacket) {
|
|
3088
|
+
Packet8us select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
|
|
3089
|
+
ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7]};
|
|
3090
|
+
Packet8us mask = reinterpret_cast<Packet8us>(pnegate(reinterpret_cast<Packet8s>(select)));
|
|
2127
3091
|
return vec_sel(elsePacket, thenPacket, mask);
|
|
2128
3092
|
}
|
|
2129
3093
|
|
|
2130
|
-
template<>
|
|
3094
|
+
template <>
|
|
3095
|
+
EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket,
|
|
3096
|
+
const Packet8bf& elsePacket) {
|
|
2131
3097
|
return pblend<Packet8us>(ifPacket, thenPacket, elsePacket);
|
|
2132
3098
|
}
|
|
2133
3099
|
|
|
2134
|
-
template<>
|
|
2135
|
-
|
|
2136
|
-
|
|
2137
|
-
|
|
2138
|
-
ifPacket.select[
|
|
2139
|
-
|
|
2140
|
-
|
|
2141
|
-
|
|
2142
|
-
|
|
2143
|
-
|
|
2144
|
-
template<> EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket, const Packet16uc& elsePacket) {
|
|
2145
|
-
Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
|
|
2146
|
-
ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
|
|
2147
|
-
ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
|
|
2148
|
-
ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
|
|
2149
|
-
|
|
2150
|
-
Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE));
|
|
3100
|
+
template <>
|
|
3101
|
+
EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket,
|
|
3102
|
+
const Packet16c& elsePacket) {
|
|
3103
|
+
Packet16uc select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
|
|
3104
|
+
ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
|
|
3105
|
+
ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
|
|
3106
|
+
ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15]};
|
|
3107
|
+
|
|
3108
|
+
Packet16uc mask = reinterpret_cast<Packet16uc>(pnegate(reinterpret_cast<Packet16c>(select)));
|
|
2151
3109
|
return vec_sel(elsePacket, thenPacket, mask);
|
|
2152
3110
|
}
|
|
2153
3111
|
|
|
2154
3112
|
template <>
|
|
2155
|
-
|
|
2156
|
-
|
|
2157
|
-
|
|
2158
|
-
|
|
2159
|
-
|
|
2160
|
-
|
|
2161
|
-
|
|
2162
|
-
|
|
2163
|
-
|
|
2164
|
-
struct type_casting_traits<int, float> {
|
|
2165
|
-
enum {
|
|
2166
|
-
VectorizedCast = 1,
|
|
2167
|
-
SrcCoeffRatio = 1,
|
|
2168
|
-
TgtCoeffRatio = 1
|
|
2169
|
-
};
|
|
2170
|
-
};
|
|
2171
|
-
|
|
2172
|
-
template <>
|
|
2173
|
-
struct type_casting_traits<bfloat16, unsigned short int> {
|
|
2174
|
-
enum {
|
|
2175
|
-
VectorizedCast = 1,
|
|
2176
|
-
SrcCoeffRatio = 1,
|
|
2177
|
-
TgtCoeffRatio = 1
|
|
2178
|
-
};
|
|
2179
|
-
};
|
|
2180
|
-
|
|
2181
|
-
template <>
|
|
2182
|
-
struct type_casting_traits<unsigned short int, bfloat16> {
|
|
2183
|
-
enum {
|
|
2184
|
-
VectorizedCast = 1,
|
|
2185
|
-
SrcCoeffRatio = 1,
|
|
2186
|
-
TgtCoeffRatio = 1
|
|
2187
|
-
};
|
|
2188
|
-
};
|
|
2189
|
-
|
|
2190
|
-
template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
|
|
2191
|
-
return vec_cts(a,0);
|
|
2192
|
-
}
|
|
2193
|
-
|
|
2194
|
-
template<> EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f, Packet4ui>(const Packet4f& a) {
|
|
2195
|
-
return vec_ctu(a,0);
|
|
2196
|
-
}
|
|
2197
|
-
|
|
2198
|
-
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
|
|
2199
|
-
return vec_ctf(a,0);
|
|
2200
|
-
}
|
|
2201
|
-
|
|
2202
|
-
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui, Packet4f>(const Packet4ui& a) {
|
|
2203
|
-
return vec_ctf(a,0);
|
|
2204
|
-
}
|
|
2205
|
-
|
|
2206
|
-
template<> EIGEN_STRONG_INLINE Packet8us pcast<Packet8bf, Packet8us>(const Packet8bf& a) {
|
|
2207
|
-
Packet4f float_even = Bf16ToF32Even(a);
|
|
2208
|
-
Packet4f float_odd = Bf16ToF32Odd(a);
|
|
2209
|
-
Packet4ui int_even = pcast<Packet4f, Packet4ui>(float_even);
|
|
2210
|
-
Packet4ui int_odd = pcast<Packet4f, Packet4ui>(float_odd);
|
|
2211
|
-
const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF);
|
|
2212
|
-
Packet4ui low_even = pand<Packet4ui>(int_even, p4ui_low_mask);
|
|
2213
|
-
Packet4ui low_odd = pand<Packet4ui>(int_odd, p4ui_low_mask);
|
|
2214
|
-
|
|
2215
|
-
//Check values that are bigger than USHRT_MAX (0xFFFF)
|
|
2216
|
-
Packet4bi overflow_selector;
|
|
2217
|
-
if(vec_any_gt(int_even, p4ui_low_mask)){
|
|
2218
|
-
overflow_selector = vec_cmpgt(int_even, p4ui_low_mask);
|
|
2219
|
-
low_even = vec_sel(low_even, p4ui_low_mask, overflow_selector);
|
|
2220
|
-
}
|
|
2221
|
-
if(vec_any_gt(int_odd, p4ui_low_mask)){
|
|
2222
|
-
overflow_selector = vec_cmpgt(int_odd, p4ui_low_mask);
|
|
2223
|
-
low_odd = vec_sel(low_even, p4ui_low_mask, overflow_selector);
|
|
2224
|
-
}
|
|
2225
|
-
|
|
2226
|
-
low_odd = plogical_shift_left<16>(low_odd);
|
|
2227
|
-
|
|
2228
|
-
Packet4ui int_final = por<Packet4ui>(low_even, low_odd);
|
|
2229
|
-
return reinterpret_cast<Packet8us>(int_final);
|
|
2230
|
-
}
|
|
2231
|
-
|
|
2232
|
-
template<> EIGEN_STRONG_INLINE Packet8bf pcast<Packet8us, Packet8bf>(const Packet8us& a) {
|
|
2233
|
-
//short -> int -> float -> bfloat16
|
|
2234
|
-
const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF);
|
|
2235
|
-
Packet4ui int_cast = reinterpret_cast<Packet4ui>(a);
|
|
2236
|
-
Packet4ui int_even = pand<Packet4ui>(int_cast, p4ui_low_mask);
|
|
2237
|
-
Packet4ui int_odd = plogical_shift_right<16>(int_cast);
|
|
2238
|
-
Packet4f float_even = pcast<Packet4ui, Packet4f>(int_even);
|
|
2239
|
-
Packet4f float_odd = pcast<Packet4ui, Packet4f>(int_odd);
|
|
2240
|
-
return F32ToBf16(float_even, float_odd);
|
|
2241
|
-
}
|
|
2242
|
-
|
|
2243
|
-
|
|
2244
|
-
template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
|
|
2245
|
-
return reinterpret_cast<Packet4i>(a);
|
|
2246
|
-
}
|
|
2247
|
-
|
|
2248
|
-
template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
|
|
2249
|
-
return reinterpret_cast<Packet4f>(a);
|
|
3113
|
+
EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket,
|
|
3114
|
+
const Packet16uc& elsePacket) {
|
|
3115
|
+
Packet16uc select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
|
|
3116
|
+
ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
|
|
3117
|
+
ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
|
|
3118
|
+
ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15]};
|
|
3119
|
+
|
|
3120
|
+
Packet16uc mask = reinterpret_cast<Packet16uc>(pnegate(reinterpret_cast<Packet16c>(select)));
|
|
3121
|
+
return vec_sel(elsePacket, thenPacket, mask);
|
|
2250
3122
|
}
|
|
2251
3123
|
|
|
2252
|
-
|
|
2253
|
-
|
|
2254
3124
|
//---------- double ----------
|
|
2255
|
-
#ifdef
|
|
2256
|
-
typedef __vector double
|
|
2257
|
-
typedef __vector unsigned long long
|
|
2258
|
-
typedef __vector long long
|
|
3125
|
+
#ifdef EIGEN_VECTORIZE_VSX
|
|
3126
|
+
typedef __vector double Packet2d;
|
|
3127
|
+
typedef __vector unsigned long long Packet2ul;
|
|
3128
|
+
typedef __vector long long Packet2l;
|
|
2259
3129
|
#if EIGEN_COMP_CLANG
|
|
2260
|
-
typedef Packet2ul
|
|
3130
|
+
typedef Packet2ul Packet2bl;
|
|
2261
3131
|
#else
|
|
2262
|
-
typedef __vector __bool long
|
|
3132
|
+
typedef __vector __bool long Packet2bl;
|
|
2263
3133
|
#endif
|
|
2264
3134
|
|
|
2265
|
-
static Packet2l
|
|
2266
|
-
static
|
|
2267
|
-
static Packet2ul
|
|
2268
|
-
static
|
|
2269
|
-
static Packet2d
|
|
2270
|
-
static Packet2d
|
|
2271
|
-
|
|
2272
|
-
numext::bit_cast<double>(0x8000000000000000ull) };
|
|
3135
|
+
static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
|
|
3136
|
+
static Packet2ul p2ul_SIGN = {0x8000000000000000ull, 0x8000000000000000ull};
|
|
3137
|
+
static Packet2ul p2ul_PREV0DOT5 = {0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull};
|
|
3138
|
+
static Packet2d p2d_ONE = {1.0, 1.0};
|
|
3139
|
+
static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
|
|
3140
|
+
static Packet2d p2d_MZERO = {numext::bit_cast<double>(0x8000000000000000ull),
|
|
3141
|
+
numext::bit_cast<double>(0x8000000000000000ull)};
|
|
2273
3142
|
|
|
2274
3143
|
#ifdef _BIG_ENDIAN
|
|
2275
|
-
static Packet2d p2d_COUNTDOWN =
|
|
3144
|
+
static Packet2d p2d_COUNTDOWN =
|
|
3145
|
+
reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
|
|
2276
3146
|
#else
|
|
2277
|
-
static Packet2d p2d_COUNTDOWN =
|
|
3147
|
+
static Packet2d p2d_COUNTDOWN =
|
|
3148
|
+
reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
|
|
2278
3149
|
#endif
|
|
2279
3150
|
|
|
2280
|
-
template<int index>
|
|
2281
|
-
{
|
|
3151
|
+
template <int index>
|
|
3152
|
+
Packet2d vec_splat_dbl(Packet2d& a) {
|
|
2282
3153
|
return vec_splat(a, index);
|
|
2283
3154
|
}
|
|
2284
3155
|
|
|
2285
|
-
template<>
|
|
2286
|
-
{
|
|
3156
|
+
template <>
|
|
3157
|
+
struct packet_traits<double> : default_packet_traits {
|
|
2287
3158
|
typedef Packet2d type;
|
|
2288
3159
|
typedef Packet2d half;
|
|
2289
3160
|
enum {
|
|
2290
3161
|
Vectorizable = 1,
|
|
2291
3162
|
AlignedOnScalar = 1,
|
|
2292
|
-
size=2,
|
|
2293
|
-
|
|
2294
|
-
|
|
2295
|
-
|
|
2296
|
-
|
|
2297
|
-
|
|
2298
|
-
|
|
2299
|
-
|
|
2300
|
-
|
|
2301
|
-
|
|
2302
|
-
|
|
2303
|
-
|
|
2304
|
-
|
|
2305
|
-
|
|
3163
|
+
size = 2,
|
|
3164
|
+
|
|
3165
|
+
HasAdd = 1,
|
|
3166
|
+
HasSub = 1,
|
|
3167
|
+
HasMul = 1,
|
|
3168
|
+
HasDiv = 1,
|
|
3169
|
+
HasMin = 1,
|
|
3170
|
+
HasMax = 1,
|
|
3171
|
+
HasAbs = 1,
|
|
3172
|
+
HasSin = EIGEN_FAST_MATH,
|
|
3173
|
+
HasCos = EIGEN_FAST_MATH,
|
|
3174
|
+
HasTanh = EIGEN_FAST_MATH,
|
|
3175
|
+
HasErf = EIGEN_FAST_MATH,
|
|
3176
|
+
HasErfc = EIGEN_FAST_MATH,
|
|
3177
|
+
HasATanh = 1,
|
|
3178
|
+
HasATan = 0,
|
|
3179
|
+
HasLog = 0,
|
|
3180
|
+
HasCmp = 1,
|
|
3181
|
+
HasExp = 1,
|
|
2306
3182
|
HasSqrt = 1,
|
|
3183
|
+
HasCbrt = 1,
|
|
3184
|
+
#if !EIGEN_COMP_CLANG
|
|
2307
3185
|
HasRsqrt = 1,
|
|
2308
|
-
|
|
2309
|
-
|
|
2310
|
-
|
|
2311
|
-
HasRint = 1,
|
|
3186
|
+
#else
|
|
3187
|
+
HasRsqrt = 0,
|
|
3188
|
+
#endif
|
|
2312
3189
|
HasNegate = 1,
|
|
2313
3190
|
HasBlend = 1
|
|
2314
3191
|
};
|
|
2315
3192
|
};
|
|
2316
3193
|
|
|
2317
|
-
template<>
|
|
3194
|
+
template <>
|
|
3195
|
+
struct unpacket_traits<Packet2d> {
|
|
3196
|
+
typedef double type;
|
|
3197
|
+
typedef Packet2l integer_packet;
|
|
3198
|
+
enum {
|
|
3199
|
+
size = 2,
|
|
3200
|
+
alignment = Aligned16,
|
|
3201
|
+
vectorizable = true,
|
|
3202
|
+
masked_load_available = false,
|
|
3203
|
+
masked_store_available = false
|
|
3204
|
+
};
|
|
3205
|
+
typedef Packet2d half;
|
|
3206
|
+
};
|
|
3207
|
+
template <>
|
|
3208
|
+
struct unpacket_traits<Packet2l> {
|
|
3209
|
+
typedef int64_t type;
|
|
3210
|
+
typedef Packet2l half;
|
|
3211
|
+
enum {
|
|
3212
|
+
size = 2,
|
|
3213
|
+
alignment = Aligned16,
|
|
3214
|
+
vectorizable = false,
|
|
3215
|
+
masked_load_available = false,
|
|
3216
|
+
masked_store_available = false
|
|
3217
|
+
};
|
|
3218
|
+
};
|
|
2318
3219
|
|
|
2319
|
-
inline std::ostream
|
|
2320
|
-
{
|
|
3220
|
+
inline std::ostream& operator<<(std::ostream& s, const Packet2l& v) {
|
|
2321
3221
|
union {
|
|
2322
|
-
Packet2l
|
|
3222
|
+
Packet2l v;
|
|
2323
3223
|
int64_t n[2];
|
|
2324
3224
|
} vt;
|
|
2325
3225
|
vt.v = v;
|
|
@@ -2327,10 +3227,9 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
|
|
|
2327
3227
|
return s;
|
|
2328
3228
|
}
|
|
2329
3229
|
|
|
2330
|
-
inline std::ostream
|
|
2331
|
-
{
|
|
3230
|
+
inline std::ostream& operator<<(std::ostream& s, const Packet2d& v) {
|
|
2332
3231
|
union {
|
|
2333
|
-
Packet2d
|
|
3232
|
+
Packet2d v;
|
|
2334
3233
|
double n[2];
|
|
2335
3234
|
} vt;
|
|
2336
3235
|
vt.v = v;
|
|
@@ -2339,204 +3238,322 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
|
|
|
2339
3238
|
}
|
|
2340
3239
|
|
|
2341
3240
|
// Need to define them first or we get specialization after instantiation errors
|
|
2342
|
-
template<>
|
|
2343
|
-
{
|
|
3241
|
+
template <>
|
|
3242
|
+
EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
|
|
2344
3243
|
EIGEN_DEBUG_ALIGNED_LOAD
|
|
2345
|
-
return vec_xl(0, const_cast<double
|
|
3244
|
+
return vec_xl(0, const_cast<double*>(from)); // cast needed by Clang
|
|
3245
|
+
}
|
|
3246
|
+
|
|
3247
|
+
template <>
|
|
3248
|
+
EIGEN_ALWAYS_INLINE Packet2d pload_partial<Packet2d>(const double* from, const Index n, const Index offset) {
|
|
3249
|
+
return pload_partial_common<Packet2d>(from, n, offset);
|
|
2346
3250
|
}
|
|
2347
3251
|
|
|
2348
|
-
template<>
|
|
2349
|
-
{
|
|
3252
|
+
template <>
|
|
3253
|
+
EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
|
|
2350
3254
|
EIGEN_DEBUG_ALIGNED_STORE
|
|
2351
3255
|
vec_xst(from, 0, to);
|
|
2352
3256
|
}
|
|
2353
3257
|
|
|
2354
|
-
template<>
|
|
3258
|
+
template <>
|
|
3259
|
+
EIGEN_ALWAYS_INLINE void pstore_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset) {
|
|
3260
|
+
pstore_partial_common<Packet2d>(to, from, n, offset);
|
|
3261
|
+
}
|
|
3262
|
+
|
|
3263
|
+
template <>
|
|
3264
|
+
EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
|
|
2355
3265
|
Packet2d v = {from, from};
|
|
2356
3266
|
return v;
|
|
2357
3267
|
}
|
|
3268
|
+
template <>
|
|
3269
|
+
EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
|
|
3270
|
+
Packet2l v = {from, from};
|
|
3271
|
+
return v;
|
|
3272
|
+
}
|
|
2358
3273
|
|
|
2359
|
-
template<>
|
|
3274
|
+
template <>
|
|
3275
|
+
EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
|
|
2360
3276
|
Packet2l v = {static_cast<long long>(from), static_cast<long long>(from)};
|
|
2361
3277
|
return reinterpret_cast<Packet2d>(v);
|
|
2362
3278
|
}
|
|
2363
3279
|
|
|
2364
|
-
template<>
|
|
2365
|
-
pbroadcast4<Packet2d>(const double
|
|
2366
|
-
|
|
2367
|
-
|
|
2368
|
-
//This way is faster than vec_splat (at least for doubles in Power 9)
|
|
3280
|
+
template <>
|
|
3281
|
+
EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
|
|
3282
|
+
Packet2d& a3) {
|
|
3283
|
+
// This way is faster than vec_splat (at least for doubles in Power 9)
|
|
2369
3284
|
a0 = pset1<Packet2d>(a[0]);
|
|
2370
3285
|
a1 = pset1<Packet2d>(a[1]);
|
|
2371
3286
|
a2 = pset1<Packet2d>(a[2]);
|
|
2372
3287
|
a3 = pset1<Packet2d>(a[3]);
|
|
2373
3288
|
}
|
|
2374
3289
|
|
|
2375
|
-
template<>
|
|
2376
|
-
{
|
|
2377
|
-
|
|
2378
|
-
|
|
2379
|
-
|
|
2380
|
-
|
|
3290
|
+
template <>
|
|
3291
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
|
|
3292
|
+
return pgather_common<Packet2d>(from, stride);
|
|
3293
|
+
}
|
|
3294
|
+
template <>
|
|
3295
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather_partial<double, Packet2d>(const double* from, Index stride,
|
|
3296
|
+
const Index n) {
|
|
3297
|
+
return pgather_common<Packet2d>(from, stride, n);
|
|
3298
|
+
}
|
|
3299
|
+
template <>
|
|
3300
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
|
|
3301
|
+
pscatter_common<Packet2d>(to, from, stride);
|
|
2381
3302
|
}
|
|
2382
|
-
template<>
|
|
2383
|
-
|
|
2384
|
-
|
|
2385
|
-
|
|
2386
|
-
to[0*stride] = af[0];
|
|
2387
|
-
to[1*stride] = af[1];
|
|
3303
|
+
template <>
|
|
3304
|
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<double, Packet2d>(double* to, const Packet2d& from,
|
|
3305
|
+
Index stride, const Index n) {
|
|
3306
|
+
pscatter_common<Packet2d>(to, from, stride, n);
|
|
2388
3307
|
}
|
|
2389
3308
|
|
|
2390
|
-
template<>
|
|
3309
|
+
template <>
|
|
3310
|
+
EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
|
|
3311
|
+
return pset1<Packet2d>(a) + p2d_COUNTDOWN;
|
|
3312
|
+
}
|
|
2391
3313
|
|
|
2392
|
-
template<>
|
|
3314
|
+
template <>
|
|
3315
|
+
EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
3316
|
+
return a + b;
|
|
3317
|
+
}
|
|
2393
3318
|
|
|
2394
|
-
template<>
|
|
3319
|
+
template <>
|
|
3320
|
+
EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
3321
|
+
return a - b;
|
|
3322
|
+
}
|
|
2395
3323
|
|
|
2396
|
-
template<>
|
|
3324
|
+
template <>
|
|
3325
|
+
EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
|
|
3326
|
+
#ifdef __POWER8_VECTOR__
|
|
3327
|
+
return vec_neg(a);
|
|
3328
|
+
#else
|
|
3329
|
+
return vec_xor(a, p2d_MZERO);
|
|
3330
|
+
#endif
|
|
3331
|
+
}
|
|
2397
3332
|
|
|
2398
|
-
template<>
|
|
3333
|
+
template <>
|
|
3334
|
+
EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
|
|
3335
|
+
return a;
|
|
3336
|
+
}
|
|
2399
3337
|
|
|
2400
|
-
template<>
|
|
2401
|
-
|
|
3338
|
+
template <>
|
|
3339
|
+
EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
3340
|
+
return vec_madd(a, b, p2d_MZERO);
|
|
3341
|
+
}
|
|
3342
|
+
template <>
|
|
3343
|
+
EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
3344
|
+
return vec_div(a, b);
|
|
3345
|
+
}
|
|
2402
3346
|
|
|
2403
3347
|
// for some weird reasons, it has to be overloaded for packet of integers
|
|
2404
|
-
template<>
|
|
3348
|
+
template <>
|
|
3349
|
+
EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
|
|
3350
|
+
return vec_madd(a, b, c);
|
|
3351
|
+
}
|
|
3352
|
+
template <>
|
|
3353
|
+
EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
|
|
3354
|
+
return vec_msub(a, b, c);
|
|
3355
|
+
}
|
|
3356
|
+
template <>
|
|
3357
|
+
EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
|
|
3358
|
+
return vec_nmsub(a, b, c);
|
|
3359
|
+
}
|
|
3360
|
+
template <>
|
|
3361
|
+
EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
|
|
3362
|
+
return vec_nmadd(a, b, c);
|
|
3363
|
+
}
|
|
2405
3364
|
|
|
2406
|
-
template<>
|
|
2407
|
-
{
|
|
3365
|
+
template <>
|
|
3366
|
+
EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
2408
3367
|
// NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
|
|
2409
3368
|
Packet2d ret;
|
|
2410
|
-
__asm__
|
|
3369
|
+
__asm__("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
|
|
2411
3370
|
return ret;
|
|
2412
|
-
|
|
3371
|
+
}
|
|
2413
3372
|
|
|
2414
|
-
template<>
|
|
2415
|
-
{
|
|
3373
|
+
template <>
|
|
3374
|
+
EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
2416
3375
|
// NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
|
|
2417
3376
|
Packet2d ret;
|
|
2418
|
-
__asm__
|
|
3377
|
+
__asm__("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
|
|
2419
3378
|
return ret;
|
|
2420
3379
|
}
|
|
2421
3380
|
|
|
2422
|
-
template<>
|
|
2423
|
-
|
|
2424
|
-
|
|
2425
|
-
|
|
2426
|
-
|
|
2427
|
-
|
|
3381
|
+
template <>
|
|
3382
|
+
EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
|
|
3383
|
+
return reinterpret_cast<Packet2d>(vec_cmple(a, b));
|
|
3384
|
+
}
|
|
3385
|
+
template <>
|
|
3386
|
+
EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
|
|
3387
|
+
return reinterpret_cast<Packet2d>(vec_cmplt(a, b));
|
|
3388
|
+
}
|
|
3389
|
+
template <>
|
|
3390
|
+
EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
|
|
3391
|
+
return reinterpret_cast<Packet2d>(vec_cmpeq(a, b));
|
|
3392
|
+
}
|
|
3393
|
+
template <>
|
|
3394
|
+
#ifdef __POWER8_VECTOR__
|
|
3395
|
+
EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
|
|
3396
|
+
return reinterpret_cast<Packet2l>(vec_cmpeq(a, b));
|
|
3397
|
+
}
|
|
3398
|
+
#else
|
|
3399
|
+
EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
|
|
3400
|
+
Packet4i halves = reinterpret_cast<Packet4i>(vec_cmpeq(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(b)));
|
|
3401
|
+
Packet4i flipped = vec_perm(halves, halves, p16uc_COMPLEX32_REV);
|
|
3402
|
+
return reinterpret_cast<Packet2l>(pand(halves, flipped));
|
|
3403
|
+
}
|
|
3404
|
+
#endif
|
|
3405
|
+
template <>
|
|
3406
|
+
EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
|
|
3407
|
+
Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a, b));
|
|
3408
|
+
return vec_nor(c, c);
|
|
2428
3409
|
}
|
|
2429
3410
|
|
|
2430
|
-
template<>
|
|
3411
|
+
template <>
|
|
3412
|
+
EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
3413
|
+
return vec_and(a, b);
|
|
3414
|
+
}
|
|
2431
3415
|
|
|
2432
|
-
template<>
|
|
3416
|
+
template <>
|
|
3417
|
+
EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
3418
|
+
return vec_or(a, b);
|
|
3419
|
+
}
|
|
2433
3420
|
|
|
2434
|
-
template<>
|
|
3421
|
+
template <>
|
|
3422
|
+
EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
3423
|
+
return vec_xor(a, b);
|
|
3424
|
+
}
|
|
2435
3425
|
|
|
2436
|
-
template<>
|
|
3426
|
+
template <>
|
|
3427
|
+
EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
3428
|
+
return vec_and(a, vec_nor(b, b));
|
|
3429
|
+
}
|
|
2437
3430
|
|
|
2438
|
-
template<>
|
|
2439
|
-
{
|
|
2440
|
-
|
|
2441
|
-
|
|
3431
|
+
template <>
|
|
3432
|
+
EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
|
|
3433
|
+
Packet2d t = vec_add(
|
|
3434
|
+
reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
|
|
3435
|
+
Packet2d res;
|
|
2442
3436
|
|
|
2443
|
-
|
|
2444
|
-
: "=&wa" (res)
|
|
2445
|
-
: "wa" (t));
|
|
3437
|
+
__asm__("xvrdpiz %x0, %x1\n\t" : "=&wa"(res) : "wa"(t));
|
|
2446
3438
|
|
|
2447
|
-
|
|
3439
|
+
return res;
|
|
3440
|
+
}
|
|
3441
|
+
template <>
|
|
3442
|
+
EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
|
|
3443
|
+
return vec_ceil(a);
|
|
3444
|
+
}
|
|
3445
|
+
template <>
|
|
3446
|
+
EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
|
|
3447
|
+
return vec_floor(a);
|
|
3448
|
+
}
|
|
3449
|
+
template <>
|
|
3450
|
+
EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
|
|
3451
|
+
return vec_trunc(a);
|
|
2448
3452
|
}
|
|
2449
|
-
template<>
|
|
2450
|
-
|
|
2451
|
-
|
|
2452
|
-
{
|
|
2453
|
-
Packet2d res;
|
|
3453
|
+
template <>
|
|
3454
|
+
EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
|
|
3455
|
+
Packet2d res;
|
|
2454
3456
|
|
|
2455
|
-
|
|
2456
|
-
: "=&wa" (res)
|
|
2457
|
-
: "wa" (a));
|
|
3457
|
+
__asm__("xvrdpic %x0, %x1\n\t" : "=&wa"(res) : "wa"(a));
|
|
2458
3458
|
|
|
2459
|
-
|
|
3459
|
+
return res;
|
|
2460
3460
|
}
|
|
2461
3461
|
|
|
2462
|
-
template<>
|
|
2463
|
-
{
|
|
3462
|
+
template <>
|
|
3463
|
+
EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
|
|
2464
3464
|
EIGEN_DEBUG_UNALIGNED_LOAD
|
|
2465
3465
|
return vec_xl(0, const_cast<double*>(from));
|
|
2466
3466
|
}
|
|
2467
3467
|
|
|
2468
|
-
template<>
|
|
2469
|
-
{
|
|
3468
|
+
template <>
|
|
3469
|
+
EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n, const Index offset) {
|
|
3470
|
+
return ploadu_partial_common<Packet2d>(from, n, offset);
|
|
3471
|
+
}
|
|
3472
|
+
|
|
3473
|
+
template <>
|
|
3474
|
+
EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
|
|
2470
3475
|
Packet2d p;
|
|
2471
|
-
if((std::ptrdiff_t(from) % 16) == 0)
|
|
2472
|
-
|
|
3476
|
+
if ((std::ptrdiff_t(from) % 16) == 0)
|
|
3477
|
+
p = pload<Packet2d>(from);
|
|
3478
|
+
else
|
|
3479
|
+
p = ploadu<Packet2d>(from);
|
|
2473
3480
|
return vec_splat_dbl<0>(p);
|
|
2474
3481
|
}
|
|
2475
3482
|
|
|
2476
|
-
template<>
|
|
2477
|
-
{
|
|
3483
|
+
template <>
|
|
3484
|
+
EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
|
|
2478
3485
|
EIGEN_DEBUG_UNALIGNED_STORE
|
|
2479
3486
|
vec_xst(from, 0, to);
|
|
2480
3487
|
}
|
|
2481
3488
|
|
|
2482
|
-
template<>
|
|
3489
|
+
template <>
|
|
3490
|
+
EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset) {
|
|
3491
|
+
pstoreu_partial_common<Packet2d>(to, from, n, offset);
|
|
3492
|
+
}
|
|
2483
3493
|
|
|
2484
|
-
template<>
|
|
3494
|
+
template <>
|
|
3495
|
+
EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
|
|
3496
|
+
EIGEN_PPC_PREFETCH(addr);
|
|
3497
|
+
}
|
|
2485
3498
|
|
|
2486
|
-
template<>
|
|
2487
|
-
{
|
|
2488
|
-
|
|
3499
|
+
template <>
|
|
3500
|
+
EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
|
|
3501
|
+
EIGEN_ALIGN16 double x[2];
|
|
3502
|
+
pstore<double>(x, a);
|
|
3503
|
+
return x[0];
|
|
2489
3504
|
}
|
|
2490
|
-
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
|
|
2491
3505
|
|
|
2492
|
-
|
|
2493
|
-
|
|
2494
|
-
|
|
2495
|
-
|
|
2496
|
-
|
|
2497
|
-
|
|
2498
|
-
|
|
2499
|
-
|
|
2500
|
-
#
|
|
2501
|
-
|
|
2502
|
-
|
|
3506
|
+
template <>
|
|
3507
|
+
EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
|
|
3508
|
+
return vec_sld(a, a, 8);
|
|
3509
|
+
}
|
|
3510
|
+
template <>
|
|
3511
|
+
EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
|
|
3512
|
+
return vec_abs(a);
|
|
3513
|
+
}
|
|
3514
|
+
#ifdef __POWER8_VECTOR__
|
|
3515
|
+
template <>
|
|
3516
|
+
EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
|
|
3517
|
+
return (Packet2d)vec_sra((Packet2l)a, vec_splats((unsigned long long)(63)));
|
|
3518
|
+
}
|
|
2503
3519
|
#else
|
|
2504
|
-
|
|
2505
|
-
|
|
2506
|
-
|
|
2507
|
-
|
|
2508
|
-
return l;
|
|
3520
|
+
#ifdef _BIG_ENDIAN
|
|
3521
|
+
static Packet16uc p16uc_DUPSIGN = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
|
|
3522
|
+
#else
|
|
3523
|
+
static Packet16uc p16uc_DUPSIGN = {7, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15};
|
|
2509
3524
|
#endif
|
|
2510
|
-
}
|
|
2511
3525
|
|
|
2512
|
-
template<>
|
|
2513
|
-
|
|
2514
|
-
|
|
2515
|
-
|
|
2516
|
-
Packet2d d = { static_cast<double>(tmp[0]),
|
|
2517
|
-
static_cast<double>(tmp[1]) };
|
|
2518
|
-
return d;
|
|
3526
|
+
template <>
|
|
3527
|
+
EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
|
|
3528
|
+
Packet16c tmp = vec_sra(reinterpret_cast<Packet16c>(a), vec_splats((unsigned char)(7)));
|
|
3529
|
+
return reinterpret_cast<Packet2d>(vec_perm(tmp, tmp, p16uc_DUPSIGN));
|
|
2519
3530
|
}
|
|
3531
|
+
#endif
|
|
3532
|
+
|
|
3533
|
+
template <>
|
|
3534
|
+
inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x);
|
|
2520
3535
|
|
|
3536
|
+
template <>
|
|
3537
|
+
inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x);
|
|
2521
3538
|
|
|
2522
3539
|
// Packet2l shifts.
|
|
2523
|
-
// For POWER8 we simply use vec_sr/l.
|
|
3540
|
+
// For POWER8 we simply use vec_sr/l.
|
|
2524
3541
|
//
|
|
2525
3542
|
// Things are more complicated for POWER7. There is actually a
|
|
2526
3543
|
// vec_xxsxdi intrinsic but it is not supported by some gcc versions.
|
|
2527
3544
|
// So we need to shift by N % 32 and rearrange bytes.
|
|
2528
3545
|
#ifdef __POWER8_VECTOR__
|
|
2529
3546
|
|
|
2530
|
-
template<int N>
|
|
3547
|
+
template <int N>
|
|
2531
3548
|
EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
|
|
2532
|
-
const Packet2ul shift = {
|
|
2533
|
-
return vec_sl(a, shift);
|
|
3549
|
+
const Packet2ul shift = {N, N};
|
|
3550
|
+
return vec_sl(a, shift);
|
|
2534
3551
|
}
|
|
2535
3552
|
|
|
2536
|
-
template<int N>
|
|
3553
|
+
template <int N>
|
|
2537
3554
|
EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
|
|
2538
|
-
const Packet2ul shift = {
|
|
2539
|
-
return vec_sr(a, shift);
|
|
3555
|
+
const Packet2ul shift = {N, N};
|
|
3556
|
+
return vec_sr(a, shift);
|
|
2540
3557
|
}
|
|
2541
3558
|
|
|
2542
3559
|
#else
|
|
@@ -2544,34 +3561,32 @@ EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
|
|
|
2544
3561
|
// Shifts [A, B, C, D] to [B, 0, D, 0].
|
|
2545
3562
|
// Used to implement left shifts for Packet2l.
|
|
2546
3563
|
EIGEN_ALWAYS_INLINE Packet4i shift_even_left(const Packet4i& a) {
|
|
2547
|
-
static const Packet16uc perm = {
|
|
2548
|
-
|
|
2549
|
-
|
|
2550
|
-
|
|
2551
|
-
|
|
2552
|
-
|
|
2553
|
-
|
|
2554
|
-
#endif
|
|
3564
|
+
static const Packet16uc perm = {0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
|
|
3565
|
+
0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b};
|
|
3566
|
+
#ifdef _BIG_ENDIAN
|
|
3567
|
+
return vec_perm(p4i_ZERO, a, perm);
|
|
3568
|
+
#else
|
|
3569
|
+
return vec_perm(a, p4i_ZERO, perm);
|
|
3570
|
+
#endif
|
|
2555
3571
|
}
|
|
2556
3572
|
|
|
2557
3573
|
// Shifts [A, B, C, D] to [0, A, 0, C].
|
|
2558
3574
|
// Used to implement right shifts for Packet2l.
|
|
2559
3575
|
EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(const Packet4i& a) {
|
|
2560
|
-
static const Packet16uc perm = {
|
|
2561
|
-
|
|
2562
|
-
|
|
2563
|
-
|
|
2564
|
-
|
|
2565
|
-
|
|
2566
|
-
|
|
2567
|
-
#endif
|
|
3576
|
+
static const Packet16uc perm = {0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
|
|
3577
|
+
0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b};
|
|
3578
|
+
#ifdef _BIG_ENDIAN
|
|
3579
|
+
return vec_perm(p4i_ZERO, a, perm);
|
|
3580
|
+
#else
|
|
3581
|
+
return vec_perm(a, p4i_ZERO, perm);
|
|
3582
|
+
#endif
|
|
2568
3583
|
}
|
|
2569
3584
|
|
|
2570
|
-
template<int N, typename EnableIf = void>
|
|
3585
|
+
template <int N, typename EnableIf = void>
|
|
2571
3586
|
struct plogical_shift_left_impl;
|
|
2572
3587
|
|
|
2573
|
-
template<int N>
|
|
2574
|
-
struct plogical_shift_left_impl<N,
|
|
3588
|
+
template <int N>
|
|
3589
|
+
struct plogical_shift_left_impl<N, std::enable_if_t<(N < 32) && (N >= 0)> > {
|
|
2575
3590
|
static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
|
|
2576
3591
|
static const unsigned n = static_cast<unsigned>(N);
|
|
2577
3592
|
const Packet4ui shift = {n, n, n, n};
|
|
@@ -2584,8 +3599,8 @@ struct plogical_shift_left_impl<N, typename enable_if<(N < 32) && (N >= 0)>::typ
|
|
|
2584
3599
|
}
|
|
2585
3600
|
};
|
|
2586
3601
|
|
|
2587
|
-
template<int N>
|
|
2588
|
-
struct plogical_shift_left_impl<N,
|
|
3602
|
+
template <int N>
|
|
3603
|
+
struct plogical_shift_left_impl<N, std::enable_if_t<(N >= 32)> > {
|
|
2589
3604
|
static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
|
|
2590
3605
|
static const unsigned m = static_cast<unsigned>(N - 32);
|
|
2591
3606
|
const Packet4ui shift = {m, m, m, m};
|
|
@@ -2594,16 +3609,16 @@ struct plogical_shift_left_impl<N, typename enable_if<(N >= 32)>::type> {
|
|
|
2594
3609
|
}
|
|
2595
3610
|
};
|
|
2596
3611
|
|
|
2597
|
-
template<int N>
|
|
3612
|
+
template <int N>
|
|
2598
3613
|
EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
|
|
2599
|
-
return plogical_shift_left_impl<N>::run(a);
|
|
3614
|
+
return plogical_shift_left_impl<N>::run(a);
|
|
2600
3615
|
}
|
|
2601
3616
|
|
|
2602
|
-
template<int N, typename EnableIf = void>
|
|
3617
|
+
template <int N, typename EnableIf = void>
|
|
2603
3618
|
struct plogical_shift_right_impl;
|
|
2604
3619
|
|
|
2605
|
-
template<int N>
|
|
2606
|
-
struct plogical_shift_right_impl<N,
|
|
3620
|
+
template <int N>
|
|
3621
|
+
struct plogical_shift_right_impl<N, std::enable_if_t<(N < 32) && (N >= 0)> > {
|
|
2607
3622
|
static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
|
|
2608
3623
|
static const unsigned n = static_cast<unsigned>(N);
|
|
2609
3624
|
const Packet4ui shift = {n, n, n, n};
|
|
@@ -2616,8 +3631,8 @@ struct plogical_shift_right_impl<N, typename enable_if<(N < 32) && (N >= 0)>::ty
|
|
|
2616
3631
|
}
|
|
2617
3632
|
};
|
|
2618
3633
|
|
|
2619
|
-
template<int N>
|
|
2620
|
-
struct plogical_shift_right_impl<N,
|
|
3634
|
+
template <int N>
|
|
3635
|
+
struct plogical_shift_right_impl<N, std::enable_if_t<(N >= 32)> > {
|
|
2621
3636
|
static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
|
|
2622
3637
|
static const unsigned m = static_cast<unsigned>(N - 32);
|
|
2623
3638
|
const Packet4ui shift = {m, m, m, m};
|
|
@@ -2626,86 +3641,89 @@ struct plogical_shift_right_impl<N, typename enable_if<(N >= 32)>::type> {
|
|
|
2626
3641
|
}
|
|
2627
3642
|
};
|
|
2628
3643
|
|
|
2629
|
-
template<int N>
|
|
3644
|
+
template <int N>
|
|
2630
3645
|
EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
|
|
2631
|
-
return plogical_shift_right_impl<N>::run(a);
|
|
3646
|
+
return plogical_shift_right_impl<N>::run(a);
|
|
2632
3647
|
}
|
|
2633
3648
|
#endif
|
|
2634
3649
|
|
|
2635
|
-
template<>
|
|
3650
|
+
template <>
|
|
3651
|
+
EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
|
|
2636
3652
|
// Clamp exponent to [-2099, 2099]
|
|
2637
3653
|
const Packet2d max_exponent = pset1<Packet2d>(2099.0);
|
|
2638
3654
|
const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
|
|
2639
3655
|
|
|
2640
3656
|
// Split 2^e into four factors and multiply:
|
|
2641
|
-
const Packet2l
|
|
3657
|
+
const Packet2l bias = {1023, 1023};
|
|
2642
3658
|
Packet2l b = plogical_shift_right<2>(e); // floor(e/4)
|
|
2643
3659
|
Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
|
|
2644
|
-
Packet2d out = pmul(pmul(pmul(a, c), c), c);
|
|
2645
|
-
b = psub(psub(psub(e, b), b), b);
|
|
2646
|
-
c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
|
|
2647
|
-
out = pmul(out, c);
|
|
3660
|
+
Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
|
|
3661
|
+
b = psub(psub(psub(e, b), b), b); // e - 3b
|
|
3662
|
+
c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); // 2^(e - 3b)
|
|
3663
|
+
out = pmul(out, c); // a * 2^e
|
|
2648
3664
|
return out;
|
|
2649
3665
|
}
|
|
2650
3666
|
|
|
2651
|
-
|
|
2652
3667
|
// Extract exponent without existence of Packet2l.
|
|
2653
|
-
template<>
|
|
2654
|
-
EIGEN_STRONG_INLINE
|
|
2655
|
-
Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
|
|
3668
|
+
template <>
|
|
3669
|
+
EIGEN_STRONG_INLINE Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
|
|
2656
3670
|
return pcast<Packet2l, Packet2d>(plogical_shift_right<52>(reinterpret_cast<Packet2l>(pabs(a))));
|
|
2657
3671
|
}
|
|
2658
3672
|
|
|
2659
|
-
template<>
|
|
3673
|
+
template <>
|
|
3674
|
+
EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
|
|
2660
3675
|
return pfrexp_generic(a, exponent);
|
|
2661
3676
|
}
|
|
2662
3677
|
|
|
2663
|
-
template<>
|
|
2664
|
-
{
|
|
3678
|
+
template <>
|
|
3679
|
+
EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
|
|
2665
3680
|
Packet2d b, sum;
|
|
2666
|
-
b
|
|
3681
|
+
b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
|
|
2667
3682
|
sum = a + b;
|
|
2668
3683
|
return pfirst<Packet2d>(sum);
|
|
2669
3684
|
}
|
|
2670
3685
|
|
|
2671
3686
|
// Other reduction functions:
|
|
2672
3687
|
// mul
|
|
2673
|
-
template<>
|
|
2674
|
-
{
|
|
2675
|
-
return pfirst(
|
|
3688
|
+
template <>
|
|
3689
|
+
EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
|
|
3690
|
+
return pfirst(
|
|
3691
|
+
pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
|
|
2676
3692
|
}
|
|
2677
3693
|
|
|
2678
3694
|
// min
|
|
2679
|
-
template<>
|
|
2680
|
-
{
|
|
2681
|
-
return pfirst(
|
|
3695
|
+
template <>
|
|
3696
|
+
EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
|
|
3697
|
+
return pfirst(
|
|
3698
|
+
pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
|
|
2682
3699
|
}
|
|
2683
3700
|
|
|
2684
3701
|
// max
|
|
2685
|
-
template<>
|
|
2686
|
-
{
|
|
2687
|
-
return pfirst(
|
|
3702
|
+
template <>
|
|
3703
|
+
EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
|
|
3704
|
+
return pfirst(
|
|
3705
|
+
pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
|
|
2688
3706
|
}
|
|
2689
3707
|
|
|
2690
|
-
EIGEN_DEVICE_FUNC inline void
|
|
2691
|
-
ptranspose(PacketBlock<Packet2d,2>& kernel) {
|
|
3708
|
+
// In-place transpose of a 2x2 block of doubles stored as two Packet2d rows.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
  // Snapshot both rows first so that writing packet[0] cannot affect the second merge.
  const Packet2d row0 = kernel.packet[0];
  const Packet2d row1 = kernel.packet[1];
  kernel.packet[0] = vec_mergeh(row0, row1);
  kernel.packet[1] = vec_mergel(row0, row1);
}
|
|
2698
3715
|
|
|
2699
|
-
template<>
|
|
2700
|
-
|
|
2701
|
-
|
|
3716
|
+
template <>
|
|
3717
|
+
EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
|
|
3718
|
+
const Packet2d& elsePacket) {
|
|
3719
|
+
Packet2l select = {ifPacket.select[0], ifPacket.select[1]};
|
|
3720
|
+
Packet2ul mask = reinterpret_cast<Packet2ul>(pnegate(reinterpret_cast<Packet2l>(select)));
|
|
2702
3721
|
return vec_sel(elsePacket, thenPacket, mask);
|
|
2703
3722
|
}
|
|
2704
3723
|
|
|
3724
|
+
#endif // EIGEN_VECTORIZE_VSX
|
|
3725
|
+
} // end namespace internal
|
|
2705
3726
|
|
|
2706
|
-
|
|
2707
|
-
} // end namespace internal
|
|
2708
|
-
|
|
2709
|
-
} // end namespace Eigen
|
|
3727
|
+
} // end namespace Eigen
|
|
2710
3728
|
|
|
2711
|
-
#endif
|
|
3729
|
+
#endif // EIGEN_PACKET_MATH_ALTIVEC_H
|