@smake/eigen 1.0.2 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/eigen/Eigen/AccelerateSupport +52 -0
- package/eigen/Eigen/Cholesky +18 -21
- package/eigen/Eigen/CholmodSupport +28 -28
- package/eigen/Eigen/Core +235 -326
- package/eigen/Eigen/Eigenvalues +16 -14
- package/eigen/Eigen/Geometry +21 -24
- package/eigen/Eigen/Householder +9 -8
- package/eigen/Eigen/IterativeLinearSolvers +8 -4
- package/eigen/Eigen/Jacobi +14 -14
- package/eigen/Eigen/KLUSupport +43 -0
- package/eigen/Eigen/LU +16 -20
- package/eigen/Eigen/MetisSupport +12 -12
- package/eigen/Eigen/OrderingMethods +54 -54
- package/eigen/Eigen/PaStiXSupport +23 -20
- package/eigen/Eigen/PardisoSupport +17 -14
- package/eigen/Eigen/QR +18 -21
- package/eigen/Eigen/QtAlignedMalloc +5 -13
- package/eigen/Eigen/SPQRSupport +21 -14
- package/eigen/Eigen/SVD +23 -18
- package/eigen/Eigen/Sparse +1 -4
- package/eigen/Eigen/SparseCholesky +18 -23
- package/eigen/Eigen/SparseCore +18 -17
- package/eigen/Eigen/SparseLU +12 -8
- package/eigen/Eigen/SparseQR +16 -14
- package/eigen/Eigen/StdDeque +5 -2
- package/eigen/Eigen/StdList +5 -2
- package/eigen/Eigen/StdVector +5 -2
- package/eigen/Eigen/SuperLUSupport +30 -24
- package/eigen/Eigen/ThreadPool +80 -0
- package/eigen/Eigen/UmfPackSupport +19 -17
- package/eigen/Eigen/Version +14 -0
- package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
- package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/LDLT.h +377 -401
- package/eigen/Eigen/src/Cholesky/LLT.h +332 -360
- package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
- package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +620 -521
- package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/ArithmeticSequence.h +239 -0
- package/eigen/Eigen/src/Core/Array.h +341 -294
- package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
- package/eigen/Eigen/src/Core/ArrayWrapper.h +127 -171
- package/eigen/Eigen/src/Core/Assign.h +30 -40
- package/eigen/Eigen/src/Core/AssignEvaluator.h +711 -589
- package/eigen/Eigen/src/Core/Assign_MKL.h +130 -125
- package/eigen/Eigen/src/Core/BandMatrix.h +268 -283
- package/eigen/Eigen/src/Core/Block.h +375 -398
- package/eigen/Eigen/src/Core/CommaInitializer.h +86 -97
- package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
- package/eigen/Eigen/src/Core/CoreEvaluators.h +1356 -1026
- package/eigen/Eigen/src/Core/CoreIterators.h +73 -59
- package/eigen/Eigen/src/Core/CwiseBinaryOp.h +114 -132
- package/eigen/Eigen/src/Core/CwiseNullaryOp.h +726 -617
- package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
- package/eigen/Eigen/src/Core/CwiseUnaryOp.h +56 -68
- package/eigen/Eigen/src/Core/CwiseUnaryView.h +132 -95
- package/eigen/Eigen/src/Core/DenseBase.h +632 -571
- package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -624
- package/eigen/Eigen/src/Core/DenseStorage.h +512 -509
- package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
- package/eigen/Eigen/src/Core/Diagonal.h +169 -210
- package/eigen/Eigen/src/Core/DiagonalMatrix.h +351 -274
- package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
- package/eigen/Eigen/src/Core/Dot.h +172 -222
- package/eigen/Eigen/src/Core/EigenBase.h +75 -85
- package/eigen/Eigen/src/Core/Fill.h +138 -0
- package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
- package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -109
- package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
- package/eigen/Eigen/src/Core/GeneralProduct.h +327 -263
- package/eigen/Eigen/src/Core/GenericPacketMath.h +1472 -360
- package/eigen/Eigen/src/Core/GlobalFunctions.h +194 -151
- package/eigen/Eigen/src/Core/IO.h +147 -139
- package/eigen/Eigen/src/Core/IndexedView.h +321 -0
- package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
- package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/Inverse.h +56 -66
- package/eigen/Eigen/src/Core/Map.h +124 -142
- package/eigen/Eigen/src/Core/MapBase.h +256 -281
- package/eigen/Eigen/src/Core/MathFunctions.h +1620 -938
- package/eigen/Eigen/src/Core/MathFunctionsImpl.h +233 -71
- package/eigen/Eigen/src/Core/Matrix.h +491 -416
- package/eigen/Eigen/src/Core/MatrixBase.h +468 -453
- package/eigen/Eigen/src/Core/NestByValue.h +66 -85
- package/eigen/Eigen/src/Core/NoAlias.h +79 -85
- package/eigen/Eigen/src/Core/NumTraits.h +235 -148
- package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +253 -0
- package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
- package/eigen/Eigen/src/Core/PlainObjectBase.h +871 -894
- package/eigen/Eigen/src/Core/Product.h +260 -139
- package/eigen/Eigen/src/Core/ProductEvaluators.h +863 -714
- package/eigen/Eigen/src/Core/Random.h +161 -136
- package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
- package/eigen/Eigen/src/Core/RealView.h +250 -0
- package/eigen/Eigen/src/Core/Redux.h +366 -336
- package/eigen/Eigen/src/Core/Ref.h +308 -209
- package/eigen/Eigen/src/Core/Replicate.h +94 -106
- package/eigen/Eigen/src/Core/Reshaped.h +398 -0
- package/eigen/Eigen/src/Core/ReturnByValue.h +49 -55
- package/eigen/Eigen/src/Core/Reverse.h +136 -145
- package/eigen/Eigen/src/Core/Select.h +70 -140
- package/eigen/Eigen/src/Core/SelfAdjointView.h +262 -285
- package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
- package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
- package/eigen/Eigen/src/Core/Solve.h +97 -111
- package/eigen/Eigen/src/Core/SolveTriangular.h +131 -129
- package/eigen/Eigen/src/Core/SolverBase.h +138 -101
- package/eigen/Eigen/src/Core/StableNorm.h +156 -160
- package/eigen/Eigen/src/Core/StlIterators.h +619 -0
- package/eigen/Eigen/src/Core/Stride.h +91 -88
- package/eigen/Eigen/src/Core/Swap.h +70 -38
- package/eigen/Eigen/src/Core/Transpose.h +295 -273
- package/eigen/Eigen/src/Core/Transpositions.h +272 -317
- package/eigen/Eigen/src/Core/TriangularMatrix.h +670 -755
- package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
- package/eigen/Eigen/src/Core/VectorwiseOp.h +668 -630
- package/eigen/Eigen/src/Core/Visitor.h +480 -216
- package/eigen/Eigen/src/Core/arch/AVX/Complex.h +407 -293
- package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +79 -388
- package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2935 -491
- package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
- package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +279 -22
- package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +472 -0
- package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +85 -333
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +2490 -649
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
- package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +277 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +521 -298
- package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +39 -280
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +3686 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +205 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +901 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +3391 -723
- package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
- package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +866 -0
- package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +113 -14
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +2634 -0
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +227 -0
- package/eigen/Eigen/src/Core/arch/Default/Half.h +1091 -0
- package/eigen/Eigen/src/Core/arch/Default/Settings.h +11 -13
- package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
- package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +104 -0
- package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1712 -0
- package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
- package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +77 -0
- package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
- package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
- package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
- package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
- package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
- package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
- package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
- package/eigen/Eigen/src/Core/arch/MSA/Complex.h +620 -0
- package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +379 -0
- package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1237 -0
- package/eigen/Eigen/src/Core/arch/NEON/Complex.h +531 -289
- package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +243 -0
- package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +50 -73
- package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +5915 -579
- package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1642 -0
- package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
- package/eigen/Eigen/src/Core/arch/SSE/Complex.h +366 -334
- package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +40 -514
- package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +2164 -675
- package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
- package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +188 -35
- package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +48 -0
- package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +674 -0
- package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +52 -0
- package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +227 -0
- package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +303 -0
- package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +576 -0
- package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +83 -0
- package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +434 -261
- package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +160 -53
- package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +1073 -605
- package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +123 -117
- package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +594 -322
- package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +204 -118
- package/eigen/Eigen/src/Core/functors/StlFunctors.h +110 -97
- package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
- package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1158 -530
- package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2329 -1333
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +328 -364
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +191 -178
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +85 -82
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +396 -542
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
- package/eigen/Eigen/src/Core/products/Parallelizer.h +208 -92
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +331 -375
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +139 -146
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
- package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
- package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -46
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -275
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
- package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +70 -93
- package/eigen/Eigen/src/Core/util/Assert.h +158 -0
- package/eigen/Eigen/src/Core/util/BlasUtil.h +413 -290
- package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +543 -0
- package/eigen/Eigen/src/Core/util/Constants.h +314 -263
- package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -78
- package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
- package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +450 -224
- package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
- package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
- package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +487 -0
- package/eigen/Eigen/src/Core/util/IntegralConstant.h +279 -0
- package/eigen/Eigen/src/Core/util/MKL_support.h +39 -30
- package/eigen/Eigen/src/Core/util/Macros.h +939 -646
- package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
- package/eigen/Eigen/src/Core/util/Memory.h +1042 -650
- package/eigen/Eigen/src/Core/util/Meta.h +618 -426
- package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
- package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
- package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
- package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
- package/eigen/Eigen/src/Core/util/StaticAssert.h +51 -164
- package/eigen/Eigen/src/Core/util/SymbolicIndex.h +445 -0
- package/eigen/Eigen/src/Core/util/XprHelper.h +793 -538
- package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
- package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
- package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
- package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
- package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
- package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +91 -107
- package/eigen/Eigen/src/Eigenvalues/RealQZ.h +539 -606
- package/eigen/Eigen/src/Eigenvalues/RealSchur.h +348 -382
- package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +579 -600
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
- package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +434 -461
- package/eigen/Eigen/src/Geometry/AlignedBox.h +307 -214
- package/eigen/Eigen/src/Geometry/AngleAxis.h +135 -137
- package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
- package/eigen/Eigen/src/Geometry/Homogeneous.h +289 -333
- package/eigen/Eigen/src/Geometry/Hyperplane.h +152 -161
- package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -145
- package/eigen/Eigen/src/Geometry/ParametrizedLine.h +141 -104
- package/eigen/Eigen/src/Geometry/Quaternion.h +595 -497
- package/eigen/Eigen/src/Geometry/Rotation2D.h +110 -108
- package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
- package/eigen/Eigen/src/Geometry/Scaling.h +115 -90
- package/eigen/Eigen/src/Geometry/Transform.h +896 -953
- package/eigen/Eigen/src/Geometry/Translation.h +100 -98
- package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
- package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +154 -0
- package/eigen/Eigen/src/Householder/BlockHouseholder.h +54 -42
- package/eigen/Eigen/src/Householder/Householder.h +104 -122
- package/eigen/Eigen/src/Householder/HouseholderSequence.h +416 -382
- package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +153 -166
- package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +127 -138
- package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +95 -124
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +269 -267
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +246 -259
- package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +218 -217
- package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +80 -103
- package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +59 -63
- package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Jacobi/Jacobi.h +256 -291
- package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/KLUSupport/KLUSupport.h +339 -0
- package/eigen/Eigen/src/LU/Determinant.h +60 -63
- package/eigen/Eigen/src/LU/FullPivLU.h +561 -626
- package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/LU/InverseImpl.h +213 -275
- package/eigen/Eigen/src/LU/PartialPivLU.h +407 -435
- package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
- package/eigen/Eigen/src/LU/arch/InverseSize4.h +353 -0
- package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
- package/eigen/Eigen/src/OrderingMethods/Amd.h +250 -282
- package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +950 -1103
- package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/OrderingMethods/Ordering.h +111 -122
- package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
- package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -429
- package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +494 -473
- package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
- package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +223 -137
- package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +517 -460
- package/eigen/Eigen/src/QR/HouseholderQR.h +412 -278
- package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
- package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +263 -261
- package/eigen/Eigen/src/SVD/BDCSVD.h +872 -679
- package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
- package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SVD/JacobiSVD.h +585 -543
- package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
- package/eigen/Eigen/src/SVD/SVDBase.h +281 -160
- package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +202 -237
- package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +769 -590
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +318 -129
- package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
- package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -236
- package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +140 -184
- package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCore/SparseAssign.h +174 -111
- package/eigen/Eigen/src/SparseCore/SparseBlock.h +408 -477
- package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
- package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +531 -280
- package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +559 -347
- package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
- package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +185 -191
- package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
- package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
- package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
- package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
- package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1614 -1142
- package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -357
- package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
- package/eigen/Eigen/src/SparseCore/SparseProduct.h +100 -91
- package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
- package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
- package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +371 -414
- package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
- package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
- package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
- package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
- package/eigen/Eigen/src/SparseCore/SparseUtil.h +146 -115
- package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
- package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
- package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
- package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseLU/SparseLU.h +814 -618
- package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
- package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
- package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
- package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +273 -255
- package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
- package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
- package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +90 -101
- package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
- package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
- package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +125 -133
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
- package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
- package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
- package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
- package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseQR/SparseQR.h +451 -490
- package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -105
- package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
- package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
- package/eigen/Eigen/src/StlSupport/details.h +48 -50
- package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -732
- package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
- package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
- package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
- package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
- package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
- package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
- package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
- package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
- package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
- package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
- package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
- package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
- package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +480 -380
- package/eigen/Eigen/src/misc/Image.h +41 -43
- package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/misc/Kernel.h +39 -41
- package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
- package/eigen/Eigen/src/misc/blas.h +83 -426
- package/eigen/Eigen/src/misc/lapacke.h +9976 -16182
- package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
- package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
- package/eigen/Eigen/src/plugins/BlockMethods.inc +1370 -0
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
- package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.inc +167 -0
- package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
- package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
- package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
- package/lib/LibEigen.d.ts +4 -0
- package/lib/LibEigen.js +14 -0
- package/lib/index.d.ts +1 -1
- package/lib/index.js +7 -3
- package/package.json +2 -10
- package/eigen/Eigen/CMakeLists.txt +0 -19
- package/eigen/Eigen/src/Core/BooleanRedux.h +0 -164
- package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -103
- package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
- package/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +0 -91
- package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
- package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
- package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
- package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
- package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
- package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
- package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
- package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
- package/eigen/Eigen/src/misc/lapack.h +0 -152
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -332
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -552
- package/eigen/Eigen/src/plugins/BlockMethods.h +0 -1058
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
- package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -163
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -85
- package/lib/eigen.d.ts +0 -2
- package/lib/eigen.js +0 -15
|
@@ -0,0 +1,2866 @@
|
|
|
1
|
+
// This file is part of Eigen, a lightweight C++ template library
|
|
2
|
+
// for linear algebra.
|
|
3
|
+
//
|
|
4
|
+
// Copyright (C) 2023 Zang Ruochen <zangruochen@loongson.cn>
|
|
5
|
+
// Copyright (C) 2024 XiWei Gu <guxiwei-hf@loongson.cn>
|
|
6
|
+
//
|
|
7
|
+
// This Source Code Form is subject to the terms of the Mozilla
|
|
8
|
+
// Public License v. 2.0. If a copy of the MPL was not distributed
|
|
9
|
+
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
10
|
+
|
|
11
|
+
#ifndef EIGEN_PACKET_MATH_LSX_H
|
|
12
|
+
#define EIGEN_PACKET_MATH_LSX_H
|
|
13
|
+
|
|
14
|
+
// IWYU pragma: private
|
|
15
|
+
#include "../../InternalHeaderCheck.h"
|
|
16
|
+
|
|
17
|
+
namespace Eigen {
|
|
18
|
+
|
|
19
|
+
namespace internal {
|
|
20
|
+
|
|
21
|
+
// Fallback value for EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD; a definition made
// before this header is reached takes precedence.
#if !defined(EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD)
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif
|
|
24
|
+
|
|
25
|
+
// Default register count: set to 32 only when nothing defined it earlier and
// EIGEN_ARCH_LOONGARCH64 evaluates to non-zero. The inner test stays nested so
// it is only evaluated when the outer one passes.
#if !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
#if EIGEN_ARCH_LOONGARCH64
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif  // EIGEN_ARCH_LOONGARCH64
#endif  // !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
|
|
30
|
+
|
|
31
|
+
// Defines the (valueless) EIGEN_HAS_SINGLE_INSTRUCTION_MADD flag unless it is
// already defined.
#if !defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
|
|
34
|
+
|
|
35
|
+
typedef __m128 Packet4f;
|
|
36
|
+
typedef __m128d Packet2d;
|
|
37
|
+
|
|
38
|
+
typedef eigen_packet_wrapper<__m128i, 0> Packet16c;
|
|
39
|
+
typedef eigen_packet_wrapper<__m128i, 1> Packet8s;
|
|
40
|
+
typedef eigen_packet_wrapper<__m128i, 2> Packet4i;
|
|
41
|
+
typedef eigen_packet_wrapper<__m128i, 3> Packet2l;
|
|
42
|
+
typedef eigen_packet_wrapper<__m128i, 4> Packet16uc;
|
|
43
|
+
typedef eigen_packet_wrapper<__m128i, 5> Packet8us;
|
|
44
|
+
typedef eigen_packet_wrapper<__m128i, 6> Packet4ui;
|
|
45
|
+
typedef eigen_packet_wrapper<__m128i, 7> Packet2ul;
|
|
46
|
+
|
|
47
|
+
template <>
|
|
48
|
+
struct is_arithmetic<__m128> {
|
|
49
|
+
enum { value = true };
|
|
50
|
+
};
|
|
51
|
+
template <>
|
|
52
|
+
struct is_arithmetic<__m128i> {
|
|
53
|
+
enum { value = true };
|
|
54
|
+
};
|
|
55
|
+
template <>
|
|
56
|
+
struct is_arithmetic<__m128d> {
|
|
57
|
+
enum { value = true };
|
|
58
|
+
};
|
|
59
|
+
template <>
|
|
60
|
+
struct is_arithmetic<Packet16c> {
|
|
61
|
+
enum { value = true };
|
|
62
|
+
};
|
|
63
|
+
template <>
|
|
64
|
+
struct is_arithmetic<Packet8s> {
|
|
65
|
+
enum { value = true };
|
|
66
|
+
};
|
|
67
|
+
template <>
|
|
68
|
+
struct is_arithmetic<Packet4i> {
|
|
69
|
+
enum { value = true };
|
|
70
|
+
};
|
|
71
|
+
template <>
|
|
72
|
+
struct is_arithmetic<Packet2l> {
|
|
73
|
+
enum { value = true };
|
|
74
|
+
};
|
|
75
|
+
template <>
|
|
76
|
+
struct is_arithmetic<Packet16uc> {
|
|
77
|
+
enum { value = false };
|
|
78
|
+
};
|
|
79
|
+
template <>
|
|
80
|
+
struct is_arithmetic<Packet8us> {
|
|
81
|
+
enum { value = false };
|
|
82
|
+
};
|
|
83
|
+
template <>
|
|
84
|
+
struct is_arithmetic<Packet4ui> {
|
|
85
|
+
enum { value = false };
|
|
86
|
+
};
|
|
87
|
+
template <>
|
|
88
|
+
struct is_arithmetic<Packet2ul> {
|
|
89
|
+
enum { value = false };
|
|
90
|
+
};
|
|
91
|
+
|
|
92
|
+
// Builds a Packet4f whose lanes 0..3 are a, b, c, d: the scalars are staged in
// a local array and loaded with __lsx_vld at offset 0.
EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) {
  float lanes[4] = {a, b, c, d};
  return (Packet4f)__lsx_vld(lanes, 0);
}
|
|
96
|
+
|
|
97
|
+
// Permutes the four float lanes of `m`. `mask` packs four 2-bit source indices:
// result lane i is m[(mask >> (2 * i)) & 3].
EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask) {
  const float* src = reinterpret_cast<const float*>(&m);
  const int i0 = mask & 3;
  const int i1 = (mask >> 2) & 3;
  const int i2 = (mask >> 4) & 3;
  const int i3 = (mask >> 6) & 3;
  return make_packet4f(src[i0], src[i1], src[i2], src[i3]);
}
|
|
103
|
+
|
|
104
|
+
template <bool interleave>
|
|
105
|
+
EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f& m, const Packet4f& n, int mask) {
|
|
106
|
+
const float* a = reinterpret_cast<const float*>(&m);
|
|
107
|
+
const float* b = reinterpret_cast<const float*>(&n);
|
|
108
|
+
Packet4f res =
|
|
109
|
+
make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
|
|
110
|
+
return res;
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
template <>
|
|
114
|
+
EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f& m, const Packet4f& n, int mask) {
|
|
115
|
+
const float* a = reinterpret_cast<const float*>(&m);
|
|
116
|
+
const float* b = reinterpret_cast<const float*>(&n);
|
|
117
|
+
Packet4f res =
|
|
118
|
+
make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
|
|
119
|
+
return res;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
// Packs four lane indices into one shuffle mask: p occupies bits 0-1,
// q bits 2-3, r bits 4-5 and s bits 6-7. Arguments are not range-checked.
EIGEN_STRONG_INLINE static int eigen_lsx_shuffle_mask(int p, int q, int r, int s) {
  const int high = (s << 6) | (r << 4);
  const int low = (q << 2) | p;
  return high | low;
}
|
|
125
|
+
|
|
126
|
+
// Single-source swizzle: returns {a[p], a[q], a[r], a[s]}.
EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s) {
  const int mask = eigen_lsx_shuffle_mask(p, q, r, s);
  return shuffle1(a, mask);
}
|
|
129
|
+
// Two-source swizzle via the non-interleaving shuffle2: returns
// {a[p], a[q], b[r], b[s]}.
EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s) {
  const int mask = eigen_lsx_shuffle_mask(p, q, r, s);
  return shuffle2<false>(a, b, mask);
}
|
|
132
|
+
// Joins the low halves of both inputs: returns {a[0], a[1], b[0], b[1]}.
EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) {
  const int mask = eigen_lsx_shuffle_mask(0, 1, 0, 1);
  return shuffle2<false>(a, b, mask);
}
|
|
135
|
+
// Joins the high halves of both inputs with `b` first: the operands are passed
// to shuffle2 in swapped order, so the result is {b[2], b[3], a[2], a[3]}.
EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) {
  const int mask = eigen_lsx_shuffle_mask(2, 3, 2, 3);
  return shuffle2<false>(b, a, mask);
}
|
|
138
|
+
// Interleaves the low halves of both inputs: returns {a[0], b[0], a[1], b[1]}.
EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) {
  const int mask = eigen_lsx_shuffle_mask(0, 0, 1, 1);
  return shuffle2<true>(a, b, mask);
}
|
|
141
|
+
// Interleaves the high halves of both inputs: returns {a[2], b[2], a[3], b[3]}.
EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) {
  const int mask = eigen_lsx_shuffle_mask(2, 2, 3, 3);
  return shuffle2<true>(a, b, mask);
}
|
|
144
|
+
|
|
145
|
+
// Builds a Packet2d whose lanes 0 and 1 are a and b: the scalars are staged in
// a local array and loaded with __lsx_vld at offset 0.
EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) {
  double lanes[2] = {a, b};
  return (Packet2d)__lsx_vld(lanes, 0);
}
|
|
149
|
+
|
|
150
|
+
// Two-lane shuffle: result lane 0 is m[mask bit 0], result lane 1 is n[mask bit 1].
EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask) {
  const double* lhs = reinterpret_cast<const double*>(&m);
  const double* rhs = reinterpret_cast<const double*>(&n);
  const int from_m = mask & 1;
  const int from_n = (mask >> 1) & 1;
  return make_packet2d(lhs[from_m], rhs[from_n]);
}
|
|
156
|
+
|
|
157
|
+
// Thin alias for shuffle(a, b, mask); see shuffle for how the mask bits select lanes.
EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask) {
  const Packet2d picked = shuffle(a, b, mask);
  return picked;
}
|
|
160
|
+
// Pairs the low lanes: with mask 0, shuffle returns {a[0], b[0]}.
EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) {
  const int low_lanes = 0;
  return shuffle(a, b, low_lanes);
}
|
|
161
|
+
// Pairs the high lanes: with mask 3 (both bits set), shuffle returns {a[1], b[1]}.
EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) {
  const int high_lanes = 3;
  return shuffle(a, b, high_lanes);
}
|
|
162
|
+
|
|
163
|
+
template <>
|
|
164
|
+
struct packet_traits<int8_t> : default_packet_traits {
|
|
165
|
+
typedef Packet16c type;
|
|
166
|
+
typedef Packet16c half;
|
|
167
|
+
enum {
|
|
168
|
+
Vectorizable = 1,
|
|
169
|
+
AlignedOnScalar = 1,
|
|
170
|
+
size = 16,
|
|
171
|
+
|
|
172
|
+
HasAbs2 = 0,
|
|
173
|
+
HasSetLinear = 0,
|
|
174
|
+
HasCmp = 1,
|
|
175
|
+
HasBlend = 0
|
|
176
|
+
};
|
|
177
|
+
};
|
|
178
|
+
|
|
179
|
+
template <>
|
|
180
|
+
struct packet_traits<int16_t> : default_packet_traits {
|
|
181
|
+
typedef Packet8s type;
|
|
182
|
+
typedef Packet8s half;
|
|
183
|
+
enum {
|
|
184
|
+
Vectorizable = 1,
|
|
185
|
+
AlignedOnScalar = 1,
|
|
186
|
+
size = 8,
|
|
187
|
+
|
|
188
|
+
HasAbs2 = 0,
|
|
189
|
+
HasSetLinear = 0,
|
|
190
|
+
HasCmp = 1,
|
|
191
|
+
HasDiv = 1,
|
|
192
|
+
HasBlend = 0
|
|
193
|
+
};
|
|
194
|
+
};
|
|
195
|
+
|
|
196
|
+
template <>
|
|
197
|
+
struct packet_traits<int32_t> : default_packet_traits {
|
|
198
|
+
typedef Packet4i type;
|
|
199
|
+
typedef Packet4i half;
|
|
200
|
+
enum {
|
|
201
|
+
Vectorizable = 1,
|
|
202
|
+
AlignedOnScalar = 1,
|
|
203
|
+
size = 4,
|
|
204
|
+
|
|
205
|
+
HasAbs2 = 0,
|
|
206
|
+
HasSetLinear = 0,
|
|
207
|
+
HasCmp = 1,
|
|
208
|
+
HasDiv = 1,
|
|
209
|
+
HasBlend = 0
|
|
210
|
+
};
|
|
211
|
+
};
|
|
212
|
+
|
|
213
|
+
template <>
|
|
214
|
+
struct packet_traits<int64_t> : default_packet_traits {
|
|
215
|
+
typedef Packet2l type;
|
|
216
|
+
typedef Packet2l half;
|
|
217
|
+
enum {
|
|
218
|
+
Vectorizable = 1,
|
|
219
|
+
AlignedOnScalar = 1,
|
|
220
|
+
size = 2,
|
|
221
|
+
|
|
222
|
+
HasAbs2 = 0,
|
|
223
|
+
HasSetLinear = 0,
|
|
224
|
+
HasCmp = 1,
|
|
225
|
+
HasDiv = 1,
|
|
226
|
+
HasBlend = 0
|
|
227
|
+
};
|
|
228
|
+
};
|
|
229
|
+
|
|
230
|
+
template <>
|
|
231
|
+
struct packet_traits<uint8_t> : default_packet_traits {
|
|
232
|
+
typedef Packet16uc type;
|
|
233
|
+
typedef Packet16uc half;
|
|
234
|
+
enum {
|
|
235
|
+
Vectorizable = 1,
|
|
236
|
+
AlignedOnScalar = 1,
|
|
237
|
+
size = 16,
|
|
238
|
+
|
|
239
|
+
HasAbs2 = 0,
|
|
240
|
+
HasSetLinear = 0,
|
|
241
|
+
HasNegate = 0,
|
|
242
|
+
HasCmp = 1,
|
|
243
|
+
HasBlend = 0
|
|
244
|
+
};
|
|
245
|
+
};
|
|
246
|
+
|
|
247
|
+
template <>
|
|
248
|
+
struct packet_traits<uint16_t> : default_packet_traits {
|
|
249
|
+
typedef Packet8us type;
|
|
250
|
+
typedef Packet8us half;
|
|
251
|
+
enum {
|
|
252
|
+
Vectorizable = 1,
|
|
253
|
+
AlignedOnScalar = 1,
|
|
254
|
+
size = 8,
|
|
255
|
+
|
|
256
|
+
HasAbs2 = 0,
|
|
257
|
+
HasSetLinear = 0,
|
|
258
|
+
HasNegate = 0,
|
|
259
|
+
HasCmp = 1,
|
|
260
|
+
HasDiv = 1,
|
|
261
|
+
HasBlend = 0
|
|
262
|
+
};
|
|
263
|
+
};
|
|
264
|
+
|
|
265
|
+
template <>
|
|
266
|
+
struct packet_traits<uint32_t> : default_packet_traits {
|
|
267
|
+
typedef Packet4ui type;
|
|
268
|
+
typedef Packet4ui half;
|
|
269
|
+
enum {
|
|
270
|
+
Vectorizable = 1,
|
|
271
|
+
AlignedOnScalar = 1,
|
|
272
|
+
size = 4,
|
|
273
|
+
|
|
274
|
+
HasAbs2 = 0,
|
|
275
|
+
HasSetLinear = 0,
|
|
276
|
+
HasNegate = 0,
|
|
277
|
+
HasCmp = 1,
|
|
278
|
+
HasDiv = 1,
|
|
279
|
+
HasBlend = 0
|
|
280
|
+
};
|
|
281
|
+
};
|
|
282
|
+
|
|
283
|
+
template <>
|
|
284
|
+
struct packet_traits<uint64_t> : default_packet_traits {
|
|
285
|
+
typedef Packet2ul type;
|
|
286
|
+
typedef Packet2ul half;
|
|
287
|
+
enum {
|
|
288
|
+
Vectorizable = 1,
|
|
289
|
+
AlignedOnScalar = 1,
|
|
290
|
+
size = 2,
|
|
291
|
+
|
|
292
|
+
HasAbs2 = 0,
|
|
293
|
+
HasSetLinear = 0,
|
|
294
|
+
HasNegate = 0,
|
|
295
|
+
HasCmp = 1,
|
|
296
|
+
HasDiv = 1,
|
|
297
|
+
HasBlend = 0
|
|
298
|
+
};
|
|
299
|
+
};
|
|
300
|
+
|
|
301
|
+
template <>
|
|
302
|
+
struct packet_traits<float> : default_packet_traits {
|
|
303
|
+
typedef Packet4f type;
|
|
304
|
+
typedef Packet4f half;
|
|
305
|
+
enum {
|
|
306
|
+
Vectorizable = 1,
|
|
307
|
+
AlignedOnScalar = 1,
|
|
308
|
+
size = 4,
|
|
309
|
+
|
|
310
|
+
HasAbs2 = 0,
|
|
311
|
+
HasSetLinear = 0,
|
|
312
|
+
HasBlend = 0,
|
|
313
|
+
HasSign = 0,
|
|
314
|
+
HasDiv = 1,
|
|
315
|
+
HasExp = 1,
|
|
316
|
+
HasSqrt = 1,
|
|
317
|
+
HasLog = 1,
|
|
318
|
+
HasRsqrt = 1
|
|
319
|
+
};
|
|
320
|
+
};
|
|
321
|
+
|
|
322
|
+
template <>
|
|
323
|
+
struct packet_traits<double> : default_packet_traits {
|
|
324
|
+
typedef Packet2d type;
|
|
325
|
+
typedef Packet2d half;
|
|
326
|
+
enum {
|
|
327
|
+
Vectorizable = 1,
|
|
328
|
+
AlignedOnScalar = 1,
|
|
329
|
+
size = 2,
|
|
330
|
+
|
|
331
|
+
HasAbs2 = 0,
|
|
332
|
+
HasSetLinear = 0,
|
|
333
|
+
HasBlend = 0,
|
|
334
|
+
HasSign = 0,
|
|
335
|
+
HasDiv = 1,
|
|
336
|
+
HasSqrt = 1,
|
|
337
|
+
HasLog = 1,
|
|
338
|
+
HasRsqrt = 1
|
|
339
|
+
};
|
|
340
|
+
};
|
|
341
|
+
|
|
342
|
+
template <>
|
|
343
|
+
struct unpacket_traits<Packet16c> {
|
|
344
|
+
typedef int8_t type;
|
|
345
|
+
typedef Packet16c half;
|
|
346
|
+
enum {
|
|
347
|
+
size = 16,
|
|
348
|
+
alignment = Aligned16,
|
|
349
|
+
vectorizable = true,
|
|
350
|
+
masked_load_available = false,
|
|
351
|
+
masked_store_available = false
|
|
352
|
+
};
|
|
353
|
+
};
|
|
354
|
+
template <>
|
|
355
|
+
struct unpacket_traits<Packet8s> {
|
|
356
|
+
typedef int16_t type;
|
|
357
|
+
typedef Packet8s half;
|
|
358
|
+
enum {
|
|
359
|
+
size = 8,
|
|
360
|
+
alignment = Aligned16,
|
|
361
|
+
vectorizable = true,
|
|
362
|
+
masked_load_available = false,
|
|
363
|
+
masked_store_available = false
|
|
364
|
+
};
|
|
365
|
+
};
|
|
366
|
+
template <>
|
|
367
|
+
struct unpacket_traits<Packet4i> {
|
|
368
|
+
typedef int32_t type;
|
|
369
|
+
typedef Packet4i half;
|
|
370
|
+
enum {
|
|
371
|
+
size = 4,
|
|
372
|
+
alignment = Aligned16,
|
|
373
|
+
vectorizable = true,
|
|
374
|
+
masked_load_available = false,
|
|
375
|
+
masked_store_available = false
|
|
376
|
+
};
|
|
377
|
+
};
|
|
378
|
+
template <>
|
|
379
|
+
struct unpacket_traits<Packet2l> {
|
|
380
|
+
typedef int64_t type;
|
|
381
|
+
typedef Packet2l half;
|
|
382
|
+
enum {
|
|
383
|
+
size = 2,
|
|
384
|
+
alignment = Aligned16,
|
|
385
|
+
vectorizable = true,
|
|
386
|
+
masked_load_available = false,
|
|
387
|
+
masked_store_available = false
|
|
388
|
+
};
|
|
389
|
+
};
|
|
390
|
+
template <>
|
|
391
|
+
struct unpacket_traits<Packet16uc> {
|
|
392
|
+
typedef uint8_t type;
|
|
393
|
+
typedef Packet16uc half;
|
|
394
|
+
enum {
|
|
395
|
+
size = 16,
|
|
396
|
+
alignment = Aligned16,
|
|
397
|
+
vectorizable = true,
|
|
398
|
+
masked_load_available = false,
|
|
399
|
+
masked_store_available = false
|
|
400
|
+
};
|
|
401
|
+
};
|
|
402
|
+
template <>
|
|
403
|
+
struct unpacket_traits<Packet8us> {
|
|
404
|
+
typedef uint16_t type;
|
|
405
|
+
typedef Packet8us half;
|
|
406
|
+
enum {
|
|
407
|
+
size = 8,
|
|
408
|
+
alignment = Aligned16,
|
|
409
|
+
vectorizable = true,
|
|
410
|
+
masked_load_available = false,
|
|
411
|
+
masked_store_available = false
|
|
412
|
+
};
|
|
413
|
+
};
|
|
414
|
+
template <>
|
|
415
|
+
struct unpacket_traits<Packet4ui> {
|
|
416
|
+
typedef uint32_t type;
|
|
417
|
+
typedef Packet4ui half;
|
|
418
|
+
enum {
|
|
419
|
+
size = 4,
|
|
420
|
+
alignment = Aligned16,
|
|
421
|
+
vectorizable = true,
|
|
422
|
+
masked_load_available = false,
|
|
423
|
+
masked_store_available = false
|
|
424
|
+
};
|
|
425
|
+
};
|
|
426
|
+
template <>
|
|
427
|
+
struct unpacket_traits<Packet2ul> {
|
|
428
|
+
typedef uint64_t type;
|
|
429
|
+
typedef Packet2ul half;
|
|
430
|
+
enum {
|
|
431
|
+
size = 2,
|
|
432
|
+
alignment = Aligned16,
|
|
433
|
+
vectorizable = true,
|
|
434
|
+
masked_load_available = false,
|
|
435
|
+
masked_store_available = false
|
|
436
|
+
};
|
|
437
|
+
};
|
|
438
|
+
template <>
|
|
439
|
+
struct unpacket_traits<Packet4f> {
|
|
440
|
+
typedef float type;
|
|
441
|
+
typedef Packet4f half;
|
|
442
|
+
typedef Packet4i integer_packet;
|
|
443
|
+
enum {
|
|
444
|
+
size = 4,
|
|
445
|
+
alignment = Aligned16,
|
|
446
|
+
vectorizable = true,
|
|
447
|
+
masked_load_available = false,
|
|
448
|
+
masked_store_available = false
|
|
449
|
+
};
|
|
450
|
+
};
|
|
451
|
+
template <>
|
|
452
|
+
struct unpacket_traits<Packet2d> {
|
|
453
|
+
typedef double type;
|
|
454
|
+
typedef Packet2d half;
|
|
455
|
+
typedef Packet2l integer_packet;
|
|
456
|
+
enum {
|
|
457
|
+
size = 2,
|
|
458
|
+
alignment = Aligned16,
|
|
459
|
+
vectorizable = true,
|
|
460
|
+
masked_load_available = false,
|
|
461
|
+
masked_store_available = false
|
|
462
|
+
};
|
|
463
|
+
};
|
|
464
|
+
|
|
465
|
+
template <>
|
|
466
|
+
EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t& from) {
|
|
467
|
+
return __lsx_vreplgr2vr_b(from);
|
|
468
|
+
}
|
|
469
|
+
template <>
|
|
470
|
+
EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const int16_t& from) {
|
|
471
|
+
return __lsx_vreplgr2vr_h(from);
|
|
472
|
+
}
|
|
473
|
+
template <>
|
|
474
|
+
EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
|
|
475
|
+
return __lsx_vreplgr2vr_w(from);
|
|
476
|
+
}
|
|
477
|
+
template <>
|
|
478
|
+
EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
|
|
479
|
+
return __lsx_vreplgr2vr_d(from);
|
|
480
|
+
}
|
|
481
|
+
template <>
|
|
482
|
+
EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t& from) {
|
|
483
|
+
return __lsx_vreplgr2vr_b(from);
|
|
484
|
+
}
|
|
485
|
+
template <>
|
|
486
|
+
EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const uint16_t& from) {
|
|
487
|
+
return __lsx_vreplgr2vr_h(from);
|
|
488
|
+
}
|
|
489
|
+
template <>
|
|
490
|
+
EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) {
|
|
491
|
+
return __lsx_vreplgr2vr_w(from);
|
|
492
|
+
}
|
|
493
|
+
template <>
|
|
494
|
+
EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(const uint64_t& from) {
|
|
495
|
+
return __lsx_vreplgr2vr_d(from);
|
|
496
|
+
}
|
|
497
|
+
template <>
|
|
498
|
+
EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
|
|
499
|
+
Packet4f v = {from, from, from, from};
|
|
500
|
+
return v;
|
|
501
|
+
}
|
|
502
|
+
template <>
|
|
503
|
+
EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
|
|
504
|
+
Packet2d v = {from, from};
|
|
505
|
+
return v;
|
|
506
|
+
}
|
|
507
|
+
|
|
508
|
+
template <>
|
|
509
|
+
EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
|
|
510
|
+
return reinterpret_cast<__m128>((__m128i)pset1<Packet4ui>(from));
|
|
511
|
+
}
|
|
512
|
+
template <>
|
|
513
|
+
EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
|
|
514
|
+
return reinterpret_cast<__m128d>((__m128i)pset1<Packet2ul>(from));
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
template <>
|
|
518
|
+
EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const int8_t& a) {
|
|
519
|
+
const int8_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
|
520
|
+
return __lsx_vadd_b(pset1<Packet16c>(a), __lsx_vld(countdown, 0));
|
|
521
|
+
}
|
|
522
|
+
template <>
|
|
523
|
+
EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const int16_t& a) {
|
|
524
|
+
const int16_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7};
|
|
525
|
+
return __lsx_vadd_h(pset1<Packet8s>(a), __lsx_vld(countdown, 0));
|
|
526
|
+
}
|
|
527
|
+
template <>
|
|
528
|
+
EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
|
|
529
|
+
const int32_t countdown[] = {0, 1, 2, 3};
|
|
530
|
+
return __lsx_vadd_w(pset1<Packet4i>(a), __lsx_vld(countdown, 0));
|
|
531
|
+
}
|
|
532
|
+
template <>
|
|
533
|
+
EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a) {
|
|
534
|
+
const int64_t countdown[] = {0, 1};
|
|
535
|
+
return __lsx_vadd_d(pset1<Packet2l>(a), __lsx_vld(countdown, 0));
|
|
536
|
+
}
|
|
537
|
+
template <>
|
|
538
|
+
EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a) {
|
|
539
|
+
const uint8_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
|
540
|
+
return __lsx_vadd_b(pset1<Packet16uc>(a), __lsx_vld(countdown, 0));
|
|
541
|
+
}
|
|
542
|
+
template <>
|
|
543
|
+
EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const uint16_t& a) {
|
|
544
|
+
const uint16_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7};
|
|
545
|
+
return __lsx_vadd_h(pset1<Packet8us>(a), __lsx_vld(countdown, 0));
|
|
546
|
+
}
|
|
547
|
+
template <>
|
|
548
|
+
EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) {
|
|
549
|
+
const uint32_t countdown[] = {0, 1, 2, 3};
|
|
550
|
+
return __lsx_vadd_w(pset1<Packet4ui>(a), __lsx_vld(countdown, 0));
|
|
551
|
+
}
|
|
552
|
+
template <>
|
|
553
|
+
EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(const uint64_t& a) {
|
|
554
|
+
const uint64_t countdown[] = {0, 1};
|
|
555
|
+
return __lsx_vadd_d(pset1<Packet2ul>(a), __lsx_vld(countdown, 0));
|
|
556
|
+
}
|
|
557
|
+
template <>
|
|
558
|
+
EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
|
|
559
|
+
static const Packet4f countdown = {0.0f, 1.0f, 2.0f, 3.0f};
|
|
560
|
+
return __lsx_vfadd_s(pset1<Packet4f>(a), countdown);
|
|
561
|
+
}
|
|
562
|
+
template <>
|
|
563
|
+
EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
|
|
564
|
+
static const Packet2d countdown = {0.0f, 1.0f};
|
|
565
|
+
return __lsx_vfadd_d(pset1<Packet2d>(a), countdown);
|
|
566
|
+
}
|
|
567
|
+
|
|
568
|
+
template <>
|
|
569
|
+
EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) {
|
|
570
|
+
return __lsx_vadd_b(a, b);
|
|
571
|
+
}
|
|
572
|
+
template <>
|
|
573
|
+
EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) {
|
|
574
|
+
return __lsx_vadd_h(a, b);
|
|
575
|
+
}
|
|
576
|
+
template <>
|
|
577
|
+
EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
578
|
+
return __lsx_vadd_w(a, b);
|
|
579
|
+
}
|
|
580
|
+
template <>
|
|
581
|
+
EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) {
|
|
582
|
+
return __lsx_vadd_d(a, b);
|
|
583
|
+
}
|
|
584
|
+
template <>
|
|
585
|
+
EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
|
|
586
|
+
return __lsx_vadd_b(a, b);
|
|
587
|
+
}
|
|
588
|
+
template <>
|
|
589
|
+
EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) {
|
|
590
|
+
return __lsx_vadd_h(a, b);
|
|
591
|
+
}
|
|
592
|
+
template <>
|
|
593
|
+
EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
|
|
594
|
+
return __lsx_vadd_w(a, b);
|
|
595
|
+
}
|
|
596
|
+
template <>
|
|
597
|
+
EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
|
|
598
|
+
return __lsx_vadd_d(a, b);
|
|
599
|
+
}
|
|
600
|
+
template <>
|
|
601
|
+
EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
602
|
+
return __lsx_vfadd_s(a, b);
|
|
603
|
+
}
|
|
604
|
+
template <>
|
|
605
|
+
EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
606
|
+
return __lsx_vfadd_d(a, b);
|
|
607
|
+
}
|
|
608
|
+
|
|
609
|
+
template <>
|
|
610
|
+
EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) {
|
|
611
|
+
return __lsx_vsub_b(a, b);
|
|
612
|
+
}
|
|
613
|
+
template <>
|
|
614
|
+
EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) {
|
|
615
|
+
return __lsx_vsub_h(a, b);
|
|
616
|
+
}
|
|
617
|
+
template <>
|
|
618
|
+
EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
619
|
+
return __lsx_vsub_w(a, b);
|
|
620
|
+
}
|
|
621
|
+
template <>
|
|
622
|
+
EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) {
|
|
623
|
+
return __lsx_vsub_d(a, b);
|
|
624
|
+
}
|
|
625
|
+
template <>
|
|
626
|
+
EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
|
|
627
|
+
return __lsx_vsub_b(a, b);
|
|
628
|
+
}
|
|
629
|
+
template <>
|
|
630
|
+
EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) {
|
|
631
|
+
return __lsx_vsub_h(a, b);
|
|
632
|
+
}
|
|
633
|
+
template <>
|
|
634
|
+
EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
|
|
635
|
+
return __lsx_vsub_w(a, b);
|
|
636
|
+
}
|
|
637
|
+
template <>
|
|
638
|
+
EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
|
|
639
|
+
return __lsx_vsub_d(a, b);
|
|
640
|
+
}
|
|
641
|
+
template <>
|
|
642
|
+
EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
643
|
+
return __lsx_vfsub_s(a, b);
|
|
644
|
+
}
|
|
645
|
+
template <>
|
|
646
|
+
EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
647
|
+
return __lsx_vfsub_d(a, b);
|
|
648
|
+
}
|
|
649
|
+
|
|
650
|
+
template <>
|
|
651
|
+
EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
|
|
652
|
+
template <>
|
|
653
|
+
EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
654
|
+
const Packet4f mask =
|
|
655
|
+
make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f);
|
|
656
|
+
return padd(a, pxor(mask, b));
|
|
657
|
+
}
|
|
658
|
+
template <>
|
|
659
|
+
EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b);
|
|
660
|
+
template <>
|
|
661
|
+
EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
662
|
+
const Packet2d mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
|
|
663
|
+
return padd(a, pxor(mask, b));
|
|
664
|
+
}
|
|
665
|
+
|
|
666
|
+
template <>
|
|
667
|
+
EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
|
|
668
|
+
Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000), numext::bit_cast<float>(0x80000000),
|
|
669
|
+
numext::bit_cast<float>(0x80000000), numext::bit_cast<float>(0x80000000));
|
|
670
|
+
return (Packet4f)__lsx_vxor_v(numext::bit_cast<__m128i>(mask), numext::bit_cast<__m128i>(a));
|
|
671
|
+
}
|
|
672
|
+
template <>
|
|
673
|
+
EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
|
|
674
|
+
Packet2d mask =
|
|
675
|
+
make_packet2d(numext::bit_cast<double>(0x8000000000000000), numext::bit_cast<double>(0x8000000000000000));
|
|
676
|
+
return (Packet2d)__lsx_vxor_v(numext::bit_cast<__m128i>(mask), numext::bit_cast<__m128i>(a));
|
|
677
|
+
}
|
|
678
|
+
template <>
|
|
679
|
+
EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) {
|
|
680
|
+
return __lsx_vneg_b(a);
|
|
681
|
+
}
|
|
682
|
+
template <>
|
|
683
|
+
EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
|
|
684
|
+
return __lsx_vneg_h(a);
|
|
685
|
+
}
|
|
686
|
+
template <>
|
|
687
|
+
EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
|
|
688
|
+
return __lsx_vneg_w(a);
|
|
689
|
+
}
|
|
690
|
+
template <>
|
|
691
|
+
EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
|
|
692
|
+
return __lsx_vneg_d(a);
|
|
693
|
+
}
|
|
694
|
+
|
|
695
|
+
template <>
|
|
696
|
+
EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
|
|
697
|
+
return a;
|
|
698
|
+
}
|
|
699
|
+
template <>
|
|
700
|
+
EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
|
|
701
|
+
return a;
|
|
702
|
+
}
|
|
703
|
+
template <>
|
|
704
|
+
EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) {
|
|
705
|
+
return a;
|
|
706
|
+
}
|
|
707
|
+
template <>
|
|
708
|
+
EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) {
|
|
709
|
+
return a;
|
|
710
|
+
}
|
|
711
|
+
template <>
|
|
712
|
+
EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
|
|
713
|
+
return a;
|
|
714
|
+
}
|
|
715
|
+
template <>
|
|
716
|
+
EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) {
|
|
717
|
+
return a;
|
|
718
|
+
}
|
|
719
|
+
template <>
|
|
720
|
+
EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) {
|
|
721
|
+
return a;
|
|
722
|
+
}
|
|
723
|
+
template <>
|
|
724
|
+
EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) {
|
|
725
|
+
return a;
|
|
726
|
+
}
|
|
727
|
+
template <>
|
|
728
|
+
EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) {
|
|
729
|
+
return a;
|
|
730
|
+
}
|
|
731
|
+
template <>
|
|
732
|
+
EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) {
|
|
733
|
+
return a;
|
|
734
|
+
}
|
|
735
|
+
|
|
736
|
+
template <>
|
|
737
|
+
EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
738
|
+
return __lsx_vfmul_s(a, b);
|
|
739
|
+
}
|
|
740
|
+
template <>
|
|
741
|
+
EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
742
|
+
return __lsx_vfmul_d(a, b);
|
|
743
|
+
}
|
|
744
|
+
template <>
|
|
745
|
+
EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) {
|
|
746
|
+
return __lsx_vmul_b(a, b);
|
|
747
|
+
}
|
|
748
|
+
template <>
|
|
749
|
+
EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) {
|
|
750
|
+
return __lsx_vmul_h(a, b);
|
|
751
|
+
}
|
|
752
|
+
template <>
|
|
753
|
+
EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
754
|
+
return __lsx_vmul_w(a, b);
|
|
755
|
+
}
|
|
756
|
+
template <>
|
|
757
|
+
EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
|
|
758
|
+
return __lsx_vmul_d(a, b);
|
|
759
|
+
}
|
|
760
|
+
template <>
|
|
761
|
+
EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
|
|
762
|
+
return __lsx_vmul_b(a, b);
|
|
763
|
+
}
|
|
764
|
+
template <>
|
|
765
|
+
EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) {
|
|
766
|
+
return __lsx_vmul_h(a, b);
|
|
767
|
+
}
|
|
768
|
+
template <>
|
|
769
|
+
EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
|
|
770
|
+
return __lsx_vmul_w(a, b);
|
|
771
|
+
}
|
|
772
|
+
template <>
|
|
773
|
+
EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
|
|
774
|
+
return __lsx_vmul_d(a, b);
|
|
775
|
+
}
|
|
776
|
+
|
|
777
|
+
template <>
|
|
778
|
+
EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
779
|
+
return __lsx_vfdiv_s(a, b);
|
|
780
|
+
}
|
|
781
|
+
template <>
|
|
782
|
+
EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
783
|
+
return __lsx_vfdiv_d(a, b);
|
|
784
|
+
}
|
|
785
|
+
template <>
|
|
786
|
+
EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& a, const Packet8s& b) {
|
|
787
|
+
return __lsx_vdiv_h(a, b);
|
|
788
|
+
}
|
|
789
|
+
template <>
|
|
790
|
+
EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
791
|
+
return __lsx_vdiv_w(a, b);
|
|
792
|
+
}
|
|
793
|
+
template <>
|
|
794
|
+
EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& a, const Packet2l& b) {
|
|
795
|
+
return __lsx_vdiv_d(a, b);
|
|
796
|
+
}
|
|
797
|
+
template <>
|
|
798
|
+
EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& a, const Packet8us& b) {
|
|
799
|
+
return __lsx_vdiv_hu(a, b);
|
|
800
|
+
}
|
|
801
|
+
template <>
|
|
802
|
+
EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
|
|
803
|
+
return __lsx_vdiv_wu(a, b);
|
|
804
|
+
}
|
|
805
|
+
template <>
|
|
806
|
+
EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
|
|
807
|
+
return __lsx_vdiv_du(a, b);
|
|
808
|
+
}
|
|
809
|
+
|
|
810
|
+
template <>
|
|
811
|
+
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
|
|
812
|
+
return __lsx_vfmadd_s(a, b, c);
|
|
813
|
+
}
|
|
814
|
+
template <>
|
|
815
|
+
EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
|
|
816
|
+
return __lsx_vfmadd_d(a, b, c);
|
|
817
|
+
}
|
|
818
|
+
template <>
|
|
819
|
+
EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
|
|
820
|
+
return __lsx_vfmsub_s(a, b, c);
|
|
821
|
+
}
|
|
822
|
+
template <>
|
|
823
|
+
EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
|
|
824
|
+
return __lsx_vfmsub_d(a, b, c);
|
|
825
|
+
}
|
|
826
|
+
template <>
|
|
827
|
+
EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
|
|
828
|
+
return __lsx_vfnmsub_s(a, b, c);
|
|
829
|
+
}
|
|
830
|
+
template <>
|
|
831
|
+
EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
|
|
832
|
+
return __lsx_vfnmsub_d(a, b, c);
|
|
833
|
+
}
|
|
834
|
+
template <>
|
|
835
|
+
EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
|
|
836
|
+
return __lsx_vfnmadd_s(a, b, c);
|
|
837
|
+
}
|
|
838
|
+
template <>
|
|
839
|
+
EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
|
|
840
|
+
return __lsx_vfnmadd_d(a, b, c);
|
|
841
|
+
}
|
|
842
|
+
template <>
|
|
843
|
+
EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
|
|
844
|
+
return __lsx_vmadd_b(c, a, b);
|
|
845
|
+
}
|
|
846
|
+
template <>
|
|
847
|
+
EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
|
|
848
|
+
return __lsx_vmadd_h(c, a, b);
|
|
849
|
+
}
|
|
850
|
+
template <>
|
|
851
|
+
EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
|
|
852
|
+
return __lsx_vmadd_w(c, a, b);
|
|
853
|
+
}
|
|
854
|
+
template <>
|
|
855
|
+
EIGEN_STRONG_INLINE Packet2l pmadd(const Packet2l& a, const Packet2l& b, const Packet2l& c) {
|
|
856
|
+
return __lsx_vmadd_d(c, a, b);
|
|
857
|
+
}
|
|
858
|
+
template <>
|
|
859
|
+
EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c) {
|
|
860
|
+
return __lsx_vmadd_b(c, a, b);
|
|
861
|
+
}
|
|
862
|
+
template <>
|
|
863
|
+
EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) {
|
|
864
|
+
return __lsx_vmadd_h(c, a, b);
|
|
865
|
+
}
|
|
866
|
+
template <>
|
|
867
|
+
EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) {
|
|
868
|
+
return __lsx_vmadd_w(c, a, b);
|
|
869
|
+
}
|
|
870
|
+
template <>
|
|
871
|
+
EIGEN_STRONG_INLINE Packet2ul pmadd(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c) {
|
|
872
|
+
return __lsx_vmadd_d(c, a, b);
|
|
873
|
+
}
|
|
874
|
+
|
|
875
|
+
template <>
|
|
876
|
+
EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
877
|
+
return (Packet4f)__lsx_vand_v((__m128i)a, (__m128i)b);
|
|
878
|
+
}
|
|
879
|
+
template <>
|
|
880
|
+
EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
881
|
+
return (Packet2d)__lsx_vand_v((__m128i)a, (__m128i)b);
|
|
882
|
+
}
|
|
883
|
+
template <>
|
|
884
|
+
EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(const Packet16c& a, const Packet16c& b) {
|
|
885
|
+
return __lsx_vand_v(a, b);
|
|
886
|
+
}
|
|
887
|
+
template <>
|
|
888
|
+
EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(const Packet8s& a, const Packet8s& b) {
|
|
889
|
+
return __lsx_vand_v(a, b);
|
|
890
|
+
}
|
|
891
|
+
template <>
|
|
892
|
+
EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
893
|
+
return __lsx_vand_v(a, b);
|
|
894
|
+
}
|
|
895
|
+
template <>
|
|
896
|
+
EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) {
|
|
897
|
+
return __lsx_vand_v(a, b);
|
|
898
|
+
}
|
|
899
|
+
template <>
|
|
900
|
+
EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
|
|
901
|
+
return __lsx_vand_v(a, b);
|
|
902
|
+
}
|
|
903
|
+
template <>
|
|
904
|
+
EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) {
|
|
905
|
+
return __lsx_vand_v(a, b);
|
|
906
|
+
}
|
|
907
|
+
template <>
|
|
908
|
+
EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
|
|
909
|
+
return __lsx_vand_v(a, b);
|
|
910
|
+
}
|
|
911
|
+
template <>
|
|
912
|
+
EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
|
|
913
|
+
return __lsx_vand_v(a, b);
|
|
914
|
+
}
|
|
915
|
+
|
|
916
|
+
template <>
|
|
917
|
+
EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
918
|
+
return (Packet4f)__lsx_vor_v((__m128i)a, (__m128i)b);
|
|
919
|
+
}
|
|
920
|
+
template <>
|
|
921
|
+
EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
922
|
+
return (Packet2d)__lsx_vor_v((__m128i)a, (__m128i)b);
|
|
923
|
+
}
|
|
924
|
+
template <>
|
|
925
|
+
EIGEN_STRONG_INLINE Packet16c por<Packet16c>(const Packet16c& a, const Packet16c& b) {
|
|
926
|
+
return __lsx_vor_v(a, b);
|
|
927
|
+
}
|
|
928
|
+
template <>
|
|
929
|
+
EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) {
|
|
930
|
+
return __lsx_vor_v(a, b);
|
|
931
|
+
}
|
|
932
|
+
template <>
|
|
933
|
+
EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
934
|
+
return __lsx_vor_v(a, b);
|
|
935
|
+
}
|
|
936
|
+
template <>
|
|
937
|
+
EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b) {
|
|
938
|
+
return __lsx_vor_v(a, b);
|
|
939
|
+
}
|
|
940
|
+
template <>
|
|
941
|
+
EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
|
|
942
|
+
return __lsx_vor_v(a, b);
|
|
943
|
+
}
|
|
944
|
+
template <>
|
|
945
|
+
EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) {
|
|
946
|
+
return __lsx_vor_v(a, b);
|
|
947
|
+
}
|
|
948
|
+
template <>
|
|
949
|
+
EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
|
|
950
|
+
return __lsx_vor_v(a, b);
|
|
951
|
+
}
|
|
952
|
+
template <>
|
|
953
|
+
EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
|
|
954
|
+
return __lsx_vor_v(a, b);
|
|
955
|
+
}
|
|
956
|
+
|
|
957
|
+
template <>
|
|
958
|
+
EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
959
|
+
return (Packet4f)__lsx_vxor_v((__m128i)a, (__m128i)b);
|
|
960
|
+
}
|
|
961
|
+
template <>
|
|
962
|
+
EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
963
|
+
return (Packet2d)__lsx_vxor_v((__m128i)a, (__m128i)b);
|
|
964
|
+
}
|
|
965
|
+
template <>
|
|
966
|
+
EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(const Packet16c& a, const Packet16c& b) {
|
|
967
|
+
return __lsx_vxor_v(a, b);
|
|
968
|
+
}
|
|
969
|
+
template <>
|
|
970
|
+
EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(const Packet8s& a, const Packet8s& b) {
|
|
971
|
+
return __lsx_vxor_v(a, b);
|
|
972
|
+
}
|
|
973
|
+
template <>
|
|
974
|
+
EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
975
|
+
return __lsx_vxor_v(a, b);
|
|
976
|
+
}
|
|
977
|
+
template <>
|
|
978
|
+
EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b) {
|
|
979
|
+
return __lsx_vxor_v(a, b);
|
|
980
|
+
}
|
|
981
|
+
template <>
|
|
982
|
+
EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
|
|
983
|
+
return __lsx_vxor_v(a, b);
|
|
984
|
+
}
|
|
985
|
+
template <>
|
|
986
|
+
EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) {
|
|
987
|
+
return __lsx_vxor_v(a, b);
|
|
988
|
+
}
|
|
989
|
+
template <>
|
|
990
|
+
EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
|
|
991
|
+
return __lsx_vxor_v(a, b);
|
|
992
|
+
}
|
|
993
|
+
template <>
|
|
994
|
+
EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
|
|
995
|
+
return __lsx_vxor_v(a, b);
|
|
996
|
+
}
|
|
997
|
+
|
|
998
|
+
template <>
|
|
999
|
+
EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1000
|
+
return (Packet4f)__lsx_vandn_v((__m128i)b, (__m128i)a);
|
|
1001
|
+
}
|
|
1002
|
+
template <>
|
|
1003
|
+
EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
1004
|
+
return (Packet2d)__lsx_vandn_v((__m128i)b, (__m128i)a);
|
|
1005
|
+
}
|
|
1006
|
+
template <>
|
|
1007
|
+
EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(const Packet16c& a, const Packet16c& b) {
|
|
1008
|
+
return __lsx_vandn_v(b, a);
|
|
1009
|
+
}
|
|
1010
|
+
template <>
|
|
1011
|
+
EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(const Packet8s& a, const Packet8s& b) {
|
|
1012
|
+
return __lsx_vandn_v(b, a);
|
|
1013
|
+
}
|
|
1014
|
+
template <>
|
|
1015
|
+
EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
1016
|
+
return __lsx_vandn_v(b, a);
|
|
1017
|
+
}
|
|
1018
|
+
template <>
|
|
1019
|
+
EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b) {
|
|
1020
|
+
return __lsx_vandn_v(b, a);
|
|
1021
|
+
}
|
|
1022
|
+
template <>
|
|
1023
|
+
EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
|
|
1024
|
+
return __lsx_vandn_v(b, a);
|
|
1025
|
+
}
|
|
1026
|
+
template <>
|
|
1027
|
+
EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(const Packet8us& a, const Packet8us& b) {
|
|
1028
|
+
return __lsx_vandn_v(b, a);
|
|
1029
|
+
}
|
|
1030
|
+
template <>
|
|
1031
|
+
EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
|
|
1032
|
+
return __lsx_vandn_v(b, a);
|
|
1033
|
+
}
|
|
1034
|
+
template <>
|
|
1035
|
+
EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
|
|
1036
|
+
return __lsx_vandn_v(b, a);
|
|
1037
|
+
}
|
|
1038
|
+
|
|
1039
|
+
template <>
|
|
1040
|
+
EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1041
|
+
return (Packet4f)__lsx_vfcmp_cle_s(a, b);
|
|
1042
|
+
}
|
|
1043
|
+
template <>
|
|
1044
|
+
EIGEN_STRONG_INLINE Packet2d pcmp_le<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
1045
|
+
return (Packet2d)__lsx_vfcmp_cle_d(a, b);
|
|
1046
|
+
}
|
|
1047
|
+
template <>
|
|
1048
|
+
EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(const Packet16c& a, const Packet16c& b) {
|
|
1049
|
+
return __lsx_vsle_b(a, b);
|
|
1050
|
+
}
|
|
1051
|
+
template <>
|
|
1052
|
+
EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(const Packet8s& a, const Packet8s& b) {
|
|
1053
|
+
return __lsx_vsle_h(a, b);
|
|
1054
|
+
}
|
|
1055
|
+
template <>
|
|
1056
|
+
EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
1057
|
+
return __lsx_vsle_w(a, b);
|
|
1058
|
+
}
|
|
1059
|
+
template <>
|
|
1060
|
+
EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(const Packet2l& a, const Packet2l& b) {
|
|
1061
|
+
return __lsx_vsle_d(a, b);
|
|
1062
|
+
}
|
|
1063
|
+
template <>
|
|
1064
|
+
EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
|
|
1065
|
+
return __lsx_vsle_bu(a, b);
|
|
1066
|
+
}
|
|
1067
|
+
template <>
|
|
1068
|
+
EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(const Packet8us& a, const Packet8us& b) {
|
|
1069
|
+
return __lsx_vsle_hu(a, b);
|
|
1070
|
+
}
|
|
1071
|
+
template <>
|
|
1072
|
+
EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
|
|
1073
|
+
return __lsx_vsle_wu(a, b);
|
|
1074
|
+
}
|
|
1075
|
+
template <>
|
|
1076
|
+
EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
|
|
1077
|
+
return __lsx_vsle_du(a, b);
|
|
1078
|
+
}
|
|
1079
|
+
|
|
1080
|
+
template <>
|
|
1081
|
+
EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1082
|
+
return (Packet4f)__lsx_vfcmp_clt_s(a, b);
|
|
1083
|
+
}
|
|
1084
|
+
template <>
|
|
1085
|
+
EIGEN_STRONG_INLINE Packet2d pcmp_lt<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
1086
|
+
return (Packet2d)__lsx_vfcmp_clt_d(a, b);
|
|
1087
|
+
}
|
|
1088
|
+
template <>
|
|
1089
|
+
EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(const Packet16c& a, const Packet16c& b) {
|
|
1090
|
+
return __lsx_vslt_b(a, b);
|
|
1091
|
+
}
|
|
1092
|
+
template <>
|
|
1093
|
+
EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(const Packet8s& a, const Packet8s& b) {
|
|
1094
|
+
return __lsx_vslt_h(a, b);
|
|
1095
|
+
}
|
|
1096
|
+
template <>
|
|
1097
|
+
EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
1098
|
+
return __lsx_vslt_w(a, b);
|
|
1099
|
+
}
|
|
1100
|
+
template <>
|
|
1101
|
+
EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(const Packet2l& a, const Packet2l& b) {
|
|
1102
|
+
return __lsx_vslt_d(a, b);
|
|
1103
|
+
}
|
|
1104
|
+
template <>
|
|
1105
|
+
EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
|
|
1106
|
+
return __lsx_vslt_bu(a, b);
|
|
1107
|
+
}
|
|
1108
|
+
template <>
|
|
1109
|
+
EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(const Packet8us& a, const Packet8us& b) {
|
|
1110
|
+
return __lsx_vslt_hu(a, b);
|
|
1111
|
+
}
|
|
1112
|
+
template <>
|
|
1113
|
+
EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
|
|
1114
|
+
return __lsx_vslt_wu(a, b);
|
|
1115
|
+
}
|
|
1116
|
+
template <>
|
|
1117
|
+
EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
|
|
1118
|
+
return __lsx_vslt_du(a, b);
|
|
1119
|
+
}
|
|
1120
|
+
|
|
1121
|
+
template <>
|
|
1122
|
+
EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1123
|
+
return (Packet4f)__lsx_vfcmp_sult_s(a, b);
|
|
1124
|
+
}
|
|
1125
|
+
template <>
|
|
1126
|
+
EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
1127
|
+
return (Packet2d)__lsx_vfcmp_sult_d(a, b);
|
|
1128
|
+
}
|
|
1129
|
+
|
|
1130
|
+
template <>
|
|
1131
|
+
EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1132
|
+
return (Packet4f)__lsx_vfcmp_seq_s(a, b);
|
|
1133
|
+
}
|
|
1134
|
+
template <>
|
|
1135
|
+
EIGEN_STRONG_INLINE Packet2d pcmp_eq<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
1136
|
+
return (Packet2d)__lsx_vfcmp_seq_d(a, b);
|
|
1137
|
+
}
|
|
1138
|
+
template <>
|
|
1139
|
+
EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(const Packet16c& a, const Packet16c& b) {
|
|
1140
|
+
return __lsx_vseq_b(a, b);
|
|
1141
|
+
}
|
|
1142
|
+
template <>
|
|
1143
|
+
EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(const Packet8s& a, const Packet8s& b) {
|
|
1144
|
+
return __lsx_vseq_h(a, b);
|
|
1145
|
+
}
|
|
1146
|
+
template <>
|
|
1147
|
+
EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
1148
|
+
return __lsx_vseq_w(a, b);
|
|
1149
|
+
}
|
|
1150
|
+
template <>
|
|
1151
|
+
EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(const Packet2l& a, const Packet2l& b) {
|
|
1152
|
+
return __lsx_vseq_d(a, b);
|
|
1153
|
+
}
|
|
1154
|
+
template <>
|
|
1155
|
+
EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
|
|
1156
|
+
return __lsx_vseq_b(a, b);
|
|
1157
|
+
}
|
|
1158
|
+
template <>
|
|
1159
|
+
EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(const Packet8us& a, const Packet8us& b) {
|
|
1160
|
+
return __lsx_vseq_h(a, b);
|
|
1161
|
+
}
|
|
1162
|
+
template <>
|
|
1163
|
+
EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
|
|
1164
|
+
return __lsx_vseq_w(a, b);
|
|
1165
|
+
}
|
|
1166
|
+
template <>
|
|
1167
|
+
EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
|
|
1168
|
+
return __lsx_vseq_d(a, b);
|
|
1169
|
+
}
|
|
1170
|
+
|
|
1171
|
+
template <>
|
|
1172
|
+
EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) {
|
|
1173
|
+
return __lsx_vmin_b(a, b);
|
|
1174
|
+
}
|
|
1175
|
+
template <>
|
|
1176
|
+
EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) {
|
|
1177
|
+
return __lsx_vmin_h(a, b);
|
|
1178
|
+
}
|
|
1179
|
+
template <>
|
|
1180
|
+
EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
1181
|
+
return __lsx_vmin_w(a, b);
|
|
1182
|
+
}
|
|
1183
|
+
template <>
|
|
1184
|
+
EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
|
|
1185
|
+
return __lsx_vmin_d(a, b);
|
|
1186
|
+
}
|
|
1187
|
+
template <>
|
|
1188
|
+
EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
|
|
1189
|
+
return __lsx_vmin_bu(a, b);
|
|
1190
|
+
}
|
|
1191
|
+
template <>
|
|
1192
|
+
EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) {
|
|
1193
|
+
return __lsx_vmin_hu(a, b);
|
|
1194
|
+
}
|
|
1195
|
+
template <>
|
|
1196
|
+
EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
|
|
1197
|
+
return __lsx_vmin_wu(a, b);
|
|
1198
|
+
}
|
|
1199
|
+
template <>
|
|
1200
|
+
EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
|
|
1201
|
+
return __lsx_vmin_du(a, b);
|
|
1202
|
+
}
|
|
1203
|
+
|
|
1204
|
+
template <>
|
|
1205
|
+
EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) {
|
|
1206
|
+
return __lsx_vmax_b(a, b);
|
|
1207
|
+
}
|
|
1208
|
+
template <>
|
|
1209
|
+
EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) {
|
|
1210
|
+
return __lsx_vmax_h(a, b);
|
|
1211
|
+
}
|
|
1212
|
+
template <>
|
|
1213
|
+
EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
1214
|
+
return __lsx_vmax_w(a, b);
|
|
1215
|
+
}
|
|
1216
|
+
template <>
|
|
1217
|
+
EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
|
|
1218
|
+
return __lsx_vmax_d(a, b);
|
|
1219
|
+
}
|
|
1220
|
+
template <>
|
|
1221
|
+
EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
|
|
1222
|
+
return __lsx_vmax_bu(a, b);
|
|
1223
|
+
}
|
|
1224
|
+
template <>
|
|
1225
|
+
EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) {
|
|
1226
|
+
return __lsx_vmax_hu(a, b);
|
|
1227
|
+
}
|
|
1228
|
+
template <>
|
|
1229
|
+
EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
|
|
1230
|
+
return __lsx_vmax_wu(a, b);
|
|
1231
|
+
}
|
|
1232
|
+
template <>
|
|
1233
|
+
EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
|
|
1234
|
+
return __lsx_vmax_du(a, b);
|
|
1235
|
+
}
|
|
1236
|
+
|
|
1237
|
+
template <>
|
|
1238
|
+
EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1239
|
+
Packet4i aNaN = __lsx_vfcmp_cun_s(a, a);
|
|
1240
|
+
Packet4i aMinOrNaN = por<Packet4i>(__lsx_vfcmp_clt_s(a, b), aNaN);
|
|
1241
|
+
return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMinOrNaN);
|
|
1242
|
+
}
|
|
1243
|
+
template <>
|
|
1244
|
+
EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
1245
|
+
Packet2l aNaN = __lsx_vfcmp_cun_d(a, a);
|
|
1246
|
+
Packet2l aMinOrNaN = por<Packet2l>(__lsx_vfcmp_clt_d(a, b), aNaN);
|
|
1247
|
+
return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMinOrNaN);
|
|
1248
|
+
}
|
|
1249
|
+
template <>
|
|
1250
|
+
EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1251
|
+
Packet4i aNaN = __lsx_vfcmp_cun_s(a, a);
|
|
1252
|
+
Packet4i aMaxOrNaN = por<Packet4i>(__lsx_vfcmp_clt_s(b, a), aNaN);
|
|
1253
|
+
return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMaxOrNaN);
|
|
1254
|
+
}
|
|
1255
|
+
template <>
|
|
1256
|
+
EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
1257
|
+
Packet2l aNaN = __lsx_vfcmp_cun_d(a, a);
|
|
1258
|
+
Packet2l aMaxOrNaN = por<Packet2l>(__lsx_vfcmp_clt_d(b, a), aNaN);
|
|
1259
|
+
return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMaxOrNaN);
|
|
1260
|
+
}
|
|
1261
|
+
|
|
1262
|
+
template <int N>
|
|
1263
|
+
EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(const Packet16c& a) {
|
|
1264
|
+
return __lsx_vsrai_b((__m128i)a, N);
|
|
1265
|
+
}
|
|
1266
|
+
template <int N>
|
|
1267
|
+
EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(const Packet8s& a) {
|
|
1268
|
+
return __lsx_vsrai_h((__m128i)a, N);
|
|
1269
|
+
}
|
|
1270
|
+
template <int N>
|
|
1271
|
+
EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
|
|
1272
|
+
return __lsx_vsrai_w((__m128i)a, N);
|
|
1273
|
+
}
|
|
1274
|
+
template <int N>
|
|
1275
|
+
EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) {
|
|
1276
|
+
return __lsx_vsrai_d((__m128i)a, N);
|
|
1277
|
+
}
|
|
1278
|
+
template <int N>
|
|
1279
|
+
EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(const Packet16uc& a) {
|
|
1280
|
+
return __lsx_vsrli_b((__m128i)a, N);
|
|
1281
|
+
}
|
|
1282
|
+
template <int N>
|
|
1283
|
+
EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(const Packet8us& a) {
|
|
1284
|
+
return __lsx_vsrli_h((__m128i)a, N);
|
|
1285
|
+
}
|
|
1286
|
+
template <int N>
|
|
1287
|
+
EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(const Packet4ui& a) {
|
|
1288
|
+
return __lsx_vsrli_w((__m128i)a, N);
|
|
1289
|
+
}
|
|
1290
|
+
template <int N>
|
|
1291
|
+
EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(const Packet2ul& a) {
|
|
1292
|
+
return __lsx_vsrli_d((__m128i)a, N);
|
|
1293
|
+
}
|
|
1294
|
+
|
|
1295
|
+
template <int N>
|
|
1296
|
+
EIGEN_STRONG_INLINE Packet16c plogical_shift_right(const Packet16c& a) {
|
|
1297
|
+
return __lsx_vsrli_b((__m128i)a, N);
|
|
1298
|
+
}
|
|
1299
|
+
template <int N>
|
|
1300
|
+
EIGEN_STRONG_INLINE Packet8s plogical_shift_right(const Packet8s& a) {
|
|
1301
|
+
return __lsx_vsrli_h((__m128i)a, N);
|
|
1302
|
+
}
|
|
1303
|
+
template <int N>
|
|
1304
|
+
EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
|
|
1305
|
+
return __lsx_vsrli_w((__m128i)a, N);
|
|
1306
|
+
}
|
|
1307
|
+
template <int N>
|
|
1308
|
+
EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
|
|
1309
|
+
return __lsx_vsrli_d((__m128i)a, N);
|
|
1310
|
+
}
|
|
1311
|
+
template <int N>
|
|
1312
|
+
EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(const Packet16uc& a) {
|
|
1313
|
+
return __lsx_vsrli_b((__m128i)a, N);
|
|
1314
|
+
}
|
|
1315
|
+
template <int N>
|
|
1316
|
+
EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a) {
|
|
1317
|
+
return __lsx_vsrli_h((__m128i)a, N);
|
|
1318
|
+
}
|
|
1319
|
+
template <int N>
|
|
1320
|
+
EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
|
|
1321
|
+
return __lsx_vsrli_w((__m128i)a, N);
|
|
1322
|
+
}
|
|
1323
|
+
template <int N>
|
|
1324
|
+
EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(const Packet2ul& a) {
|
|
1325
|
+
return __lsx_vsrli_d((__m128i)a, N);
|
|
1326
|
+
}
|
|
1327
|
+
|
|
1328
|
+
template <int N>
|
|
1329
|
+
EIGEN_STRONG_INLINE Packet16c plogical_shift_left(const Packet16c& a) {
|
|
1330
|
+
return __lsx_vslli_b((__m128i)a, N);
|
|
1331
|
+
}
|
|
1332
|
+
template <int N>
|
|
1333
|
+
EIGEN_STRONG_INLINE Packet8s plogical_shift_left(const Packet8s& a) {
|
|
1334
|
+
return __lsx_vslli_h((__m128i)a, N);
|
|
1335
|
+
}
|
|
1336
|
+
template <int N>
|
|
1337
|
+
EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
|
|
1338
|
+
return __lsx_vslli_w((__m128i)a, N);
|
|
1339
|
+
}
|
|
1340
|
+
template <int N>
|
|
1341
|
+
EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
|
|
1342
|
+
return __lsx_vslli_d((__m128i)a, N);
|
|
1343
|
+
}
|
|
1344
|
+
template <int N>
|
|
1345
|
+
EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(const Packet16uc& a) {
|
|
1346
|
+
return __lsx_vslli_b((__m128i)a, N);
|
|
1347
|
+
}
|
|
1348
|
+
template <int N>
|
|
1349
|
+
EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a) {
|
|
1350
|
+
return __lsx_vslli_h((__m128i)a, N);
|
|
1351
|
+
}
|
|
1352
|
+
template <int N>
|
|
1353
|
+
EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
|
|
1354
|
+
return __lsx_vslli_w((__m128i)a, N);
|
|
1355
|
+
}
|
|
1356
|
+
template <int N>
|
|
1357
|
+
EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(const Packet2ul& a) {
|
|
1358
|
+
return __lsx_vslli_d((__m128i)a, N);
|
|
1359
|
+
}
|
|
1360
|
+
|
|
1361
|
+
template <>
|
|
1362
|
+
EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
|
|
1363
|
+
return (Packet4f)__lsx_vbitclri_w((__m128i)a, 31);
|
|
1364
|
+
}
|
|
1365
|
+
template <>
|
|
1366
|
+
EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
|
|
1367
|
+
return (Packet2d)__lsx_vbitclri_d((__m128i)a, 63);
|
|
1368
|
+
}
|
|
1369
|
+
template <>
|
|
1370
|
+
EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) {
|
|
1371
|
+
return __lsx_vabsd_b(a, pzero(a));
|
|
1372
|
+
}
|
|
1373
|
+
template <>
|
|
1374
|
+
EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) {
|
|
1375
|
+
return __lsx_vabsd_h(a, pzero(a));
|
|
1376
|
+
}
|
|
1377
|
+
template <>
|
|
1378
|
+
EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
|
|
1379
|
+
return __lsx_vabsd_w(a, pzero(a));
|
|
1380
|
+
}
|
|
1381
|
+
template <>
|
|
1382
|
+
EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
|
|
1383
|
+
return __lsx_vabsd_d(a, pzero(a));
|
|
1384
|
+
}
|
|
1385
|
+
template <>
|
|
1386
|
+
EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) {
|
|
1387
|
+
return a;
|
|
1388
|
+
}
|
|
1389
|
+
template <>
|
|
1390
|
+
EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) {
|
|
1391
|
+
return a;
|
|
1392
|
+
}
|
|
1393
|
+
template <>
|
|
1394
|
+
EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) {
|
|
1395
|
+
return a;
|
|
1396
|
+
}
|
|
1397
|
+
template <>
|
|
1398
|
+
EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) {
|
|
1399
|
+
return a;
|
|
1400
|
+
}
|
|
1401
|
+
|
|
1402
|
+
template <>
|
|
1403
|
+
EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
|
|
1404
|
+
EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__lsx_vld(from, 0);
|
|
1405
|
+
}
|
|
1406
|
+
template <>
|
|
1407
|
+
EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
|
|
1408
|
+
EIGEN_DEBUG_ALIGNED_LOAD return (Packet2d)__lsx_vld(from, 0);
|
|
1409
|
+
}
|
|
1410
|
+
template <>
|
|
1411
|
+
EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from) {
|
|
1412
|
+
EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
|
|
1413
|
+
}
|
|
1414
|
+
template <>
|
|
1415
|
+
EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from) {
|
|
1416
|
+
EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
|
|
1417
|
+
}
|
|
1418
|
+
template <>
|
|
1419
|
+
EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
|
|
1420
|
+
EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
|
|
1421
|
+
}
|
|
1422
|
+
template <>
|
|
1423
|
+
EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
|
|
1424
|
+
EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
|
|
1425
|
+
}
|
|
1426
|
+
template <>
|
|
1427
|
+
EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from) {
|
|
1428
|
+
EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
|
|
1429
|
+
}
|
|
1430
|
+
template <>
|
|
1431
|
+
EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from) {
|
|
1432
|
+
EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
|
|
1433
|
+
}
|
|
1434
|
+
template <>
|
|
1435
|
+
EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
|
|
1436
|
+
EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
|
|
1437
|
+
}
|
|
1438
|
+
template <>
|
|
1439
|
+
EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from) {
|
|
1440
|
+
EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
|
|
1441
|
+
}
|
|
1442
|
+
|
|
1443
|
+
template <>
|
|
1444
|
+
EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
|
|
1445
|
+
EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__lsx_vld(from, 0);
|
|
1446
|
+
}
|
|
1447
|
+
template <>
|
|
1448
|
+
EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
|
|
1449
|
+
EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__lsx_vld(from, 0);
|
|
1450
|
+
}
|
|
1451
|
+
template <>
|
|
1452
|
+
EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from) {
|
|
1453
|
+
EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
|
|
1454
|
+
}
|
|
1455
|
+
template <>
|
|
1456
|
+
EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const int16_t* from) {
|
|
1457
|
+
EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
|
|
1458
|
+
}
|
|
1459
|
+
template <>
|
|
1460
|
+
EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
|
|
1461
|
+
EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
|
|
1462
|
+
}
|
|
1463
|
+
template <>
|
|
1464
|
+
EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from) {
|
|
1465
|
+
EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
|
|
1466
|
+
}
|
|
1467
|
+
template <>
|
|
1468
|
+
EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from) {
|
|
1469
|
+
EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
|
|
1470
|
+
}
|
|
1471
|
+
template <>
|
|
1472
|
+
EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const uint16_t* from) {
|
|
1473
|
+
EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
|
|
1474
|
+
}
|
|
1475
|
+
template <>
|
|
1476
|
+
EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from) {
|
|
1477
|
+
EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
|
|
1478
|
+
}
|
|
1479
|
+
template <>
|
|
1480
|
+
EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(const uint64_t* from) {
|
|
1481
|
+
EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
|
|
1482
|
+
}
|
|
1483
|
+
|
|
1484
|
+
template <>
|
|
1485
|
+
EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
|
|
1486
|
+
float f0 = from[0], f1 = from[1];
|
|
1487
|
+
return make_packet4f(f0, f0, f1, f1);
|
|
1488
|
+
}
|
|
1489
|
+
template <>
|
|
1490
|
+
EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
|
|
1491
|
+
return pset1<Packet2d>(from[0]);
|
|
1492
|
+
}
|
|
1493
|
+
template <>
|
|
1494
|
+
EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t* from) {
|
|
1495
|
+
Packet16c tmp = pload<Packet16c>(from);
|
|
1496
|
+
return __lsx_vilvl_b(tmp, tmp);
|
|
1497
|
+
}
|
|
1498
|
+
template <>
|
|
1499
|
+
EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const int16_t* from) {
|
|
1500
|
+
Packet8s tmp = pload<Packet8s>(from);
|
|
1501
|
+
return __lsx_vilvl_h(tmp, tmp);
|
|
1502
|
+
}
|
|
1503
|
+
template <>
|
|
1504
|
+
EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
|
|
1505
|
+
Packet4i tmp = pload<Packet4i>(from);
|
|
1506
|
+
return __lsx_vilvl_w(tmp, tmp);
|
|
1507
|
+
}
|
|
1508
|
+
template <>
|
|
1509
|
+
EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from) {
|
|
1510
|
+
return pset1<Packet2l>(from[0]);
|
|
1511
|
+
}
|
|
1512
|
+
template <>
|
|
1513
|
+
EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t* from) {
|
|
1514
|
+
Packet16uc tmp = pload<Packet16uc>(from);
|
|
1515
|
+
return __lsx_vilvl_b(tmp, tmp);
|
|
1516
|
+
}
|
|
1517
|
+
template <>
|
|
1518
|
+
EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const uint16_t* from) {
|
|
1519
|
+
Packet8us tmp = pload<Packet8us>(from);
|
|
1520
|
+
return __lsx_vilvl_h(tmp, tmp);
|
|
1521
|
+
}
|
|
1522
|
+
template <>
|
|
1523
|
+
EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from) {
|
|
1524
|
+
Packet4ui tmp = pload<Packet4ui>(from);
|
|
1525
|
+
return __lsx_vilvl_w(tmp, tmp);
|
|
1526
|
+
}
|
|
1527
|
+
template <>
|
|
1528
|
+
EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(const uint64_t* from) {
|
|
1529
|
+
return pset1<Packet2ul>(from[0]);
|
|
1530
|
+
}
|
|
1531
|
+
|
|
1532
|
+
template <>
|
|
1533
|
+
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
|
|
1534
|
+
EIGEN_DEBUG_ALIGNED_STORE __lsx_vst(from, to, 0);
|
|
1535
|
+
}
|
|
1536
|
+
template <>
|
|
1537
|
+
EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
|
|
1538
|
+
EIGEN_DEBUG_ALIGNED_STORE __lsx_vst(from, to, 0);
|
|
1539
|
+
}
|
|
1540
|
+
template <>
|
|
1541
|
+
EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from) {
|
|
1542
|
+
EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
|
|
1543
|
+
}
|
|
1544
|
+
template <>
|
|
1545
|
+
EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from) {
|
|
1546
|
+
EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
|
|
1547
|
+
}
|
|
1548
|
+
template <>
|
|
1549
|
+
EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
|
|
1550
|
+
EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
|
|
1551
|
+
}
|
|
1552
|
+
template <>
|
|
1553
|
+
EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
|
|
1554
|
+
EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
|
|
1555
|
+
}
|
|
1556
|
+
template <>
|
|
1557
|
+
EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from) {
|
|
1558
|
+
EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
|
|
1559
|
+
}
|
|
1560
|
+
template <>
|
|
1561
|
+
EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from) {
|
|
1562
|
+
EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
|
|
1563
|
+
}
|
|
1564
|
+
template <>
|
|
1565
|
+
EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
|
|
1566
|
+
EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
|
|
1567
|
+
}
|
|
1568
|
+
template <>
|
|
1569
|
+
EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from) {
|
|
1570
|
+
EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
|
|
1571
|
+
}
|
|
1572
|
+
|
|
1573
|
+
template <>
|
|
1574
|
+
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
|
|
1575
|
+
EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst(from, to, 0);
|
|
1576
|
+
}
|
|
1577
|
+
template <>
|
|
1578
|
+
EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
|
|
1579
|
+
EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst(from, to, 0);
|
|
1580
|
+
}
|
|
1581
|
+
|
|
1582
|
+
template <>
|
|
1583
|
+
EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from) {
|
|
1584
|
+
EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
|
|
1585
|
+
}
|
|
1586
|
+
template <>
|
|
1587
|
+
EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from) {
|
|
1588
|
+
EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
|
|
1589
|
+
}
|
|
1590
|
+
template <>
|
|
1591
|
+
EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
|
|
1592
|
+
EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
|
|
1593
|
+
}
|
|
1594
|
+
template <>
|
|
1595
|
+
EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) {
|
|
1596
|
+
EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
|
|
1597
|
+
}
|
|
1598
|
+
template <>
|
|
1599
|
+
EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from) {
|
|
1600
|
+
EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
|
|
1601
|
+
}
|
|
1602
|
+
template <>
|
|
1603
|
+
EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet8us& from) {
|
|
1604
|
+
EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
|
|
1605
|
+
}
|
|
1606
|
+
template <>
|
|
1607
|
+
EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) {
|
|
1608
|
+
EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
|
|
1609
|
+
}
|
|
1610
|
+
template <>
|
|
1611
|
+
EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from) {
|
|
1612
|
+
EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
|
|
1613
|
+
}
|
|
1614
|
+
|
|
1615
|
+
template <>
|
|
1616
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
|
|
1617
|
+
Packet4f v = {from[0], from[stride], from[2 * stride], from[3 * stride]};
|
|
1618
|
+
return v;
|
|
1619
|
+
}
|
|
1620
|
+
template <>
|
|
1621
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
|
|
1622
|
+
Packet2d v = {from[0], from[stride]};
|
|
1623
|
+
return v;
|
|
1624
|
+
}
|
|
1625
|
+
template <>
|
|
1626
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride) {
|
|
1627
|
+
int8_t v[16] __attribute__((aligned(16)));
|
|
1628
|
+
v[0] = from[0];
|
|
1629
|
+
v[1] = from[stride];
|
|
1630
|
+
v[2] = from[2 * stride];
|
|
1631
|
+
v[3] = from[3 * stride];
|
|
1632
|
+
v[4] = from[4 * stride];
|
|
1633
|
+
v[5] = from[5 * stride];
|
|
1634
|
+
v[6] = from[6 * stride];
|
|
1635
|
+
v[7] = from[7 * stride];
|
|
1636
|
+
v[8] = from[8 * stride];
|
|
1637
|
+
v[9] = from[9 * stride];
|
|
1638
|
+
v[10] = from[10 * stride];
|
|
1639
|
+
v[11] = from[11 * stride];
|
|
1640
|
+
v[12] = from[12 * stride];
|
|
1641
|
+
v[13] = from[13 * stride];
|
|
1642
|
+
v[14] = from[14 * stride];
|
|
1643
|
+
v[15] = from[15 * stride];
|
|
1644
|
+
return __lsx_vld(v, 0);
|
|
1645
|
+
}
|
|
1646
|
+
template <>
|
|
1647
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride) {
|
|
1648
|
+
int16_t v[8] __attribute__((aligned(16)));
|
|
1649
|
+
v[0] = from[0];
|
|
1650
|
+
v[1] = from[stride];
|
|
1651
|
+
v[2] = from[2 * stride];
|
|
1652
|
+
v[3] = from[3 * stride];
|
|
1653
|
+
v[4] = from[4 * stride];
|
|
1654
|
+
v[5] = from[5 * stride];
|
|
1655
|
+
v[6] = from[6 * stride];
|
|
1656
|
+
v[7] = from[7 * stride];
|
|
1657
|
+
return __lsx_vld(v, 0);
|
|
1658
|
+
}
|
|
1659
|
+
template <>
|
|
1660
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
|
|
1661
|
+
int32_t v[4] __attribute__((aligned(16)));
|
|
1662
|
+
v[0] = from[0];
|
|
1663
|
+
v[1] = from[stride];
|
|
1664
|
+
v[2] = from[2 * stride];
|
|
1665
|
+
v[3] = from[3 * stride];
|
|
1666
|
+
return __lsx_vld(v, 0);
|
|
1667
|
+
}
|
|
1668
|
+
template <>
|
|
1669
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride) {
|
|
1670
|
+
int64_t v[2] __attribute__((aligned(16)));
|
|
1671
|
+
v[0] = from[0];
|
|
1672
|
+
v[1] = from[stride];
|
|
1673
|
+
return __lsx_vld(v, 0);
|
|
1674
|
+
}
|
|
1675
|
+
template <>
|
|
1676
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride) {
|
|
1677
|
+
uint8_t v[16] __attribute__((aligned(16)));
|
|
1678
|
+
v[0] = from[0];
|
|
1679
|
+
v[1] = from[stride];
|
|
1680
|
+
v[2] = from[2 * stride];
|
|
1681
|
+
v[3] = from[3 * stride];
|
|
1682
|
+
v[4] = from[4 * stride];
|
|
1683
|
+
v[5] = from[5 * stride];
|
|
1684
|
+
v[6] = from[6 * stride];
|
|
1685
|
+
v[7] = from[7 * stride];
|
|
1686
|
+
v[8] = from[8 * stride];
|
|
1687
|
+
v[9] = from[9 * stride];
|
|
1688
|
+
v[10] = from[10 * stride];
|
|
1689
|
+
v[11] = from[11 * stride];
|
|
1690
|
+
v[12] = from[12 * stride];
|
|
1691
|
+
v[13] = from[13 * stride];
|
|
1692
|
+
v[14] = from[14 * stride];
|
|
1693
|
+
v[15] = from[15 * stride];
|
|
1694
|
+
return __lsx_vld(v, 0);
|
|
1695
|
+
}
|
|
1696
|
+
template <>
|
|
1697
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride) {
|
|
1698
|
+
uint16_t v[8] __attribute__((aligned(16)));
|
|
1699
|
+
v[0] = from[0];
|
|
1700
|
+
v[1] = from[stride];
|
|
1701
|
+
v[2] = from[2 * stride];
|
|
1702
|
+
v[3] = from[3 * stride];
|
|
1703
|
+
v[4] = from[4 * stride];
|
|
1704
|
+
v[5] = from[5 * stride];
|
|
1705
|
+
v[6] = from[6 * stride];
|
|
1706
|
+
v[7] = from[7 * stride];
|
|
1707
|
+
return __lsx_vld(v, 0);
|
|
1708
|
+
}
|
|
1709
|
+
template <>
|
|
1710
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
|
|
1711
|
+
uint32_t v[4] __attribute__((aligned(16)));
|
|
1712
|
+
v[0] = from[0];
|
|
1713
|
+
v[1] = from[stride];
|
|
1714
|
+
v[2] = from[2 * stride];
|
|
1715
|
+
v[3] = from[3 * stride];
|
|
1716
|
+
return __lsx_vld(v, 0);
|
|
1717
|
+
}
|
|
1718
|
+
template <>
|
|
1719
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride) {
|
|
1720
|
+
uint64_t v[2] __attribute__((aligned(16)));
|
|
1721
|
+
v[0] = from[0];
|
|
1722
|
+
v[1] = from[stride];
|
|
1723
|
+
return __lsx_vld(v, 0);
|
|
1724
|
+
}
|
|
1725
|
+
|
|
1726
|
+
template <>
|
|
1727
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
|
|
1728
|
+
__lsx_vstelm_w(from, to, 0, 0);
|
|
1729
|
+
__lsx_vstelm_w(from, to + stride * 1, 0, 1);
|
|
1730
|
+
__lsx_vstelm_w(from, to + stride * 2, 0, 2);
|
|
1731
|
+
__lsx_vstelm_w(from, to + stride * 3, 0, 3);
|
|
1732
|
+
}
|
|
1733
|
+
template <>
|
|
1734
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
|
|
1735
|
+
__lsx_vstelm_d(from, to, 0, 0);
|
|
1736
|
+
__lsx_vstelm_d(from, to + stride, 0, 1);
|
|
1737
|
+
}
|
|
1738
|
+
template <>
|
|
1739
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from,
|
|
1740
|
+
Index stride) {
|
|
1741
|
+
__lsx_vstelm_b((__m128i)from, to, 0, 0);
|
|
1742
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 1, 0, 1);
|
|
1743
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 2, 0, 2);
|
|
1744
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 3, 0, 3);
|
|
1745
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 4, 0, 4);
|
|
1746
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 5, 0, 5);
|
|
1747
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 6, 0, 6);
|
|
1748
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 7, 0, 7);
|
|
1749
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 8, 0, 8);
|
|
1750
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 9, 0, 9);
|
|
1751
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 10, 0, 10);
|
|
1752
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 11, 0, 11);
|
|
1753
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 12, 0, 12);
|
|
1754
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 13, 0, 13);
|
|
1755
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 14, 0, 14);
|
|
1756
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 15, 0, 15);
|
|
1757
|
+
}
|
|
1758
|
+
template <>
|
|
1759
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from,
|
|
1760
|
+
Index stride) {
|
|
1761
|
+
__lsx_vstelm_h((__m128i)from, to, 0, 0);
|
|
1762
|
+
__lsx_vstelm_h((__m128i)from, to + stride * 1, 0, 1);
|
|
1763
|
+
__lsx_vstelm_h((__m128i)from, to + stride * 2, 0, 2);
|
|
1764
|
+
__lsx_vstelm_h((__m128i)from, to + stride * 3, 0, 3);
|
|
1765
|
+
__lsx_vstelm_h((__m128i)from, to + stride * 4, 0, 4);
|
|
1766
|
+
__lsx_vstelm_h((__m128i)from, to + stride * 5, 0, 5);
|
|
1767
|
+
__lsx_vstelm_h((__m128i)from, to + stride * 6, 0, 6);
|
|
1768
|
+
__lsx_vstelm_h((__m128i)from, to + stride * 7, 0, 7);
|
|
1769
|
+
}
|
|
1770
|
+
template <>
|
|
1771
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
|
|
1772
|
+
Index stride) {
|
|
1773
|
+
__lsx_vstelm_w((__m128i)from, to, 0, 0);
|
|
1774
|
+
__lsx_vstelm_w((__m128i)from, to + stride * 1, 0, 1);
|
|
1775
|
+
__lsx_vstelm_w((__m128i)from, to + stride * 2, 0, 2);
|
|
1776
|
+
__lsx_vstelm_w((__m128i)from, to + stride * 3, 0, 3);
|
|
1777
|
+
}
|
|
1778
|
+
template <>
|
|
1779
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from,
|
|
1780
|
+
Index stride) {
|
|
1781
|
+
__lsx_vstelm_d((__m128i)from, to, 0, 0);
|
|
1782
|
+
__lsx_vstelm_d((__m128i)from, to + stride * 1, 0, 1);
|
|
1783
|
+
}
|
|
1784
|
+
template <>
|
|
1785
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from,
|
|
1786
|
+
Index stride) {
|
|
1787
|
+
__lsx_vstelm_b((__m128i)from, to, 0, 0);
|
|
1788
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 1, 0, 1);
|
|
1789
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 2, 0, 2);
|
|
1790
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 3, 0, 3);
|
|
1791
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 4, 0, 4);
|
|
1792
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 5, 0, 5);
|
|
1793
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 6, 0, 6);
|
|
1794
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 7, 0, 7);
|
|
1795
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 8, 0, 8);
|
|
1796
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 9, 0, 9);
|
|
1797
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 10, 0, 10);
|
|
1798
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 11, 0, 11);
|
|
1799
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 12, 0, 12);
|
|
1800
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 13, 0, 13);
|
|
1801
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 14, 0, 14);
|
|
1802
|
+
__lsx_vstelm_b((__m128i)from, to + stride * 15, 0, 15);
|
|
1803
|
+
}
|
|
1804
|
+
template <>
|
|
1805
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from,
|
|
1806
|
+
Index stride) {
|
|
1807
|
+
__lsx_vstelm_h((__m128i)from, to, 0, 0);
|
|
1808
|
+
__lsx_vstelm_h((__m128i)from, to + stride * 1, 0, 1);
|
|
1809
|
+
__lsx_vstelm_h((__m128i)from, to + stride * 2, 0, 2);
|
|
1810
|
+
__lsx_vstelm_h((__m128i)from, to + stride * 3, 0, 3);
|
|
1811
|
+
__lsx_vstelm_h((__m128i)from, to + stride * 4, 0, 4);
|
|
1812
|
+
__lsx_vstelm_h((__m128i)from, to + stride * 5, 0, 5);
|
|
1813
|
+
__lsx_vstelm_h((__m128i)from, to + stride * 6, 0, 6);
|
|
1814
|
+
__lsx_vstelm_h((__m128i)from, to + stride * 7, 0, 7);
|
|
1815
|
+
}
|
|
1816
|
+
template <>
|
|
1817
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from,
|
|
1818
|
+
Index stride) {
|
|
1819
|
+
__lsx_vstelm_w((__m128i)from, to, 0, 0);
|
|
1820
|
+
__lsx_vstelm_w((__m128i)from, to + stride * 1, 0, 1);
|
|
1821
|
+
__lsx_vstelm_w((__m128i)from, to + stride * 2, 0, 2);
|
|
1822
|
+
__lsx_vstelm_w((__m128i)from, to + stride * 3, 0, 3);
|
|
1823
|
+
}
|
|
1824
|
+
template <>
|
|
1825
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from,
|
|
1826
|
+
Index stride) {
|
|
1827
|
+
__lsx_vstelm_d((__m128i)from, to, 0, 0);
|
|
1828
|
+
__lsx_vstelm_d((__m128i)from, to + stride * 1, 0, 1);
|
|
1829
|
+
}
|
|
1830
|
+
|
|
1831
|
+
template <>
|
|
1832
|
+
EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
|
|
1833
|
+
__builtin_prefetch(addr);
|
|
1834
|
+
}
|
|
1835
|
+
template <>
|
|
1836
|
+
EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
|
|
1837
|
+
__builtin_prefetch(addr);
|
|
1838
|
+
}
|
|
1839
|
+
template <>
|
|
1840
|
+
EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) {
|
|
1841
|
+
__builtin_prefetch(addr);
|
|
1842
|
+
}
|
|
1843
|
+
template <>
|
|
1844
|
+
EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) {
|
|
1845
|
+
__builtin_prefetch(addr);
|
|
1846
|
+
}
|
|
1847
|
+
template <>
|
|
1848
|
+
EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
|
|
1849
|
+
__builtin_prefetch(addr);
|
|
1850
|
+
}
|
|
1851
|
+
template <>
|
|
1852
|
+
EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) {
|
|
1853
|
+
__builtin_prefetch(addr);
|
|
1854
|
+
}
|
|
1855
|
+
template <>
|
|
1856
|
+
EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) {
|
|
1857
|
+
__builtin_prefetch(addr);
|
|
1858
|
+
}
|
|
1859
|
+
template <>
|
|
1860
|
+
EIGEN_STRONG_INLINE void prefetch<uint16_t>(const uint16_t* addr) {
|
|
1861
|
+
__builtin_prefetch(addr);
|
|
1862
|
+
}
|
|
1863
|
+
template <>
|
|
1864
|
+
EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
|
|
1865
|
+
__builtin_prefetch(addr);
|
|
1866
|
+
}
|
|
1867
|
+
template <>
|
|
1868
|
+
EIGEN_STRONG_INLINE void prefetch<uint64_t>(const uint64_t* addr) {
|
|
1869
|
+
__builtin_prefetch(addr);
|
|
1870
|
+
}
|
|
1871
|
+
|
|
1872
|
+
template <>
|
|
1873
|
+
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
|
|
1874
|
+
float v;
|
|
1875
|
+
__lsx_vstelm_w(a, &v, 0, 0);
|
|
1876
|
+
return v;
|
|
1877
|
+
}
|
|
1878
|
+
template <>
|
|
1879
|
+
EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
|
|
1880
|
+
double v;
|
|
1881
|
+
__lsx_vstelm_d(a, &v, 0, 0);
|
|
1882
|
+
return v;
|
|
1883
|
+
}
|
|
1884
|
+
|
|
1885
|
+
template <>
|
|
1886
|
+
EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a) {
|
|
1887
|
+
return (int8_t)__lsx_vpickve2gr_b((__m128i)a, 0);
|
|
1888
|
+
}
|
|
1889
|
+
template <>
|
|
1890
|
+
EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(const Packet8s& a) {
|
|
1891
|
+
return (int16_t)__lsx_vpickve2gr_h((__m128i)a, 0);
|
|
1892
|
+
}
|
|
1893
|
+
template <>
|
|
1894
|
+
EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
|
|
1895
|
+
return __lsx_vpickve2gr_w((__m128i)a, 0);
|
|
1896
|
+
}
|
|
1897
|
+
template <>
|
|
1898
|
+
EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
|
|
1899
|
+
return __lsx_vpickve2gr_d((__m128i)a, 0);
|
|
1900
|
+
}
|
|
1901
|
+
template <>
|
|
1902
|
+
EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a) {
|
|
1903
|
+
return (uint8_t)__lsx_vpickve2gr_bu((__m128i)a, 0);
|
|
1904
|
+
}
|
|
1905
|
+
template <>
|
|
1906
|
+
EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(const Packet8us& a) {
|
|
1907
|
+
return (uint16_t)__lsx_vpickve2gr_hu((__m128i)a, 0);
|
|
1908
|
+
}
|
|
1909
|
+
template <>
|
|
1910
|
+
EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
|
|
1911
|
+
return __lsx_vpickve2gr_wu((__m128i)a, 0);
|
|
1912
|
+
}
|
|
1913
|
+
template <>
|
|
1914
|
+
EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(const Packet2ul& a) {
|
|
1915
|
+
return __lsx_vpickve2gr_du((__m128i)a, 0);
|
|
1916
|
+
}
|
|
1917
|
+
|
|
1918
|
+
template <>
|
|
1919
|
+
EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
|
|
1920
|
+
return (Packet4f)__lsx_vshuf4i_w(a, 0x1B);
|
|
1921
|
+
}
|
|
1922
|
+
template <>
|
|
1923
|
+
EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
|
|
1924
|
+
return (Packet2d)__lsx_vshuf4i_d(a, a, 0x1);
|
|
1925
|
+
}
|
|
1926
|
+
template <>
|
|
1927
|
+
EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
|
|
1928
|
+
return __lsx_vshuf4i_b(__lsx_vshuf4i_w((__m128i)a, 0x1B), 0x1B);
|
|
1929
|
+
}
|
|
1930
|
+
template <>
|
|
1931
|
+
EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) {
|
|
1932
|
+
return __lsx_vshuf4i_h(__lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1), 0x1B);
|
|
1933
|
+
}
|
|
1934
|
+
template <>
|
|
1935
|
+
EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
|
|
1936
|
+
return __lsx_vshuf4i_w((__m128i)a, 0x1B);
|
|
1937
|
+
}
|
|
1938
|
+
template <>
|
|
1939
|
+
EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) {
|
|
1940
|
+
return __lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1);
|
|
1941
|
+
}
|
|
1942
|
+
template <>
|
|
1943
|
+
EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
|
|
1944
|
+
return __lsx_vshuf4i_b(__lsx_vshuf4i_w((__m128i)a, 0x1B), 0x1B);
|
|
1945
|
+
}
|
|
1946
|
+
template <>
|
|
1947
|
+
EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
|
|
1948
|
+
return __lsx_vshuf4i_h(__lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1), 0x1B);
|
|
1949
|
+
}
|
|
1950
|
+
template <>
|
|
1951
|
+
EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
|
|
1952
|
+
return __lsx_vshuf4i_w((__m128i)a, 0x1B);
|
|
1953
|
+
}
|
|
1954
|
+
template <>
|
|
1955
|
+
EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a) {
|
|
1956
|
+
return __lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1);
|
|
1957
|
+
}
|
|
1958
|
+
|
|
1959
|
+
template <>
|
|
1960
|
+
EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
|
|
1961
|
+
Packet4f tmp = __lsx_vfadd_s(a, vec4f_swizzle1(a, 2, 3, 2, 3));
|
|
1962
|
+
return pfirst<Packet4f>(__lsx_vfadd_s(tmp, vec4f_swizzle1(tmp, 1, 1, 1, 1)));
|
|
1963
|
+
}
|
|
1964
|
+
template <>
|
|
1965
|
+
EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
|
|
1966
|
+
return pfirst<Packet2d>(__lsx_vfadd_d(a, preverse(a)));
|
|
1967
|
+
}
|
|
1968
|
+
template <>
|
|
1969
|
+
EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a) {
|
|
1970
|
+
Packet8s tmp1 = __lsx_vhaddw_h_b(a, a);
|
|
1971
|
+
Packet4i tmp2 = __lsx_vhaddw_w_h(tmp1, tmp1);
|
|
1972
|
+
Packet2l tmp3 = __lsx_vhaddw_d_w(tmp2, tmp2);
|
|
1973
|
+
return (int8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp3, tmp3), 0);
|
|
1974
|
+
}
|
|
1975
|
+
template <>
|
|
1976
|
+
EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a) {
|
|
1977
|
+
Packet4i tmp1 = __lsx_vhaddw_w_h(a, a);
|
|
1978
|
+
Packet2l tmp2 = __lsx_vhaddw_d_w(tmp1, tmp1);
|
|
1979
|
+
return (int16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp2, tmp2), 0);
|
|
1980
|
+
}
|
|
1981
|
+
template <>
|
|
1982
|
+
EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
|
|
1983
|
+
Packet2l tmp = __lsx_vhaddw_d_w(a, a);
|
|
1984
|
+
return (int32_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp, tmp), 0);
|
|
1985
|
+
}
|
|
1986
|
+
template <>
|
|
1987
|
+
EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
|
|
1988
|
+
return (int64_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(a, a), 0);
|
|
1989
|
+
}
|
|
1990
|
+
template <>
|
|
1991
|
+
EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a) {
|
|
1992
|
+
Packet8us tmp1 = __lsx_vhaddw_hu_bu(a, a);
|
|
1993
|
+
Packet4ui tmp2 = __lsx_vhaddw_wu_hu(tmp1, tmp1);
|
|
1994
|
+
Packet2ul tmp3 = __lsx_vhaddw_du_wu(tmp2, tmp2);
|
|
1995
|
+
return (uint8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp3, tmp3), 0);
|
|
1996
|
+
}
|
|
1997
|
+
template <>
|
|
1998
|
+
EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a) {
|
|
1999
|
+
Packet4ui tmp1 = __lsx_vhaddw_wu_hu(a, a);
|
|
2000
|
+
Packet2ul tmp2 = __lsx_vhaddw_du_wu(tmp1, tmp1);
|
|
2001
|
+
return (uint16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp2, tmp2), 0);
|
|
2002
|
+
}
|
|
2003
|
+
template <>
|
|
2004
|
+
EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
|
|
2005
|
+
Packet2ul tmp = __lsx_vhaddw_du_wu(a, a);
|
|
2006
|
+
return (uint32_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp, tmp), 0);
|
|
2007
|
+
}
|
|
2008
|
+
template <>
|
|
2009
|
+
EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) {
|
|
2010
|
+
return (uint64_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(a, a), 0);
|
|
2011
|
+
}
|
|
2012
|
+
|
|
2013
|
+
template <>
|
|
2014
|
+
EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
|
|
2015
|
+
Packet4f tmp = __lsx_vfmul_s(a, vec4f_swizzle1(a, 2, 3, 2, 3));
|
|
2016
|
+
return pfirst<Packet4f>(__lsx_vfmul_s(tmp, vec4f_swizzle1(tmp, 1, 1, 1, 1)));
|
|
2017
|
+
}
|
|
2018
|
+
template <>
|
|
2019
|
+
EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
|
|
2020
|
+
return pfirst<Packet2d>(__lsx_vfmul_d(a, preverse(a)));
|
|
2021
|
+
}
|
|
2022
|
+
template <>
|
|
2023
|
+
EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a) {
|
|
2024
|
+
Packet8s tmp1 = __lsx_vmulwev_h_b(a, preverse(a));
|
|
2025
|
+
Packet4i tmp2 = __lsx_vmulwev_w_h(tmp1, preverse(tmp1));
|
|
2026
|
+
Packet2l tmp3 = __lsx_vmulwev_d_w(tmp2, preverse(tmp2));
|
|
2027
|
+
return (int8_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp3, preverse(tmp3)), 0);
|
|
2028
|
+
}
|
|
2029
|
+
template <>
|
|
2030
|
+
EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(const Packet8s& a) {
|
|
2031
|
+
Packet4i tmp1 = __lsx_vmulwev_w_h(a, preverse(a));
|
|
2032
|
+
Packet2l tmp2 = __lsx_vmulwev_d_w(tmp1, preverse(tmp1));
|
|
2033
|
+
return (int16_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp2, preverse(tmp2)), 0);
|
|
2034
|
+
}
|
|
2035
|
+
template <>
|
|
2036
|
+
EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
|
|
2037
|
+
Packet2l tmp = __lsx_vmulwev_d_w(a, preverse(a));
|
|
2038
|
+
return (int32_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp, preverse(tmp)), 0);
|
|
2039
|
+
}
|
|
2040
|
+
template <>
|
|
2041
|
+
EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a) {
|
|
2042
|
+
return (int64_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(a, preverse(a)), 0);
|
|
2043
|
+
}
|
|
2044
|
+
template <>
|
|
2045
|
+
EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a) {
|
|
2046
|
+
Packet8us tmp1 = __lsx_vmulwev_h_bu(a, preverse(a));
|
|
2047
|
+
Packet4ui tmp2 = __lsx_vmulwev_w_h(tmp1, preverse(tmp1));
|
|
2048
|
+
Packet2ul tmp3 = __lsx_vmulwev_d_w(tmp2, preverse(tmp2));
|
|
2049
|
+
return (uint8_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp3, preverse(tmp3)), 0);
|
|
2050
|
+
}
|
|
2051
|
+
template <>
|
|
2052
|
+
EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a) {
|
|
2053
|
+
Packet4ui tmp1 = __lsx_vmulwev_w_hu(a, preverse(a));
|
|
2054
|
+
Packet2ul tmp2 = __lsx_vmulwev_d_w(tmp1, preverse(tmp1));
|
|
2055
|
+
return (uint16_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp2, preverse(tmp2)), 0);
|
|
2056
|
+
}
|
|
2057
|
+
template <>
|
|
2058
|
+
EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a) {
|
|
2059
|
+
Packet2ul tmp = __lsx_vmulwev_d_wu(a, preverse(a));
|
|
2060
|
+
return (uint32_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp, preverse(tmp)), 0);
|
|
2061
|
+
}
|
|
2062
|
+
template <>
|
|
2063
|
+
EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a) {
|
|
2064
|
+
return (uint64_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_du(a, preverse(a)), 0);
|
|
2065
|
+
}
|
|
2066
|
+
|
|
2067
|
+
template <>
|
|
2068
|
+
EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
|
|
2069
|
+
Packet4f tmp = __lsx_vfmin_s(a, (Packet4f)__lsx_vshuf4i_w(a, 0x4E));
|
|
2070
|
+
return pfirst(__lsx_vfmin_s(tmp, (Packet4f)__lsx_vshuf4i_w(tmp, 0xB1)));
|
|
2071
|
+
}
|
|
2072
|
+
template <>
|
|
2073
|
+
EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
|
|
2074
|
+
return pfirst(__lsx_vfmin_d(a, preverse(a)));
|
|
2075
|
+
}
|
|
2076
|
+
template <>
|
|
2077
|
+
EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a) {
|
|
2078
|
+
Packet16c tmp1 = __lsx_vmin_b(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
|
|
2079
|
+
Packet16c tmp2 = __lsx_vmin_b(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
|
|
2080
|
+
Packet16c tmp3 = __lsx_vmin_b(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
|
|
2081
|
+
return pfirst((Packet16c)__lsx_vmin_b(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
|
|
2082
|
+
}
|
|
2083
|
+
template <>
|
|
2084
|
+
EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a) {
|
|
2085
|
+
Packet8s tmp1 = __lsx_vmin_h(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
|
|
2086
|
+
Packet8s tmp2 = __lsx_vmin_h(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
|
|
2087
|
+
return pfirst((Packet8s)__lsx_vmin_h(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
|
|
2088
|
+
}
|
|
2089
|
+
template <>
|
|
2090
|
+
EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
|
|
2091
|
+
Packet4i tmp = __lsx_vmin_w(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
|
|
2092
|
+
return pfirst((Packet4i)__lsx_vmin_w(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
|
|
2093
|
+
}
|
|
2094
|
+
template <>
|
|
2095
|
+
EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(const Packet2l& a) {
|
|
2096
|
+
return pfirst((Packet2l)__lsx_vmin_d(a, preverse(a)));
|
|
2097
|
+
}
|
|
2098
|
+
template <>
|
|
2099
|
+
EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a) {
|
|
2100
|
+
Packet16uc tmp1 = __lsx_vmin_bu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
|
|
2101
|
+
Packet16uc tmp2 = __lsx_vmin_bu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
|
|
2102
|
+
Packet16uc tmp3 = __lsx_vmin_bu(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
|
|
2103
|
+
return pfirst((Packet16uc)__lsx_vmin_bu(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
|
|
2104
|
+
}
|
|
2105
|
+
template <>
|
|
2106
|
+
EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a) {
|
|
2107
|
+
Packet8us tmp1 = __lsx_vmin_hu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
|
|
2108
|
+
Packet8us tmp2 = __lsx_vmin_hu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
|
|
2109
|
+
return pfirst((Packet8us)__lsx_vmin_hu(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
|
|
2110
|
+
}
|
|
2111
|
+
template <>
|
|
2112
|
+
EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
|
|
2113
|
+
Packet4ui tmp = __lsx_vmin_wu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
|
|
2114
|
+
return pfirst((Packet4ui)__lsx_vmin_wu(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
|
|
2115
|
+
}
|
|
2116
|
+
template <>
|
|
2117
|
+
EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(const Packet2ul& a) {
|
|
2118
|
+
return pfirst((Packet2ul)__lsx_vmin_du(a, preverse(a)));
|
|
2119
|
+
}
|
|
2120
|
+
|
|
2121
|
+
template <>
|
|
2122
|
+
EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
|
|
2123
|
+
Packet4f tmp = __lsx_vfmax_s(a, (Packet4f)__lsx_vshuf4i_w(a, 0x4E));
|
|
2124
|
+
return pfirst(__lsx_vfmax_s(tmp, (Packet4f)__lsx_vshuf4i_w(tmp, 0xB1)));
|
|
2125
|
+
}
|
|
2126
|
+
template <>
|
|
2127
|
+
EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
|
|
2128
|
+
return pfirst(__lsx_vfmax_d(a, preverse(a)));
|
|
2129
|
+
}
|
|
2130
|
+
template <>
|
|
2131
|
+
EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a) {
|
|
2132
|
+
Packet16c tmp1 = __lsx_vmax_b(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
|
|
2133
|
+
Packet16c tmp2 = __lsx_vmax_b(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
|
|
2134
|
+
Packet16c tmp3 = __lsx_vmax_b(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
|
|
2135
|
+
return pfirst((Packet16c)__lsx_vmax_b(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
|
|
2136
|
+
}
|
|
2137
|
+
template <>
|
|
2138
|
+
EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a) {
|
|
2139
|
+
Packet8s tmp1 = __lsx_vmax_h(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
|
|
2140
|
+
Packet8s tmp2 = __lsx_vmax_h(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
|
|
2141
|
+
return pfirst((Packet8s)__lsx_vmax_h(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
|
|
2142
|
+
}
|
|
2143
|
+
template <>
|
|
2144
|
+
EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
|
|
2145
|
+
Packet4i tmp = __lsx_vmax_w(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
|
|
2146
|
+
return pfirst((Packet4i)__lsx_vmax_w(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
|
|
2147
|
+
}
|
|
2148
|
+
template <>
|
|
2149
|
+
EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(const Packet2l& a) {
|
|
2150
|
+
return pfirst((Packet2l)__lsx_vmax_d(a, preverse(a)));
|
|
2151
|
+
}
|
|
2152
|
+
template <>
|
|
2153
|
+
EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a) {
|
|
2154
|
+
Packet16uc tmp1 = __lsx_vmax_bu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
|
|
2155
|
+
Packet16uc tmp2 = __lsx_vmax_bu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
|
|
2156
|
+
Packet16uc tmp3 = __lsx_vmax_bu(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
|
|
2157
|
+
return pfirst((Packet16uc)__lsx_vmax_bu(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
|
|
2158
|
+
}
|
|
2159
|
+
template <>
|
|
2160
|
+
EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a) {
|
|
2161
|
+
Packet8us tmp1 = __lsx_vmax_hu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
|
|
2162
|
+
Packet8us tmp2 = __lsx_vmax_hu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
|
|
2163
|
+
return pfirst((Packet8us)__lsx_vmax_hu(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
|
|
2164
|
+
}
|
|
2165
|
+
template <>
|
|
2166
|
+
EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
|
|
2167
|
+
Packet4ui tmp = __lsx_vmax_wu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
|
|
2168
|
+
return pfirst((Packet4ui)__lsx_vmax_wu(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
|
|
2169
|
+
}
|
|
2170
|
+
template <>
|
|
2171
|
+
EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a) {
|
|
2172
|
+
return pfirst((Packet2ul)__lsx_vmax_du(a, preverse(a)));
|
|
2173
|
+
}
|
|
2174
|
+
|
|
2175
|
+
template <>
|
|
2176
|
+
EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
|
|
2177
|
+
return __lsx_vfsqrt_s(a);
|
|
2178
|
+
}
|
|
2179
|
+
template <>
|
|
2180
|
+
EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) {
|
|
2181
|
+
return __lsx_vfsqrt_d(a);
|
|
2182
|
+
}
|
|
2183
|
+
|
|
2184
|
+
// In-place transpose of a 4x4 block of floats.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
  // View the rows as integer vectors; the interleaves only move bits.
  const __m128i row0 = (__m128i)kernel.packet[0];
  const __m128i row1 = (__m128i)kernel.packet[1];
  const __m128i row2 = (__m128i)kernel.packet[2];
  const __m128i row3 = (__m128i)kernel.packet[3];

  // 32-bit interleave of row pairs (0,1) and (2,3).
  const __m128i lo01 = __lsx_vilvl_w(row1, row0);
  const __m128i hi01 = __lsx_vilvh_w(row1, row0);
  const __m128i lo23 = __lsx_vilvl_w(row3, row2);
  const __m128i hi23 = __lsx_vilvh_w(row3, row2);

  // 64-bit interleave assembles the output rows.
  kernel.packet[0] = (Packet4f)__lsx_vilvl_d(lo23, lo01);
  kernel.packet[1] = (Packet4f)__lsx_vilvh_d(lo23, lo01);
  kernel.packet[2] = (Packet4f)__lsx_vilvl_d(hi23, hi01);
  kernel.packet[3] = (Packet4f)__lsx_vilvh_d(hi23, hi01);
}
|
|
2195
|
+
// In-place transpose of a 2x2 block of doubles.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
  // Snapshot both rows before overwriting either of them.
  const __m128i row0 = (__m128i)kernel.packet[0];
  const __m128i row1 = (__m128i)kernel.packet[1];
  kernel.packet[0] = (Packet2d)__lsx_vilvl_d(row1, row0);
  kernel.packet[1] = (Packet2d)__lsx_vilvh_d(row1, row0);
}
|
|
2200
|
+
// In-place transpose of a 16x16 block of signed bytes.
// Four interleave rounds are applied with element width doubling each round
// (8 -> 16 -> 32 -> 64 bits).
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
  // Round 1: byte interleave of adjacent row pairs.
  __m128i bytes[16];
  for (int i = 0; i < 8; ++i) {
    bytes[2 * i] = __lsx_vilvl_b(kernel.packet[2 * i + 1], kernel.packet[2 * i]);
    bytes[2 * i + 1] = __lsx_vilvh_b(kernel.packet[2 * i + 1], kernel.packet[2 * i]);
  }

  // Round 2: halfword interleave inside each group of four vectors.
  __m128i halves[16];
  for (int g = 0; g < 16; g += 4) {
    halves[g] = __lsx_vilvl_h(bytes[g + 2], bytes[g]);
    halves[g + 1] = __lsx_vilvh_h(bytes[g + 2], bytes[g]);
    halves[g + 2] = __lsx_vilvl_h(bytes[g + 3], bytes[g + 1]);
    halves[g + 3] = __lsx_vilvh_h(bytes[g + 3], bytes[g + 1]);
  }

  // Round 3: word interleave inside each group of eight vectors.
  __m128i words[16];
  for (int g = 0; g < 16; g += 8) {
    for (int j = 0; j < 4; ++j) {
      words[g + 2 * j] = __lsx_vilvl_w(halves[g + 4 + j], halves[g + j]);
      words[g + 2 * j + 1] = __lsx_vilvh_w(halves[g + 4 + j], halves[g + j]);
    }
  }

  // Round 4: doubleword interleave across the two halves writes the result.
  for (int j = 0; j < 8; ++j) {
    kernel.packet[2 * j] = __lsx_vilvl_d(words[8 + j], words[j]);
    kernel.packet[2 * j + 1] = __lsx_vilvh_d(words[8 + j], words[j]);
  }
}
|
|
2269
|
+
// In-place rearrangement of 8 packets of signed bytes using byte, halfword
// and word interleaves (three rounds).
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 8>& kernel) {
  // Round 1: byte interleave of adjacent packet pairs.
  __m128i bytes[8];
  for (int i = 0; i < 4; ++i) {
    bytes[2 * i] = __lsx_vilvl_b(kernel.packet[2 * i + 1], kernel.packet[2 * i]);
    bytes[2 * i + 1] = __lsx_vilvh_b(kernel.packet[2 * i + 1], kernel.packet[2 * i]);
  }

  // Round 2: halfword interleave inside each group of four vectors.
  __m128i halves[8];
  for (int g = 0; g < 8; g += 4) {
    halves[g] = __lsx_vilvl_h(bytes[g + 2], bytes[g]);
    halves[g + 1] = __lsx_vilvh_h(bytes[g + 2], bytes[g]);
    halves[g + 2] = __lsx_vilvl_h(bytes[g + 3], bytes[g + 1]);
    halves[g + 3] = __lsx_vilvh_h(bytes[g + 3], bytes[g + 1]);
  }

  // Round 3: word interleave across the two groups writes the result.
  for (int j = 0; j < 4; ++j) {
    kernel.packet[2 * j] = __lsx_vilvl_w(halves[4 + j], halves[j]);
    kernel.packet[2 * j + 1] = __lsx_vilvh_w(halves[4 + j], halves[j]);
  }
}
|
|
2297
|
+
// In-place rearrangement of 4 packets of signed bytes using one byte
// interleave round followed by one halfword interleave round.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
  const __m128i lo01 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
  const __m128i hi01 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
  const __m128i lo23 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
  const __m128i hi23 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);

  kernel.packet[0] = __lsx_vilvl_h(lo23, lo01);
  kernel.packet[1] = __lsx_vilvh_h(lo23, lo01);
  kernel.packet[2] = __lsx_vilvl_h(hi23, hi01);
  kernel.packet[3] = __lsx_vilvh_h(hi23, hi01);
}
|
|
2308
|
+
// In-place transpose of an 8x8 block of signed 16-bit values.
// Three interleave rounds: halfword, word, then doubleword.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
  // Round 1: halfword interleave of adjacent row pairs.
  __m128i halves[8];
  for (int i = 0; i < 4; ++i) {
    halves[2 * i] = __lsx_vilvl_h(kernel.packet[2 * i + 1], kernel.packet[2 * i]);
    halves[2 * i + 1] = __lsx_vilvh_h(kernel.packet[2 * i + 1], kernel.packet[2 * i]);
  }

  // Round 2: word interleave inside each group of four vectors.
  __m128i words[8];
  for (int g = 0; g < 8; g += 4) {
    words[g] = __lsx_vilvl_w(halves[g + 2], halves[g]);
    words[g + 1] = __lsx_vilvh_w(halves[g + 2], halves[g]);
    words[g + 2] = __lsx_vilvl_w(halves[g + 3], halves[g + 1]);
    words[g + 3] = __lsx_vilvh_w(halves[g + 3], halves[g + 1]);
  }

  // Round 3: doubleword interleave across the two groups writes the result.
  for (int j = 0; j < 4; ++j) {
    kernel.packet[2 * j] = __lsx_vilvl_d(words[4 + j], words[j]);
    kernel.packet[2 * j + 1] = __lsx_vilvh_d(words[4 + j], words[j]);
  }
}
|
|
2336
|
+
// In-place rearrangement of 4 packets of signed 16-bit values using one
// halfword interleave round followed by one word interleave round.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
  const __m128i lo01 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
  const __m128i hi01 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
  const __m128i lo23 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
  const __m128i hi23 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);

  kernel.packet[0] = __lsx_vilvl_w(lo23, lo01);
  kernel.packet[1] = __lsx_vilvh_w(lo23, lo01);
  kernel.packet[2] = __lsx_vilvl_w(hi23, hi01);
  kernel.packet[3] = __lsx_vilvh_w(hi23, hi01);
}
|
|
2347
|
+
// In-place transpose of a 4x4 block of signed 32-bit integers.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
  // 32-bit interleave of row pairs (0,1) and (2,3).
  const __m128i lo01 = __lsx_vilvl_w(kernel.packet[1], kernel.packet[0]);
  const __m128i hi01 = __lsx_vilvh_w(kernel.packet[1], kernel.packet[0]);
  const __m128i lo23 = __lsx_vilvl_w(kernel.packet[3], kernel.packet[2]);
  const __m128i hi23 = __lsx_vilvh_w(kernel.packet[3], kernel.packet[2]);

  // 64-bit interleave assembles the output rows.
  kernel.packet[0] = __lsx_vilvl_d(lo23, lo01);
  kernel.packet[1] = __lsx_vilvh_d(lo23, lo01);
  kernel.packet[2] = __lsx_vilvl_d(hi23, hi01);
  kernel.packet[3] = __lsx_vilvh_d(hi23, hi01);
}
|
|
2358
|
+
// In-place transpose of a 2x2 block of signed 64-bit integers.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
  // Snapshot both rows before overwriting either of them.
  const __m128i row0 = kernel.packet[0];
  const __m128i row1 = kernel.packet[1];
  kernel.packet[0] = __lsx_vilvl_d(row1, row0);
  kernel.packet[1] = __lsx_vilvh_d(row1, row0);
}
|
|
2363
|
+
// In-place transpose of a 16x16 block of unsigned bytes.
// Four interleave rounds are applied with element width doubling each round
// (8 -> 16 -> 32 -> 64 bits).
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
  // Round 1: byte interleave of adjacent row pairs.
  __m128i bytes[16];
  for (int i = 0; i < 8; ++i) {
    bytes[2 * i] = __lsx_vilvl_b(kernel.packet[2 * i + 1], kernel.packet[2 * i]);
    bytes[2 * i + 1] = __lsx_vilvh_b(kernel.packet[2 * i + 1], kernel.packet[2 * i]);
  }

  // Round 2: halfword interleave inside each group of four vectors.
  __m128i halves[16];
  for (int g = 0; g < 16; g += 4) {
    halves[g] = __lsx_vilvl_h(bytes[g + 2], bytes[g]);
    halves[g + 1] = __lsx_vilvh_h(bytes[g + 2], bytes[g]);
    halves[g + 2] = __lsx_vilvl_h(bytes[g + 3], bytes[g + 1]);
    halves[g + 3] = __lsx_vilvh_h(bytes[g + 3], bytes[g + 1]);
  }

  // Round 3: word interleave inside each group of eight vectors.
  __m128i words[16];
  for (int g = 0; g < 16; g += 8) {
    for (int j = 0; j < 4; ++j) {
      words[g + 2 * j] = __lsx_vilvl_w(halves[g + 4 + j], halves[g + j]);
      words[g + 2 * j + 1] = __lsx_vilvh_w(halves[g + 4 + j], halves[g + j]);
    }
  }

  // Round 4: doubleword interleave across the two halves writes the result.
  for (int j = 0; j < 8; ++j) {
    kernel.packet[2 * j] = __lsx_vilvl_d(words[8 + j], words[j]);
    kernel.packet[2 * j + 1] = __lsx_vilvh_d(words[8 + j], words[j]);
  }
}
|
|
2432
|
+
// In-place rearrangement of 8 packets of unsigned bytes using byte, halfword
// and word interleaves (three rounds).
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 8>& kernel) {
  // Round 1: byte interleave of adjacent packet pairs.
  __m128i bytes[8];
  for (int i = 0; i < 4; ++i) {
    bytes[2 * i] = __lsx_vilvl_b(kernel.packet[2 * i + 1], kernel.packet[2 * i]);
    bytes[2 * i + 1] = __lsx_vilvh_b(kernel.packet[2 * i + 1], kernel.packet[2 * i]);
  }

  // Round 2: halfword interleave inside each group of four vectors.
  __m128i halves[8];
  for (int g = 0; g < 8; g += 4) {
    halves[g] = __lsx_vilvl_h(bytes[g + 2], bytes[g]);
    halves[g + 1] = __lsx_vilvh_h(bytes[g + 2], bytes[g]);
    halves[g + 2] = __lsx_vilvl_h(bytes[g + 3], bytes[g + 1]);
    halves[g + 3] = __lsx_vilvh_h(bytes[g + 3], bytes[g + 1]);
  }

  // Round 3: word interleave across the two groups writes the result.
  for (int j = 0; j < 4; ++j) {
    kernel.packet[2 * j] = __lsx_vilvl_w(halves[4 + j], halves[j]);
    kernel.packet[2 * j + 1] = __lsx_vilvh_w(halves[4 + j], halves[j]);
  }
}
|
|
2460
|
+
// In-place rearrangement of 4 packets of unsigned bytes using one byte
// interleave round followed by one halfword interleave round.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
  const __m128i lo01 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
  const __m128i hi01 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
  const __m128i lo23 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
  const __m128i hi23 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);

  kernel.packet[0] = __lsx_vilvl_h(lo23, lo01);
  kernel.packet[1] = __lsx_vilvh_h(lo23, lo01);
  kernel.packet[2] = __lsx_vilvl_h(hi23, hi01);
  kernel.packet[3] = __lsx_vilvh_h(hi23, hi01);
}
|
|
2471
|
+
// In-place transpose of an 8x8 block of unsigned 16-bit values.
// Three interleave rounds: halfword, word, then doubleword.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
  // Round 1: halfword interleave of adjacent row pairs.
  __m128i halves[8];
  for (int i = 0; i < 4; ++i) {
    halves[2 * i] = __lsx_vilvl_h(kernel.packet[2 * i + 1], kernel.packet[2 * i]);
    halves[2 * i + 1] = __lsx_vilvh_h(kernel.packet[2 * i + 1], kernel.packet[2 * i]);
  }

  // Round 2: word interleave inside each group of four vectors.
  __m128i words[8];
  for (int g = 0; g < 8; g += 4) {
    words[g] = __lsx_vilvl_w(halves[g + 2], halves[g]);
    words[g + 1] = __lsx_vilvh_w(halves[g + 2], halves[g]);
    words[g + 2] = __lsx_vilvl_w(halves[g + 3], halves[g + 1]);
    words[g + 3] = __lsx_vilvh_w(halves[g + 3], halves[g + 1]);
  }

  // Round 3: doubleword interleave across the two groups writes the result.
  for (int j = 0; j < 4; ++j) {
    kernel.packet[2 * j] = __lsx_vilvl_d(words[4 + j], words[j]);
    kernel.packet[2 * j + 1] = __lsx_vilvh_d(words[4 + j], words[j]);
  }
}
|
|
2499
|
+
// In-place rearrangement of 4 packets of unsigned 16-bit values using one
// halfword interleave round followed by one word interleave round.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
  const __m128i lo01 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
  const __m128i hi01 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
  const __m128i lo23 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
  const __m128i hi23 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);

  kernel.packet[0] = __lsx_vilvl_w(lo23, lo01);
  kernel.packet[1] = __lsx_vilvh_w(lo23, lo01);
  kernel.packet[2] = __lsx_vilvl_w(hi23, hi01);
  kernel.packet[3] = __lsx_vilvh_w(hi23, hi01);
}
|
|
2510
|
+
// In-place transpose of a 4x4 block of unsigned 32-bit integers.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
  // 32-bit interleave of row pairs (0,1) and (2,3).
  const __m128i lo01 = __lsx_vilvl_w(kernel.packet[1], kernel.packet[0]);
  const __m128i hi01 = __lsx_vilvh_w(kernel.packet[1], kernel.packet[0]);
  const __m128i lo23 = __lsx_vilvl_w(kernel.packet[3], kernel.packet[2]);
  const __m128i hi23 = __lsx_vilvh_w(kernel.packet[3], kernel.packet[2]);

  // 64-bit interleave assembles the output rows.
  kernel.packet[0] = __lsx_vilvl_d(lo23, lo01);
  kernel.packet[1] = __lsx_vilvh_d(lo23, lo01);
  kernel.packet[2] = __lsx_vilvl_d(hi23, hi01);
  kernel.packet[3] = __lsx_vilvh_d(hi23, hi01);
}
|
|
2521
|
+
// In-place transpose of a 2x2 block of unsigned 64-bit integers.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ul, 2>& kernel) {
  // Snapshot both rows before overwriting either of them.
  const __m128i row0 = kernel.packet[0];
  const __m128i row1 = kernel.packet[1];
  kernel.packet[0] = __lsx_vilvl_d(row1, row0);
  kernel.packet[1] = __lsx_vilvh_d(row1, row0);
}
|
|
2526
|
+
|
|
2527
|
+
template <>
|
|
2528
|
+
EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
|
|
2529
|
+
return __lsx_vfrsqrt_s(a);
|
|
2530
|
+
}
|
|
2531
|
+
template <>
|
|
2532
|
+
EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
|
|
2533
|
+
return __lsx_vfrsqrt_d(a);
|
|
2534
|
+
}
|
|
2535
|
+
|
|
2536
|
+
template <>
|
|
2537
|
+
EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) {
|
|
2538
|
+
return __lsx_vfrintrm_s(a);
|
|
2539
|
+
}
|
|
2540
|
+
template <>
|
|
2541
|
+
EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) {
|
|
2542
|
+
return __lsx_vfrintrm_d(a);
|
|
2543
|
+
}
|
|
2544
|
+
|
|
2545
|
+
template <>
|
|
2546
|
+
EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) {
|
|
2547
|
+
return __lsx_vfrintrp_s(a);
|
|
2548
|
+
}
|
|
2549
|
+
template <>
|
|
2550
|
+
EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) {
|
|
2551
|
+
return __lsx_vfrintrp_d(a);
|
|
2552
|
+
}
|
|
2553
|
+
|
|
2554
|
+
template <>
|
|
2555
|
+
EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) {
|
|
2556
|
+
const Packet4f mask = pset1frombits<Packet4f>(static_cast<numext::uint32_t>(0x80000000u));
|
|
2557
|
+
const Packet4f prev0dot5 = pset1frombits<Packet4f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
|
|
2558
|
+
return __lsx_vfrintrz_s(padd(pxor(pand(a, mask), prev0dot5), a));
|
|
2559
|
+
}
|
|
2560
|
+
template <>
|
|
2561
|
+
EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) {
|
|
2562
|
+
const Packet2d mask = pset1frombits<Packet2d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
|
|
2563
|
+
const Packet2d prev0dot5 = pset1frombits<Packet2d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
|
|
2564
|
+
return __lsx_vfrintrz_d(padd(por(pand(a, mask), prev0dot5), a));
|
|
2565
|
+
}
|
|
2566
|
+
|
|
2567
|
+
template <>
|
|
2568
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
|
|
2569
|
+
return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);
|
|
2570
|
+
}
|
|
2571
|
+
template <>
|
|
2572
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b) {
|
|
2573
|
+
return (Packet16c)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);
|
|
2574
|
+
}
|
|
2575
|
+
|
|
2576
|
+
template <>
|
|
2577
|
+
EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from) {
|
|
2578
|
+
int8_t tmp[16] = {*from, *from, *from, *from, *(from + 1), *(from + 1),
|
|
2579
|
+
*(from + 1), *(from + 1), *(from + 2), *(from + 2), *(from + 2), *(from + 2),
|
|
2580
|
+
*(from + 3), *(from + 3), *(from + 3), *(from + 3)};
|
|
2581
|
+
return __lsx_vld(tmp, 0);
|
|
2582
|
+
}
|
|
2583
|
+
template <>
|
|
2584
|
+
EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from) {
|
|
2585
|
+
uint8_t tmp[16] = {*from, *from, *from, *from, *(from + 1), *(from + 1),
|
|
2586
|
+
*(from + 1), *(from + 1), *(from + 2), *(from + 2), *(from + 2), *(from + 2),
|
|
2587
|
+
*(from + 3), *(from + 3), *(from + 3), *(from + 3)};
|
|
2588
|
+
return __lsx_vld(tmp, 0);
|
|
2589
|
+
}
|
|
2590
|
+
template <>
|
|
2591
|
+
EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const int16_t* from) {
|
|
2592
|
+
int16_t tmp[8] = {*from, *from, *from, *from, *(from + 1), *(from + 1), *(from + 1), *(from + 1)};
|
|
2593
|
+
return __lsx_vld(tmp, 0);
|
|
2594
|
+
}
|
|
2595
|
+
template <>
|
|
2596
|
+
EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const uint16_t* from) {
|
|
2597
|
+
uint16_t tmp[8] = {*from, *from, *from, *from, *(from + 1), *(from + 1), *(from + 1), *(from + 1)};
|
|
2598
|
+
return __lsx_vld(tmp, 0);
|
|
2599
|
+
}
|
|
2600
|
+
template <>
|
|
2601
|
+
EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(const int32_t* from) {
|
|
2602
|
+
int32_t tmp[4] = {*from, *from, *from, *from};
|
|
2603
|
+
return __lsx_vld(tmp, 0);
|
|
2604
|
+
}
|
|
2605
|
+
template <>
|
|
2606
|
+
EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) {
|
|
2607
|
+
uint32_t tmp[4] = {*from, *from, *from, *from};
|
|
2608
|
+
return __lsx_vld(tmp, 0);
|
|
2609
|
+
}
|
|
2610
|
+
|
|
2611
|
+
template <>
|
|
2612
|
+
EIGEN_STRONG_INLINE Packet16c pnmsub(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
|
|
2613
|
+
return __lsx_vmsub_b(pnegate(c), a, b);
|
|
2614
|
+
}
|
|
2615
|
+
template <>
|
|
2616
|
+
EIGEN_STRONG_INLINE Packet8s pnmsub(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
|
|
2617
|
+
return __lsx_vmsub_h(pnegate(c), a, b);
|
|
2618
|
+
}
|
|
2619
|
+
template <>
|
|
2620
|
+
EIGEN_STRONG_INLINE Packet4i pnmsub(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
|
|
2621
|
+
return __lsx_vmsub_w(pnegate(c), a, b);
|
|
2622
|
+
}
|
|
2623
|
+
template <>
|
|
2624
|
+
EIGEN_STRONG_INLINE Packet2l pnmsub(const Packet2l& a, const Packet2l& b, const Packet2l& c) {
|
|
2625
|
+
return __lsx_vmsub_d(pnegate(c), a, b);
|
|
2626
|
+
}
|
|
2627
|
+
|
|
2628
|
+
template <>
|
|
2629
|
+
EIGEN_STRONG_INLINE Packet16c pmsub(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
|
|
2630
|
+
return __lsx_vmadd_b(pnegate(c), a, b);
|
|
2631
|
+
}
|
|
2632
|
+
template <>
|
|
2633
|
+
EIGEN_STRONG_INLINE Packet8s pmsub(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
|
|
2634
|
+
return __lsx_vmadd_h(pnegate(c), a, b);
|
|
2635
|
+
}
|
|
2636
|
+
template <>
|
|
2637
|
+
EIGEN_STRONG_INLINE Packet4i pmsub(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
|
|
2638
|
+
return __lsx_vmadd_w(pnegate(c), a, b);
|
|
2639
|
+
}
|
|
2640
|
+
template <>
|
|
2641
|
+
EIGEN_STRONG_INLINE Packet2l pmsub(const Packet2l& a, const Packet2l& b, const Packet2l& c) {
|
|
2642
|
+
return __lsx_vmadd_d(pnegate(c), a, b);
|
|
2643
|
+
}
|
|
2644
|
+
|
|
2645
|
+
template <>
|
|
2646
|
+
EIGEN_STRONG_INLINE Packet16c pnmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
|
|
2647
|
+
return __lsx_vmsub_b(c, a, b);
|
|
2648
|
+
}
|
|
2649
|
+
template <>
|
|
2650
|
+
EIGEN_STRONG_INLINE Packet8s pnmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
|
|
2651
|
+
return __lsx_vmsub_h(c, a, b);
|
|
2652
|
+
}
|
|
2653
|
+
template <>
|
|
2654
|
+
EIGEN_STRONG_INLINE Packet4i pnmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
|
|
2655
|
+
return __lsx_vmsub_w(c, a, b);
|
|
2656
|
+
}
|
|
2657
|
+
template <>
|
|
2658
|
+
EIGEN_STRONG_INLINE Packet2l pnmadd(const Packet2l& a, const Packet2l& b, const Packet2l& c) {
|
|
2659
|
+
return __lsx_vmsub_d(c, a, b);
|
|
2660
|
+
}
|
|
2661
|
+
|
|
2662
|
+
template <>
|
|
2663
|
+
EIGEN_STRONG_INLINE Packet4f pexp(const Packet4f& _x) {
|
|
2664
|
+
return pexp_float(_x);
|
|
2665
|
+
}
|
|
2666
|
+
template <>
|
|
2667
|
+
EIGEN_STRONG_INLINE Packet2d pexp(const Packet2d& _x) {
|
|
2668
|
+
return pexp_double(_x);
|
|
2669
|
+
}
|
|
2670
|
+
|
|
2671
|
+
template <>
|
|
2672
|
+
EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
|
|
2673
|
+
return pldexp_generic(a, exponent);
|
|
2674
|
+
}
|
|
2675
|
+
|
|
2676
|
+
template <>
|
|
2677
|
+
EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
|
|
2678
|
+
return pfrexp_generic(a, exponent);
|
|
2679
|
+
}
|
|
2680
|
+
template <>
|
|
2681
|
+
EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
|
|
2682
|
+
return pfrexp_generic(a, exponent);
|
|
2683
|
+
}
|
|
2684
|
+
template <>
|
|
2685
|
+
EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /* a */) {
|
|
2686
|
+
Packet4f v = {0.0f, 0.0f, 0.0f, 0.0f};
|
|
2687
|
+
return v;
|
|
2688
|
+
}
|
|
2689
|
+
template <>
|
|
2690
|
+
EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
2691
|
+
Packet4f v = psub(a, b);
|
|
2692
|
+
return pabs(v);
|
|
2693
|
+
}
|
|
2694
|
+
template <>
|
|
2695
|
+
EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
2696
|
+
return pmin<Packet4f>(a, b);
|
|
2697
|
+
}
|
|
2698
|
+
template <>
|
|
2699
|
+
EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
2700
|
+
return pmax<Packet4f>(a, b);
|
|
2701
|
+
}
|
|
2702
|
+
template <>
|
|
2703
|
+
EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) {
|
|
2704
|
+
return (__m128)__lsx_vldrepl_w(from, 0);
|
|
2705
|
+
}
|
|
2706
|
+
template <>
|
|
2707
|
+
EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
|
|
2708
|
+
return (__m128)__lsx_vsrai_w((__m128i)a, 31);
|
|
2709
|
+
}
|
|
2710
|
+
template <>
|
|
2711
|
+
EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
|
|
2712
|
+
return __lsx_vfrintrne_s(a);
|
|
2713
|
+
}
|
|
2714
|
+
template <>
|
|
2715
|
+
EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
|
|
2716
|
+
return __lsx_vfrintrz_s(a);
|
|
2717
|
+
}
|
|
2718
|
+
template <>
|
|
2719
|
+
EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& a) {
|
|
2720
|
+
return __lsx_vfrecip_s(a);
|
|
2721
|
+
}
|
|
2722
|
+
|
|
2723
|
+
template <>
|
|
2724
|
+
EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /* a */) {
|
|
2725
|
+
Packet2d v = {0.0, 0.0};
|
|
2726
|
+
return v;
|
|
2727
|
+
}
|
|
2728
|
+
template <>
|
|
2729
|
+
EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
2730
|
+
return pmin<Packet2d>(a, b);
|
|
2731
|
+
}
|
|
2732
|
+
template <>
|
|
2733
|
+
EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
2734
|
+
return pmax<Packet2d>(a, b);
|
|
2735
|
+
}
|
|
2736
|
+
template <>
|
|
2737
|
+
EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
|
|
2738
|
+
return (__m128d)(__lsx_vsrai_d((__m128i)a, 63));
|
|
2739
|
+
}
|
|
2740
|
+
template <>
|
|
2741
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
|
|
2742
|
+
return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);
|
|
2743
|
+
}
|
|
2744
|
+
template <>
|
|
2745
|
+
EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
|
|
2746
|
+
return __lsx_vfrintrne_d(a);
|
|
2747
|
+
}
|
|
2748
|
+
template <>
|
|
2749
|
+
EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
|
|
2750
|
+
return __lsx_vfrintrz_d(a);
|
|
2751
|
+
}
|
|
2752
|
+
template <>
|
|
2753
|
+
EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
|
|
2754
|
+
return pldexp_generic(a, exponent);
|
|
2755
|
+
}
|
|
2756
|
+
|
|
2757
|
+
template <>
|
|
2758
|
+
EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b) {
|
|
2759
|
+
Packet16c v = psub(a, b);
|
|
2760
|
+
return pabs(v);
|
|
2761
|
+
}
|
|
2762
|
+
|
|
2763
|
+
template <>
|
|
2764
|
+
EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b) {
|
|
2765
|
+
Packet8s v = psub(a, b);
|
|
2766
|
+
return pabs(v);
|
|
2767
|
+
}
|
|
2768
|
+
template <>
|
|
2769
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b) {
|
|
2770
|
+
return __lsx_vbitsel_v(b, a, mask);
|
|
2771
|
+
}
|
|
2772
|
+
|
|
2773
|
+
template <>
|
|
2774
|
+
EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
2775
|
+
Packet4i v = psub(a, b);
|
|
2776
|
+
return pabs(v);
|
|
2777
|
+
}
|
|
2778
|
+
template <>
|
|
2779
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
|
|
2780
|
+
return __lsx_vbitsel_v(b, a, mask);
|
|
2781
|
+
}
|
|
2782
|
+
|
|
2783
|
+
template <>
|
|
2784
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) {
|
|
2785
|
+
return __lsx_vbitsel_v(b, a, mask);
|
|
2786
|
+
}
|
|
2787
|
+
|
|
2788
|
+
template <>
|
|
2789
|
+
EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
|
|
2790
|
+
return __lsx_vdiv_bu(a, b);
|
|
2791
|
+
}
|
|
2792
|
+
template <>
|
|
2793
|
+
EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
|
|
2794
|
+
Packet16uc v = psub(a, b);
|
|
2795
|
+
return pabs(v);
|
|
2796
|
+
}
|
|
2797
|
+
template <>
|
|
2798
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a,
|
|
2799
|
+
const Packet16uc& b) {
|
|
2800
|
+
return __lsx_vbitsel_v(b, a, mask);
|
|
2801
|
+
}
|
|
2802
|
+
template <>
|
|
2803
|
+
EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {
|
|
2804
|
+
__m128i res = {0, 0};
|
|
2805
|
+
__m128i add = {0x0808080808080808, 0x0808080808080808};
|
|
2806
|
+
for (int i = 0; i < 4; i++) {
|
|
2807
|
+
const __m128i temp = __lsx_vor_v(res, add);
|
|
2808
|
+
const __m128i tmul = __lsx_vpackev_b(__lsx_vmulwod_h_bu(temp, temp), __lsx_vmulwev_h_bu(temp, temp));
|
|
2809
|
+
res = __lsx_vbitsel_v(res, temp, __lsx_vsle_bu(tmul, a));
|
|
2810
|
+
add = __lsx_vsrli_b(add, 1);
|
|
2811
|
+
}
|
|
2812
|
+
return res;
|
|
2813
|
+
}
|
|
2814
|
+
|
|
2815
|
+
template <>
|
|
2816
|
+
EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b) {
|
|
2817
|
+
Packet8us v = psub(a, b);
|
|
2818
|
+
return pabs(v);
|
|
2819
|
+
}
|
|
2820
|
+
template <>
|
|
2821
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b) {
|
|
2822
|
+
return __lsx_vbitsel_v(b, a, mask);
|
|
2823
|
+
}
|
|
2824
|
+
template <>
|
|
2825
|
+
EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {
|
|
2826
|
+
__m128i res = {0, 0};
|
|
2827
|
+
__m128i add = {0x0080008000800080, 0x0080008000800080};
|
|
2828
|
+
for (int i = 0; i < 4; i++) {
|
|
2829
|
+
const __m128i temp = __lsx_vor_v(res, add);
|
|
2830
|
+
const __m128i tmul = __lsx_vpackev_h(__lsx_vmulwod_w_hu(temp, temp), __lsx_vmulwev_w_hu(temp, temp));
|
|
2831
|
+
res = __lsx_vbitsel_v(res, temp, __lsx_vsle_hu(tmul, a));
|
|
2832
|
+
add = __lsx_vsrli_h(add, 1);
|
|
2833
|
+
}
|
|
2834
|
+
return res;
|
|
2835
|
+
}
|
|
2836
|
+
|
|
2837
|
+
template <>
|
|
2838
|
+
EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
|
|
2839
|
+
Packet4ui v = psub(a, b);
|
|
2840
|
+
return pabs(v);
|
|
2841
|
+
}
|
|
2842
|
+
template <>
|
|
2843
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
|
|
2844
|
+
return __lsx_vbitsel_v(b, a, mask);
|
|
2845
|
+
}
|
|
2846
|
+
template <>
|
|
2847
|
+
EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
|
|
2848
|
+
__m128i res = {0, 0};
|
|
2849
|
+
__m128i add = {0x0000800000008000, 0x0000800000008000};
|
|
2850
|
+
for (int i = 0; i < 4; i++) {
|
|
2851
|
+
const __m128i temp = __lsx_vor_v(res, add);
|
|
2852
|
+
const __m128i tmul = __lsx_vpackev_w(__lsx_vmulwod_d_wu(temp, temp), __lsx_vmulwev_d_wu(temp, temp));
|
|
2853
|
+
res = __lsx_vbitsel_v(res, temp, __lsx_vsle_wu(tmul, a));
|
|
2854
|
+
add = __lsx_vsrli_w(add, 1);
|
|
2855
|
+
}
|
|
2856
|
+
return res;
|
|
2857
|
+
}
|
|
2858
|
+
|
|
2859
|
+
template <>
|
|
2860
|
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b) {
|
|
2861
|
+
return __lsx_vbitsel_v(b, a, mask);
|
|
2862
|
+
}
|
|
2863
|
+
|
|
2864
|
+
} // namespace internal
|
|
2865
|
+
} // namespace Eigen
|
|
2866
|
+
#endif
|