@smake/eigen 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/eigen/Eigen/AccelerateSupport +52 -0
- package/eigen/Eigen/Cholesky +18 -20
- package/eigen/Eigen/CholmodSupport +28 -28
- package/eigen/Eigen/Core +187 -120
- package/eigen/Eigen/Eigenvalues +16 -13
- package/eigen/Eigen/Geometry +18 -18
- package/eigen/Eigen/Householder +9 -7
- package/eigen/Eigen/IterativeLinearSolvers +8 -4
- package/eigen/Eigen/Jacobi +14 -13
- package/eigen/Eigen/KLUSupport +23 -21
- package/eigen/Eigen/LU +15 -16
- package/eigen/Eigen/MetisSupport +12 -12
- package/eigen/Eigen/OrderingMethods +54 -51
- package/eigen/Eigen/PaStiXSupport +23 -21
- package/eigen/Eigen/PardisoSupport +17 -14
- package/eigen/Eigen/QR +18 -20
- package/eigen/Eigen/QtAlignedMalloc +5 -12
- package/eigen/Eigen/SPQRSupport +21 -14
- package/eigen/Eigen/SVD +23 -17
- package/eigen/Eigen/Sparse +1 -2
- package/eigen/Eigen/SparseCholesky +18 -15
- package/eigen/Eigen/SparseCore +18 -17
- package/eigen/Eigen/SparseLU +9 -9
- package/eigen/Eigen/SparseQR +16 -14
- package/eigen/Eigen/StdDeque +5 -2
- package/eigen/Eigen/StdList +5 -2
- package/eigen/Eigen/StdVector +5 -2
- package/eigen/Eigen/SuperLUSupport +30 -24
- package/eigen/Eigen/ThreadPool +80 -0
- package/eigen/Eigen/UmfPackSupport +19 -17
- package/eigen/Eigen/Version +14 -0
- package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
- package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/LDLT.h +366 -405
- package/eigen/Eigen/src/Cholesky/LLT.h +323 -367
- package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
- package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +585 -529
- package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/ArithmeticSequence.h +143 -317
- package/eigen/Eigen/src/Core/Array.h +329 -370
- package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
- package/eigen/Eigen/src/Core/ArrayWrapper.h +126 -170
- package/eigen/Eigen/src/Core/Assign.h +30 -40
- package/eigen/Eigen/src/Core/AssignEvaluator.h +651 -604
- package/eigen/Eigen/src/Core/Assign_MKL.h +125 -120
- package/eigen/Eigen/src/Core/BandMatrix.h +267 -282
- package/eigen/Eigen/src/Core/Block.h +371 -390
- package/eigen/Eigen/src/Core/CommaInitializer.h +85 -100
- package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
- package/eigen/Eigen/src/Core/CoreEvaluators.h +1214 -937
- package/eigen/Eigen/src/Core/CoreIterators.h +72 -63
- package/eigen/Eigen/src/Core/CwiseBinaryOp.h +112 -129
- package/eigen/Eigen/src/Core/CwiseNullaryOp.h +676 -702
- package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
- package/eigen/Eigen/src/Core/CwiseUnaryOp.h +55 -67
- package/eigen/Eigen/src/Core/CwiseUnaryView.h +127 -92
- package/eigen/Eigen/src/Core/DenseBase.h +630 -658
- package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -628
- package/eigen/Eigen/src/Core/DenseStorage.h +511 -590
- package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
- package/eigen/Eigen/src/Core/Diagonal.h +168 -207
- package/eigen/Eigen/src/Core/DiagonalMatrix.h +346 -317
- package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
- package/eigen/Eigen/src/Core/Dot.h +167 -217
- package/eigen/Eigen/src/Core/EigenBase.h +74 -85
- package/eigen/Eigen/src/Core/Fill.h +138 -0
- package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
- package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -113
- package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
- package/eigen/Eigen/src/Core/GeneralProduct.h +315 -261
- package/eigen/Eigen/src/Core/GenericPacketMath.h +1182 -520
- package/eigen/Eigen/src/Core/GlobalFunctions.h +193 -157
- package/eigen/Eigen/src/Core/IO.h +131 -156
- package/eigen/Eigen/src/Core/IndexedView.h +209 -125
- package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
- package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/Inverse.h +50 -59
- package/eigen/Eigen/src/Core/Map.h +123 -141
- package/eigen/Eigen/src/Core/MapBase.h +255 -282
- package/eigen/Eigen/src/Core/MathFunctions.h +1247 -1201
- package/eigen/Eigen/src/Core/MathFunctionsImpl.h +162 -99
- package/eigen/Eigen/src/Core/Matrix.h +463 -494
- package/eigen/Eigen/src/Core/MatrixBase.h +468 -470
- package/eigen/Eigen/src/Core/NestByValue.h +58 -52
- package/eigen/Eigen/src/Core/NoAlias.h +79 -86
- package/eigen/Eigen/src/Core/NumTraits.h +206 -206
- package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +163 -142
- package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
- package/eigen/Eigen/src/Core/PlainObjectBase.h +858 -972
- package/eigen/Eigen/src/Core/Product.h +246 -130
- package/eigen/Eigen/src/Core/ProductEvaluators.h +779 -671
- package/eigen/Eigen/src/Core/Random.h +153 -164
- package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
- package/eigen/Eigen/src/Core/RealView.h +250 -0
- package/eigen/Eigen/src/Core/Redux.h +334 -314
- package/eigen/Eigen/src/Core/Ref.h +259 -257
- package/eigen/Eigen/src/Core/Replicate.h +92 -104
- package/eigen/Eigen/src/Core/Reshaped.h +215 -271
- package/eigen/Eigen/src/Core/ReturnByValue.h +47 -55
- package/eigen/Eigen/src/Core/Reverse.h +133 -148
- package/eigen/Eigen/src/Core/Select.h +68 -140
- package/eigen/Eigen/src/Core/SelfAdjointView.h +254 -290
- package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
- package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
- package/eigen/Eigen/src/Core/Solve.h +88 -102
- package/eigen/Eigen/src/Core/SolveTriangular.h +126 -124
- package/eigen/Eigen/src/Core/SolverBase.h +132 -133
- package/eigen/Eigen/src/Core/StableNorm.h +113 -147
- package/eigen/Eigen/src/Core/StlIterators.h +404 -248
- package/eigen/Eigen/src/Core/Stride.h +90 -92
- package/eigen/Eigen/src/Core/Swap.h +70 -39
- package/eigen/Eigen/src/Core/Transpose.h +258 -295
- package/eigen/Eigen/src/Core/Transpositions.h +270 -333
- package/eigen/Eigen/src/Core/TriangularMatrix.h +642 -743
- package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
- package/eigen/Eigen/src/Core/VectorwiseOp.h +653 -704
- package/eigen/Eigen/src/Core/Visitor.h +464 -308
- package/eigen/Eigen/src/Core/arch/AVX/Complex.h +380 -187
- package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +65 -163
- package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2145 -638
- package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
- package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +253 -60
- package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +278 -228
- package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +48 -269
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1597 -754
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
- package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +229 -41
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +420 -184
- package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +40 -49
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2962 -2213
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +196 -212
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +713 -441
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2380 -1362
- package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
- package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +390 -224
- package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +78 -67
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1784 -799
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +167 -50
- package/eigen/Eigen/src/Core/arch/Default/Half.h +528 -379
- package/eigen/Eigen/src/Core/arch/Default/Settings.h +10 -12
- package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
- package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +41 -40
- package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +550 -523
- package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
- package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +27 -30
- package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +8 -8
- package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
- package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
- package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
- package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
- package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
- package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
- package/eigen/Eigen/src/Core/arch/MSA/Complex.h +54 -82
- package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +84 -92
- package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +51 -47
- package/eigen/Eigen/src/Core/arch/NEON/Complex.h +454 -306
- package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +175 -115
- package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +23 -30
- package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4366 -2857
- package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +616 -393
- package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
- package/eigen/Eigen/src/Core/arch/SSE/Complex.h +350 -198
- package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +38 -149
- package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +1791 -912
- package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
- package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +128 -40
- package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +10 -6
- package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +156 -234
- package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +6 -3
- package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +27 -32
- package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +119 -117
- package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +325 -419
- package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +15 -17
- package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +325 -181
- package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +94 -83
- package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +811 -458
- package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +121 -124
- package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +576 -370
- package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +194 -109
- package/eigen/Eigen/src/Core/functors/StlFunctors.h +95 -112
- package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
- package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1038 -749
- package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1883 -1375
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +312 -370
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +189 -176
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +84 -81
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +292 -337
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
- package/eigen/Eigen/src/Core/products/Parallelizer.h +207 -105
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +327 -388
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +138 -147
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
- package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
- package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -47
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -277
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
- package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +68 -94
- package/eigen/Eigen/src/Core/util/Assert.h +158 -0
- package/eigen/Eigen/src/Core/util/BlasUtil.h +342 -303
- package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +348 -317
- package/eigen/Eigen/src/Core/util/Constants.h +297 -262
- package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -90
- package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
- package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +449 -247
- package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
- package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
- package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +417 -116
- package/eigen/Eigen/src/Core/util/IntegralConstant.h +211 -204
- package/eigen/Eigen/src/Core/util/MKL_support.h +39 -37
- package/eigen/Eigen/src/Core/util/Macros.h +655 -773
- package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
- package/eigen/Eigen/src/Core/util/Memory.h +970 -748
- package/eigen/Eigen/src/Core/util/Meta.h +581 -633
- package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
- package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
- package/eigen/Eigen/src/Core/util/ReshapedHelper.h +17 -17
- package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
- package/eigen/Eigen/src/Core/util/StaticAssert.h +50 -166
- package/eigen/Eigen/src/Core/util/SymbolicIndex.h +377 -225
- package/eigen/Eigen/src/Core/util/XprHelper.h +784 -547
- package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
- package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
- package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
- package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
- package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
- package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +89 -105
- package/eigen/Eigen/src/Eigenvalues/RealQZ.h +537 -607
- package/eigen/Eigen/src/Eigenvalues/RealSchur.h +342 -381
- package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +541 -595
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
- package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +430 -462
- package/eigen/Eigen/src/Geometry/AlignedBox.h +226 -227
- package/eigen/Eigen/src/Geometry/AngleAxis.h +131 -133
- package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
- package/eigen/Eigen/src/Geometry/Homogeneous.h +285 -333
- package/eigen/Eigen/src/Geometry/Hyperplane.h +151 -160
- package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -146
- package/eigen/Eigen/src/Geometry/ParametrizedLine.h +127 -127
- package/eigen/Eigen/src/Geometry/Quaternion.h +566 -506
- package/eigen/Eigen/src/Geometry/Rotation2D.h +107 -105
- package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
- package/eigen/Eigen/src/Geometry/Scaling.h +113 -106
- package/eigen/Eigen/src/Geometry/Transform.h +858 -936
- package/eigen/Eigen/src/Geometry/Translation.h +94 -92
- package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
- package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +90 -104
- package/eigen/Eigen/src/Householder/BlockHouseholder.h +51 -46
- package/eigen/Eigen/src/Householder/Householder.h +102 -124
- package/eigen/Eigen/src/Householder/HouseholderSequence.h +412 -453
- package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +149 -162
- package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +124 -119
- package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +92 -104
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +251 -243
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +224 -228
- package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +178 -227
- package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +79 -84
- package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +54 -60
- package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Jacobi/Jacobi.h +252 -308
- package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/KLUSupport/KLUSupport.h +208 -227
- package/eigen/Eigen/src/LU/Determinant.h +50 -69
- package/eigen/Eigen/src/LU/FullPivLU.h +545 -596
- package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/LU/InverseImpl.h +206 -285
- package/eigen/Eigen/src/LU/PartialPivLU.h +390 -428
- package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
- package/eigen/Eigen/src/LU/arch/InverseSize4.h +72 -70
- package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
- package/eigen/Eigen/src/OrderingMethods/Amd.h +243 -265
- package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +831 -1004
- package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/OrderingMethods/Ordering.h +112 -119
- package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
- package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -430
- package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +479 -479
- package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
- package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +166 -153
- package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +495 -475
- package/eigen/Eigen/src/QR/HouseholderQR.h +394 -285
- package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
- package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +244 -264
- package/eigen/Eigen/src/SVD/BDCSVD.h +817 -713
- package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
- package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SVD/JacobiSVD.h +577 -543
- package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
- package/eigen/Eigen/src/SVD/SVDBase.h +242 -182
- package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +200 -235
- package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +765 -594
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +308 -94
- package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
- package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -252
- package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +134 -178
- package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCore/SparseAssign.h +149 -140
- package/eigen/Eigen/src/SparseCore/SparseBlock.h +403 -440
- package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
- package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +525 -303
- package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +555 -339
- package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
- package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +169 -197
- package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
- package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
- package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
- package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
- package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1603 -1245
- package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -350
- package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
- package/eigen/Eigen/src/SparseCore/SparseProduct.h +94 -97
- package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
- package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
- package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +370 -416
- package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
- package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
- package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
- package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
- package/eigen/Eigen/src/SparseCore/SparseUtil.h +138 -115
- package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
- package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
- package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
- package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseLU/SparseLU.h +756 -710
- package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
- package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
- package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
- package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +245 -301
- package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
- package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
- package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +89 -100
- package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
- package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
- package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +124 -132
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
- package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
- package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
- package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
- package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseQR/SparseQR.h +450 -502
- package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -93
- package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
- package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
- package/eigen/Eigen/src/StlSupport/details.h +48 -50
- package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -730
- package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
- package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
- package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
- package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
- package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
- package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
- package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
- package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
- package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
- package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
- package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
- package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
- package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +428 -464
- package/eigen/Eigen/src/misc/Image.h +41 -43
- package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/misc/Kernel.h +39 -41
- package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
- package/eigen/Eigen/src/misc/blas.h +83 -426
- package/eigen/Eigen/src/misc/lapacke.h +9972 -16179
- package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
- package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
- package/eigen/Eigen/src/plugins/{BlockMethods.h → BlockMethods.inc} +434 -506
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
- package/eigen/Eigen/src/plugins/{CommonCwiseUnaryOps.h → CommonCwiseUnaryOps.inc} +58 -68
- package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
- package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
- package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
- package/package.json +1 -1
- package/eigen/COPYING.APACHE +0 -203
- package/eigen/COPYING.BSD +0 -26
- package/eigen/COPYING.GPL +0 -674
- package/eigen/COPYING.LGPL +0 -502
- package/eigen/COPYING.MINPACK +0 -51
- package/eigen/COPYING.MPL2 +0 -373
- package/eigen/COPYING.README +0 -18
- package/eigen/Eigen/src/Core/BooleanRedux.h +0 -162
- package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -258
- package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +0 -120
- package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +0 -694
- package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
- package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
- package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
- package/eigen/Eigen/src/misc/lapack.h +0 -152
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -358
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -696
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
- package/eigen/Eigen/src/plugins/IndexedViewMethods.h +0 -262
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -95
- package/eigen/Eigen/src/plugins/ReshapedMethods.h +0 -149
- package/eigen/README.md +0 -5
|
@@ -0,0 +1,1088 @@
|
|
|
1
|
+
|
|
2
|
+
#ifndef EIGEN_HVX_PACKET_MATH_H
|
|
3
|
+
#define EIGEN_HVX_PACKET_MATH_H
|
|
4
|
+
|
|
5
|
+
// Only 128-byte HVX is supported for now.
|
|
6
|
+
// Floating-point operations are supported only since V68.
|
|
7
|
+
#if defined __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68
|
|
8
|
+
|
|
9
|
+
// None of the floating-point operations conform to the IEEE standard.
|
|
10
|
+
// From HVX document:
|
|
11
|
+
// There is no concept of infinity or NaN. QFloat saturates to maximum
|
|
12
|
+
// exponent with maximum positive or minimum negative significand.
|
|
13
|
+
|
|
14
|
+
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
|
|
15
|
+
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
|
|
16
|
+
#endif
|
|
17
|
+
|
|
18
|
+
namespace Eigen {
|
|
19
|
+
namespace internal {
|
|
20
|
+
|
|
21
|
+
// HVX utilities.
|
|
22
|
+
|
|
23
|
+
template <int D>
|
|
24
|
+
EIGEN_STRONG_INLINE HVX_Vector HVX_vmem(const void* m) {
|
|
25
|
+
HVX_Vector v;
|
|
26
|
+
#if EIGEN_COMP_CLANG
|
|
27
|
+
// Use inlined assembly for aligned vmem load on unaligned memory.
|
|
28
|
+
// Use type cast to HVX_Vector* may mess up with compiler data alignment.
|
|
29
|
+
__asm__("%0 = vmem(%1+#%2)" : "=v"(v) : "r"(m), "i"(D) : "memory");
|
|
30
|
+
#else
|
|
31
|
+
void* aligned_mem =
|
|
32
|
+
reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(m) & ~(__HVX_LENGTH__ - 1)) + D * __HVX_LENGTH__);
|
|
33
|
+
memcpy(&v, aligned_mem, __HVX_LENGTH__);
|
|
34
|
+
#endif
|
|
35
|
+
return v;
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
template <typename T>
|
|
39
|
+
EIGEN_STRONG_INLINE HVX_Vector HVX_load(const T* mem) {
|
|
40
|
+
HVX_Vector v;
|
|
41
|
+
memcpy(&v, reinterpret_cast<const HVX_Vector*>(mem), __HVX_LENGTH__);
|
|
42
|
+
return v;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
template <typename T>
|
|
46
|
+
EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const T* mem) {
|
|
47
|
+
HVX_Vector v;
|
|
48
|
+
memcpy(&v, mem, __HVX_LENGTH__);
|
|
49
|
+
return v;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
template <size_t Size, size_t Alignment, typename T>
|
|
53
|
+
EIGEN_STRONG_INLINE HVX_Vector HVX_load_partial(const T* mem) {
|
|
54
|
+
#if defined(EIGEN_HVX_FAST_PARTIAL_VECTOR_LOAD)
|
|
55
|
+
// Fast partial vector load through aligned vmem load.
|
|
56
|
+
// The load may past end of array but is aligned to prevent memory fault.
|
|
57
|
+
HVX_Vector v0 = HVX_vmem<0>(mem);
|
|
58
|
+
HVX_Vector v1 = v0;
|
|
59
|
+
uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem);
|
|
60
|
+
EIGEN_IF_CONSTEXPR(Size * sizeof(T) <= Alignment) {
|
|
61
|
+
// Data size less than alignment will never cross multiple aligned vectors.
|
|
62
|
+
v1 = v0;
|
|
63
|
+
}
|
|
64
|
+
else {
|
|
65
|
+
uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);
|
|
66
|
+
if (left_off + Size * sizeof(T) > __HVX_LENGTH__) {
|
|
67
|
+
v1 = HVX_vmem<1>(mem);
|
|
68
|
+
} else {
|
|
69
|
+
v1 = v0;
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
return Q6_V_valign_VVR(v1, v0, mem_addr);
|
|
73
|
+
#else
|
|
74
|
+
HVX_Vector v;
|
|
75
|
+
memcpy(&v, mem, Size * sizeof(T));
|
|
76
|
+
return v;
|
|
77
|
+
#endif
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
template <typename T>
|
|
81
|
+
EIGEN_STRONG_INLINE void HVX_store(T* mem, HVX_Vector v) {
|
|
82
|
+
memcpy(reinterpret_cast<HVX_Vector*>(mem), &v, __HVX_LENGTH__);
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
template <typename T>
|
|
86
|
+
EIGEN_STRONG_INLINE void HVX_storeu(T* mem, HVX_Vector v) {
|
|
87
|
+
memcpy(mem, &v, __HVX_LENGTH__);
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
template <size_t Size, size_t Alignment, typename T>
|
|
91
|
+
EIGEN_STRONG_INLINE void HVX_store_partial(T* mem, HVX_Vector v) {
|
|
92
|
+
uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem);
|
|
93
|
+
HVX_Vector value = Q6_V_vlalign_VVR(v, v, mem_addr);
|
|
94
|
+
uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);
|
|
95
|
+
uintptr_t right_off = left_off + Size * sizeof(T);
|
|
96
|
+
|
|
97
|
+
HVX_VectorPred ql_not = Q6_Q_vsetq_R(mem_addr);
|
|
98
|
+
HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off);
|
|
99
|
+
|
|
100
|
+
EIGEN_IF_CONSTEXPR(Size * sizeof(T) > Alignment) {
|
|
101
|
+
if (right_off > __HVX_LENGTH__) {
|
|
102
|
+
Q6_vmem_QRIV(qr, mem + __HVX_LENGTH__ / sizeof(T), value);
|
|
103
|
+
qr = Q6_Q_vcmp_eq_VbVb(value, value);
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
ql_not = Q6_Q_or_QQn(ql_not, qr);
|
|
108
|
+
Q6_vmem_QnRIV(ql_not, mem, value);
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
// Packet definitions.
|
|
112
|
+
// Size tags used as the template argument of HVXPacket, so that each tag
// yields a distinct packet type.
enum class HVXPacketSize {
  Full = 0,
  Half = 1,
  Quarter = 2,
};
|
|
117
|
+
|
|
118
|
+
// Hexagon compiler uses same HVX_Vector to represent all HVX vector types.
|
|
119
|
+
// Wrap different vector type (float32, int32, etc) to different class with
|
|
120
|
+
// explicit constructor and casting back-and-forth to HVX_Vector.
|
|
121
|
+
template <HVXPacketSize T>
|
|
122
|
+
class HVXPacket {
|
|
123
|
+
public:
|
|
124
|
+
HVXPacket() = default;
|
|
125
|
+
static HVXPacket Create(HVX_Vector v) { return HVXPacket(v); }
|
|
126
|
+
HVX_Vector Get() const { return m_val; }
|
|
127
|
+
|
|
128
|
+
private:
|
|
129
|
+
explicit HVXPacket(HVX_Vector v) : m_val(v) {}
|
|
130
|
+
HVX_Vector m_val = Q6_V_vzero();
|
|
131
|
+
};
|
|
132
|
+
|
|
133
|
+
// Float packet aliases, one per HVXPacketSize tag.
using Packet32f = HVXPacket<HVXPacketSize::Full>;
using Packet16f = HVXPacket<HVXPacketSize::Half>;
using Packet8f = HVXPacket<HVXPacketSize::Quarter>;
|
|
136
|
+
|
|
137
|
+
// Packet traits.
|
|
138
|
+
template <>
|
|
139
|
+
struct packet_traits<float> : default_packet_traits {
|
|
140
|
+
typedef Packet32f type;
|
|
141
|
+
typedef Packet16f half;
|
|
142
|
+
enum {
|
|
143
|
+
Vectorizable = 1,
|
|
144
|
+
AlignedOnScalar = 1,
|
|
145
|
+
size = 32,
|
|
146
|
+
|
|
147
|
+
HasCmp = 1,
|
|
148
|
+
HasAdd = 1,
|
|
149
|
+
HasSub = 1,
|
|
150
|
+
HasShift = 0,
|
|
151
|
+
HasMul = 1,
|
|
152
|
+
HasNegate = 1,
|
|
153
|
+
HasAbs = 1,
|
|
154
|
+
HasArg = 0,
|
|
155
|
+
HasAbs2 = 0,
|
|
156
|
+
HasAbsDiff = 0,
|
|
157
|
+
HasMin = 1,
|
|
158
|
+
HasMax = 1,
|
|
159
|
+
HasConj = 0,
|
|
160
|
+
HasSetLinear = 0,
|
|
161
|
+
HasBlend = 0,
|
|
162
|
+
|
|
163
|
+
HasDiv = 0,
|
|
164
|
+
|
|
165
|
+
HasSin = 0,
|
|
166
|
+
HasCos = 0,
|
|
167
|
+
HasACos = 0,
|
|
168
|
+
HasASin = 0,
|
|
169
|
+
HasATan = 0,
|
|
170
|
+
HasATanh = 0,
|
|
171
|
+
HasLog = 0,
|
|
172
|
+
HasExp = 0,
|
|
173
|
+
HasSqrt = 0,
|
|
174
|
+
HasRsqrt = 0,
|
|
175
|
+
HasTanh = 0,
|
|
176
|
+
HasErf = 0,
|
|
177
|
+
HasBessel = 0,
|
|
178
|
+
HasNdtri = 0
|
|
179
|
+
};
|
|
180
|
+
};
|
|
181
|
+
|
|
182
|
+
template <>
|
|
183
|
+
struct unpacket_traits<Packet32f> {
|
|
184
|
+
typedef float type;
|
|
185
|
+
typedef Packet16f half;
|
|
186
|
+
enum {
|
|
187
|
+
size = 32,
|
|
188
|
+
alignment = Aligned128,
|
|
189
|
+
vectorizable = true,
|
|
190
|
+
masked_load_available = false,
|
|
191
|
+
masked_store_available = false
|
|
192
|
+
};
|
|
193
|
+
};
|
|
194
|
+
|
|
195
|
+
template <>
|
|
196
|
+
struct unpacket_traits<Packet16f> {
|
|
197
|
+
typedef float type;
|
|
198
|
+
typedef Packet8f half;
|
|
199
|
+
enum {
|
|
200
|
+
size = 16,
|
|
201
|
+
// Many code assume alignment on packet size instead of following trait
|
|
202
|
+
// So we do not use Aligned128 to optimize aligned load/store,
|
|
203
|
+
alignment = Aligned64,
|
|
204
|
+
vectorizable = true,
|
|
205
|
+
masked_load_available = false,
|
|
206
|
+
masked_store_available = false
|
|
207
|
+
};
|
|
208
|
+
};
|
|
209
|
+
|
|
210
|
+
template <>
|
|
211
|
+
struct unpacket_traits<Packet8f> {
|
|
212
|
+
typedef float type;
|
|
213
|
+
typedef Packet8f half;
|
|
214
|
+
enum {
|
|
215
|
+
size = 8,
|
|
216
|
+
// Many code assume alignment on packet size instead of following trait
|
|
217
|
+
// So we do not use Aligned128 to optimize aligned load/store,
|
|
218
|
+
alignment = Aligned32,
|
|
219
|
+
vectorizable = true,
|
|
220
|
+
masked_load_available = false,
|
|
221
|
+
masked_store_available = false
|
|
222
|
+
};
|
|
223
|
+
};
|
|
224
|
+
|
|
225
|
+
// float32 operations.
|
|
226
|
+
template <HVXPacketSize T>
|
|
227
|
+
EIGEN_STRONG_INLINE HVXPacket<T> pzero_hvx(const HVXPacket<T>&) {
|
|
228
|
+
return HVXPacket<T>::Create(Q6_V_vzero());
|
|
229
|
+
}
|
|
230
|
+
template <>
|
|
231
|
+
EIGEN_STRONG_INLINE Packet32f pzero<Packet32f>(const Packet32f&) {
|
|
232
|
+
return pzero_hvx(Packet32f());
|
|
233
|
+
}
|
|
234
|
+
template <>
|
|
235
|
+
EIGEN_STRONG_INLINE Packet16f pzero<Packet16f>(const Packet16f&) {
|
|
236
|
+
return pzero_hvx(Packet16f());
|
|
237
|
+
}
|
|
238
|
+
template <>
|
|
239
|
+
EIGEN_STRONG_INLINE Packet8f pzero<Packet8f>(const Packet8f&) {
|
|
240
|
+
return pzero_hvx(Packet8f());
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
template <HVXPacketSize T>
|
|
244
|
+
EIGEN_STRONG_INLINE typename unpacket_traits<HVXPacket<T>>::half predux_half_dowto4_hvx(const HVXPacket<T>& a) {
|
|
245
|
+
const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
|
|
246
|
+
return unpacket_traits<HVXPacket<T>>::half::Create(
|
|
247
|
+
Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_vror_VR(a.Get(), sizeof(float) * packet_size / 2), a.Get())));
|
|
248
|
+
}
|
|
249
|
+
template <>
|
|
250
|
+
EIGEN_STRONG_INLINE Packet16f predux_half_dowto4(const Packet32f& a) {
|
|
251
|
+
return predux_half_dowto4_hvx(a);
|
|
252
|
+
}
|
|
253
|
+
template <>
|
|
254
|
+
EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(const Packet16f& a) {
|
|
255
|
+
return predux_half_dowto4_hvx(a);
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
template <HVXPacketSize T>
|
|
259
|
+
EIGEN_STRONG_INLINE HVXPacket<T> pset1_hvx(const float& from) {
|
|
260
|
+
union {
|
|
261
|
+
float f;
|
|
262
|
+
int32_t i;
|
|
263
|
+
} u;
|
|
264
|
+
u.f = from;
|
|
265
|
+
return HVXPacket<T>::Create(Q6_V_vsplat_R(u.i));
|
|
266
|
+
}
|
|
267
|
+
template <>
|
|
268
|
+
EIGEN_STRONG_INLINE Packet32f pset1<Packet32f>(const float& from) {
|
|
269
|
+
return pset1_hvx<HVXPacketSize::Full>(from);
|
|
270
|
+
}
|
|
271
|
+
template <>
|
|
272
|
+
EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) {
|
|
273
|
+
return pset1_hvx<HVXPacketSize::Half>(from);
|
|
274
|
+
}
|
|
275
|
+
template <>
|
|
276
|
+
EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) {
|
|
277
|
+
return pset1_hvx<HVXPacketSize::Quarter>(from);
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
template <>
|
|
281
|
+
EIGEN_STRONG_INLINE Packet32f pload<Packet32f>(const float* from) {
|
|
282
|
+
return Packet32f::Create(HVX_load(from));
|
|
283
|
+
}
|
|
284
|
+
template <>
|
|
285
|
+
EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
|
|
286
|
+
return Packet16f::Create(
|
|
287
|
+
HVX_load_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(from));
|
|
288
|
+
}
|
|
289
|
+
template <>
|
|
290
|
+
EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) {
|
|
291
|
+
return Packet8f::Create(
|
|
292
|
+
HVX_load_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(from));
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
template <>
|
|
296
|
+
EIGEN_STRONG_INLINE Packet32f ploadu<Packet32f>(const float* from) {
|
|
297
|
+
return Packet32f::Create(HVX_loadu(from));
|
|
298
|
+
}
|
|
299
|
+
template <>
|
|
300
|
+
EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from) {
|
|
301
|
+
return Packet16f::Create(HVX_load_partial<unpacket_traits<Packet16f>::size, 0>(from));
|
|
302
|
+
}
|
|
303
|
+
template <>
|
|
304
|
+
EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) {
|
|
305
|
+
return Packet8f::Create(HVX_load_partial<unpacket_traits<Packet8f>::size, 0>(from));
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
template <>
|
|
309
|
+
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet32f& from) {
|
|
310
|
+
HVX_store(to, from.Get());
|
|
311
|
+
}
|
|
312
|
+
template <>
|
|
313
|
+
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
|
|
314
|
+
HVX_store_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(to, from.Get());
|
|
315
|
+
}
|
|
316
|
+
template <>
|
|
317
|
+
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) {
|
|
318
|
+
HVX_store_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(to, from.Get());
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
template <>
|
|
322
|
+
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet32f& from) {
|
|
323
|
+
HVX_storeu(to, from.Get());
|
|
324
|
+
}
|
|
325
|
+
template <>
|
|
326
|
+
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) {
|
|
327
|
+
HVX_store_partial<unpacket_traits<Packet16f>::size, 0>(to, from.Get());
|
|
328
|
+
}
|
|
329
|
+
template <>
|
|
330
|
+
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) {
|
|
331
|
+
HVX_store_partial<unpacket_traits<Packet8f>::size, 0>(to, from.Get());
|
|
332
|
+
}
|
|
333
|
+
|
|
334
|
+
template <HVXPacketSize T>
|
|
335
|
+
EIGEN_STRONG_INLINE HVXPacket<T> pmul_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
|
|
336
|
+
return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get())));
|
|
337
|
+
}
|
|
338
|
+
template <>
|
|
339
|
+
EIGEN_STRONG_INLINE Packet32f pmul<Packet32f>(const Packet32f& a, const Packet32f& b) {
|
|
340
|
+
return pmul_hvx(a, b);
|
|
341
|
+
}
|
|
342
|
+
template <>
|
|
343
|
+
EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a, const Packet16f& b) {
|
|
344
|
+
return pmul_hvx(a, b);
|
|
345
|
+
}
|
|
346
|
+
template <>
|
|
347
|
+
EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) {
|
|
348
|
+
return pmul_hvx(a, b);
|
|
349
|
+
}
|
|
350
|
+
|
|
351
|
+
template <HVXPacketSize T>
|
|
352
|
+
EIGEN_STRONG_INLINE HVXPacket<T> padd_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
|
|
353
|
+
return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a.Get(), b.Get())));
|
|
354
|
+
}
|
|
355
|
+
template <>
|
|
356
|
+
EIGEN_STRONG_INLINE Packet32f padd<Packet32f>(const Packet32f& a, const Packet32f& b) {
|
|
357
|
+
return padd_hvx(a, b);
|
|
358
|
+
}
|
|
359
|
+
template <>
|
|
360
|
+
EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b) {
|
|
361
|
+
return padd_hvx(a, b);
|
|
362
|
+
}
|
|
363
|
+
template <>
|
|
364
|
+
EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) {
|
|
365
|
+
return padd_hvx(a, b);
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
template <HVXPacketSize T>
|
|
369
|
+
EIGEN_STRONG_INLINE HVXPacket<T> psub_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
|
|
370
|
+
return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a.Get(), b.Get())));
|
|
371
|
+
}
|
|
372
|
+
template <>
|
|
373
|
+
EIGEN_STRONG_INLINE Packet32f psub<Packet32f>(const Packet32f& a, const Packet32f& b) {
|
|
374
|
+
return psub_hvx(a, b);
|
|
375
|
+
}
|
|
376
|
+
template <>
|
|
377
|
+
EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a, const Packet16f& b) {
|
|
378
|
+
return psub_hvx(a, b);
|
|
379
|
+
}
|
|
380
|
+
template <>
|
|
381
|
+
EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) {
|
|
382
|
+
return psub_hvx(a, b);
|
|
383
|
+
}
|
|
384
|
+
|
|
385
|
+
template <HVXPacketSize T>
|
|
386
|
+
EIGEN_STRONG_INLINE HVXPacket<T> pnegate_hvx(const HVXPacket<T>& a) {
|
|
387
|
+
return HVXPacket<T>::Create(a.Get() ^ Q6_V_vsplat_R(0x80000000));
|
|
388
|
+
}
|
|
389
|
+
template <>
|
|
390
|
+
EIGEN_STRONG_INLINE Packet32f pnegate(const Packet32f& a) {
|
|
391
|
+
return pnegate_hvx(a);
|
|
392
|
+
}
|
|
393
|
+
template <>
|
|
394
|
+
EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
|
|
395
|
+
return pnegate_hvx(a);
|
|
396
|
+
}
|
|
397
|
+
template <>
|
|
398
|
+
EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) {
|
|
399
|
+
return pnegate_hvx(a);
|
|
400
|
+
}
|
|
401
|
+
|
|
402
|
+
template <HVXPacketSize T>
|
|
403
|
+
EIGEN_STRONG_INLINE HVXPacket<T> ptrue_hvx(const HVXPacket<T>& a) {
|
|
404
|
+
return HVXPacket<T>::Create(Q6_V_vsplat_R(0x3f800000));
|
|
405
|
+
}
|
|
406
|
+
template <>
|
|
407
|
+
EIGEN_STRONG_INLINE Packet32f ptrue(const Packet32f& a) {
|
|
408
|
+
return ptrue_hvx(a);
|
|
409
|
+
}
|
|
410
|
+
template <>
|
|
411
|
+
EIGEN_STRONG_INLINE Packet16f ptrue(const Packet16f& a) {
|
|
412
|
+
return ptrue_hvx(a);
|
|
413
|
+
}
|
|
414
|
+
template <>
|
|
415
|
+
EIGEN_STRONG_INLINE Packet8f ptrue(const Packet8f& a) {
|
|
416
|
+
return ptrue_hvx(a);
|
|
417
|
+
}
|
|
418
|
+
|
|
419
|
+
template <HVXPacketSize T>
|
|
420
|
+
EIGEN_STRONG_INLINE HVXPacket<T> pcmp_le_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
|
|
421
|
+
HVX_Vector v_true = ptrue(a).Get();
|
|
422
|
+
HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), b.Get());
|
|
423
|
+
return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true));
|
|
424
|
+
}
|
|
425
|
+
template <>
|
|
426
|
+
EIGEN_STRONG_INLINE Packet32f pcmp_le(const Packet32f& a, const Packet32f& b) {
|
|
427
|
+
return pcmp_le_hvx(a, b);
|
|
428
|
+
}
|
|
429
|
+
template <>
|
|
430
|
+
EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
|
|
431
|
+
return pcmp_le_hvx(a, b);
|
|
432
|
+
}
|
|
433
|
+
template <>
|
|
434
|
+
EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) {
|
|
435
|
+
return pcmp_le_hvx(a, b);
|
|
436
|
+
}
|
|
437
|
+
|
|
438
|
+
template <HVXPacketSize T>
|
|
439
|
+
EIGEN_STRONG_INLINE HVXPacket<T> pcmp_eq_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
|
|
440
|
+
HVX_Vector v_true = ptrue(a).Get();
|
|
441
|
+
HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(a.Get(), b.Get());
|
|
442
|
+
return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
|
|
443
|
+
}
|
|
444
|
+
template <>
|
|
445
|
+
EIGEN_STRONG_INLINE Packet32f pcmp_eq(const Packet32f& a, const Packet32f& b) {
|
|
446
|
+
return pcmp_eq_hvx(a, b);
|
|
447
|
+
}
|
|
448
|
+
template <>
|
|
449
|
+
EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
|
|
450
|
+
return pcmp_eq_hvx(a, b);
|
|
451
|
+
}
|
|
452
|
+
template <>
|
|
453
|
+
EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) {
|
|
454
|
+
return pcmp_eq_hvx(a, b);
|
|
455
|
+
}
|
|
456
|
+
|
|
457
|
+
template <HVXPacketSize T>
|
|
458
|
+
EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
|
|
459
|
+
HVX_Vector v_true = ptrue(a).Get();
|
|
460
|
+
HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
|
|
461
|
+
return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
|
|
462
|
+
}
|
|
463
|
+
template <>
|
|
464
|
+
EIGEN_STRONG_INLINE Packet32f pcmp_lt(const Packet32f& a, const Packet32f& b) {
|
|
465
|
+
return pcmp_lt_hvx(a, b);
|
|
466
|
+
}
|
|
467
|
+
template <>
|
|
468
|
+
EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
|
|
469
|
+
return pcmp_lt_hvx(a, b);
|
|
470
|
+
}
|
|
471
|
+
template <>
|
|
472
|
+
EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) {
|
|
473
|
+
return pcmp_lt_hvx(a, b);
|
|
474
|
+
}
|
|
475
|
+
|
|
476
|
+
template <HVXPacketSize T>
|
|
477
|
+
EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_or_nan_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
|
|
478
|
+
HVX_Vector v_true = ptrue(a).Get();
|
|
479
|
+
HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
|
|
480
|
+
return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
|
|
481
|
+
}
|
|
482
|
+
template <>
|
|
483
|
+
EIGEN_STRONG_INLINE Packet32f pcmp_lt_or_nan(const Packet32f& a, const Packet32f& b) {
|
|
484
|
+
return pcmp_lt_or_nan_hvx(a, b);
|
|
485
|
+
}
|
|
486
|
+
template <>
|
|
487
|
+
EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
|
|
488
|
+
return pcmp_lt_or_nan_hvx(a, b);
|
|
489
|
+
}
|
|
490
|
+
template <>
|
|
491
|
+
EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) {
|
|
492
|
+
return pcmp_lt_or_nan_hvx(a, b);
|
|
493
|
+
}
|
|
494
|
+
|
|
495
|
+
template <HVXPacketSize T>
|
|
496
|
+
EIGEN_STRONG_INLINE HVXPacket<T> pabs_hvx(const HVXPacket<T>& a) {
|
|
497
|
+
return HVXPacket<T>::Create(a.Get() & Q6_V_vsplat_R(0x7FFFFFFF));
|
|
498
|
+
}
|
|
499
|
+
template <>
|
|
500
|
+
EIGEN_STRONG_INLINE Packet32f pabs(const Packet32f& a) {
|
|
501
|
+
return pabs_hvx(a);
|
|
502
|
+
}
|
|
503
|
+
template <>
|
|
504
|
+
EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) {
|
|
505
|
+
return pabs_hvx(a);
|
|
506
|
+
}
|
|
507
|
+
template <>
|
|
508
|
+
EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) {
|
|
509
|
+
return pabs_hvx(a);
|
|
510
|
+
}
|
|
511
|
+
|
|
512
|
+
template <HVXPacketSize T>
|
|
513
|
+
EIGEN_STRONG_INLINE float pfirst_hvx(const HVXPacket<T>& a) {
|
|
514
|
+
union {
|
|
515
|
+
float array[1];
|
|
516
|
+
HVX_Vector vector;
|
|
517
|
+
} HVX_and_array;
|
|
518
|
+
HVX_and_array.vector = a.Get();
|
|
519
|
+
return HVX_and_array.array[0];
|
|
520
|
+
}
|
|
521
|
+
template <>
|
|
522
|
+
EIGEN_STRONG_INLINE float pfirst(const Packet32f& a) {
|
|
523
|
+
return pfirst_hvx(a);
|
|
524
|
+
}
|
|
525
|
+
template <>
|
|
526
|
+
EIGEN_STRONG_INLINE float pfirst(const Packet16f& a) {
|
|
527
|
+
return pfirst_hvx(a);
|
|
528
|
+
}
|
|
529
|
+
template <>
|
|
530
|
+
EIGEN_STRONG_INLINE float pfirst(const Packet8f& a) {
|
|
531
|
+
return pfirst_hvx(a);
|
|
532
|
+
}
|
|
533
|
+
|
|
534
|
+
// In-place transpose of a block of four Packet32f, done with two rounds of
// Q6_W_vshuff_VVR.
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 4>& kernel) {
  const HVX_Vector row0 = kernel.packet[0].Get();
  const HVX_Vector row1 = kernel.packet[1].Get();
  const HVX_Vector row2 = kernel.packet[2].Get();
  const HVX_Vector row3 = kernel.packet[3].Get();

  // Round 1: shuffle the 32-bit lanes of rows (1, 0) and rows (3, 2).
  const HVX_VectorPair rows_1_0 = Q6_W_vshuff_VVR(row1, row0, -4);
  const HVX_VectorPair rows_3_2 = Q6_W_vshuff_VVR(row3, row2, -4);

  // Round 2: shuffle the 64-bit lanes, once for the V0 halves of the round-1
  // pairs and once for their V1 halves.
  const HVX_VectorPair from_v0 =
      Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(rows_3_2), HEXAGON_HVX_GET_V0(rows_1_0), -8);
  const HVX_VectorPair from_v1 =
      Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(rows_3_2), HEXAGON_HVX_GET_V1(rows_1_0), -8);

  kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(from_v0));
  kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(from_v0));
  kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(from_v1));
  kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(from_v1));
}
|
|
547
|
+
// In-place transpose of a block of four Packet16f. Only the V0 halves of the
// first-round shuffles feed the second round; odd output rows are the even
// ones passed through Q6_V_valign_VVR with a byte count of 64.
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
  const HVX_Vector row0 = kernel.packet[0].Get();
  const HVX_Vector row1 = kernel.packet[1].Get();
  const HVX_Vector row2 = kernel.packet[2].Get();
  const HVX_Vector row3 = kernel.packet[3].Get();

  // Round 1: shuffle the 32-bit lanes.
  const HVX_VectorPair rows_1_0 = Q6_W_vshuff_VVR(row1, row0, -4);
  const HVX_VectorPair rows_3_2 = Q6_W_vshuff_VVR(row3, row2, -4);

  // Round 2: shuffle the 64-bit lanes.
  const HVX_VectorPair merged =
      Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(rows_3_2), HEXAGON_HVX_GET_V0(rows_1_0), -8);
  const HVX_Vector merged_v0 = HEXAGON_HVX_GET_V0(merged);
  const HVX_Vector merged_v1 = HEXAGON_HVX_GET_V1(merged);

  kernel.packet[0] = Packet16f::Create(merged_v0);
  kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(merged_v0, merged_v0, 64));
  kernel.packet[2] = Packet16f::Create(merged_v1);
  kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(merged_v1, merged_v1, 64));
}
|
|
560
|
+
// In-place transpose of a block of four Packet8f. All four output rows come
// from the V0 half of the second-round shuffle, passed through
// Q6_V_valign_VVR with byte counts of 0 (as is), 32, 64 and 96.
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 4>& kernel) {
  const HVX_Vector row0 = kernel.packet[0].Get();
  const HVX_Vector row1 = kernel.packet[1].Get();
  const HVX_Vector row2 = kernel.packet[2].Get();
  const HVX_Vector row3 = kernel.packet[3].Get();

  // Round 1: shuffle the 32-bit lanes.
  const HVX_VectorPair rows_1_0 = Q6_W_vshuff_VVR(row1, row0, -4);
  const HVX_VectorPair rows_3_2 = Q6_W_vshuff_VVR(row3, row2, -4);

  // Round 2: shuffle the 64-bit lanes.
  const HVX_VectorPair merged =
      Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(rows_3_2), HEXAGON_HVX_GET_V0(rows_1_0), -8);
  const HVX_Vector merged_v0 = HEXAGON_HVX_GET_V0(merged);

  kernel.packet[0] = Packet8f::Create(merged_v0);
  kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(merged_v0, merged_v0, 32));
  kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(merged_v0, merged_v0, 64));
  kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(merged_v0, merged_v0, 96));
}
|
|
573
|
+
|
|
574
|
+
// In-place transpose of a block of eight Packet8f using three rounds of
// Q6_W_vshuff_VVR (32-, 64- and 128-bit lanes). Output rows 0-3 come from the
// V0 half of the last shuffle and rows 4-7 from its V1 half, each passed
// through Q6_V_valign_VVR with byte counts of 0 (as is), 32, 64 and 96.
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 8>& kernel) {
  // Round 1: shuffle the 32-bit lanes of neighbouring rows.
  const HVX_VectorPair rows_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
  const HVX_VectorPair rows_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
  const HVX_VectorPair rows_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
  const HVX_VectorPair rows_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);

  // Round 2: shuffle the 64-bit lanes.
  const HVX_VectorPair rows_3_0 =
      Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(rows_3_2), HEXAGON_HVX_GET_V0(rows_1_0), -8);
  const HVX_VectorPair rows_7_4 =
      Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(rows_7_6), HEXAGON_HVX_GET_V0(rows_5_4), -8);

  // Round 3: shuffle the 128-bit lanes.
  const HVX_VectorPair merged =
      Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(rows_7_4), HEXAGON_HVX_GET_V0(rows_3_0), -16);
  const HVX_Vector merged_v0 = HEXAGON_HVX_GET_V0(merged);
  const HVX_Vector merged_v1 = HEXAGON_HVX_GET_V1(merged);

  kernel.packet[0] = Packet8f::Create(merged_v0);
  kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(merged_v0, merged_v0, 32));
  kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(merged_v0, merged_v0, 64));
  kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(merged_v0, merged_v0, 96));
  kernel.packet[4] = Packet8f::Create(merged_v1);
  kernel.packet[5] = Packet8f::Create(Q6_V_valign_VVR(merged_v1, merged_v1, 32));
  kernel.packet[6] = Packet8f::Create(Q6_V_valign_VVR(merged_v1, merged_v1, 64));
  kernel.packet[7] = Packet8f::Create(Q6_V_valign_VVR(merged_v1, merged_v1, 96));
}
|
|
597
|
+
// In-place transpose of a block of sixteen Packet16f using four rounds of
// Q6_W_vshuff_VVR (32-, 64-, 128- and 256-bit lanes). Each final pair yields
// four output rows: its V0 half, V0 passed through Q6_V_valign_VVR with a byte
// count of 64, its V1 half, and V1 passed through the same call.
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
  // Round 1: shuffle the 32-bit lanes of neighbouring rows.
  const HVX_VectorPair a_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
  const HVX_VectorPair a_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
  const HVX_VectorPair a_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
  const HVX_VectorPair a_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
  const HVX_VectorPair a_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
  const HVX_VectorPair a_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
  const HVX_VectorPair a_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
  const HVX_VectorPair a_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);

  // Round 2: shuffle the 64-bit lanes (only the V0 halves of round 1 are used).
  const HVX_VectorPair b_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(a_3_2), HEXAGON_HVX_GET_V0(a_1_0), -8);
  const HVX_VectorPair b_1 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(a_7_6), HEXAGON_HVX_GET_V0(a_5_4), -8);
  const HVX_VectorPair b_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(a_11_10), HEXAGON_HVX_GET_V0(a_9_8), -8);
  const HVX_VectorPair b_3 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(a_15_14), HEXAGON_HVX_GET_V0(a_13_12), -8);

  // Round 3: shuffle the 128-bit lanes.
  const HVX_VectorPair c_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(b_1), HEXAGON_HVX_GET_V0(b_0), -16);
  const HVX_VectorPair c_1 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(b_1), HEXAGON_HVX_GET_V1(b_0), -16);
  const HVX_VectorPair c_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(b_3), HEXAGON_HVX_GET_V0(b_2), -16);
  const HVX_VectorPair c_3 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(b_3), HEXAGON_HVX_GET_V1(b_2), -16);

  // Round 4: shuffle the 256-bit lanes.
  const HVX_VectorPair d_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(c_2), HEXAGON_HVX_GET_V0(c_0), -32);
  const HVX_VectorPair d_1 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(c_2), HEXAGON_HVX_GET_V1(c_0), -32);
  const HVX_VectorPair d_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(c_3), HEXAGON_HVX_GET_V0(c_1), -32);
  const HVX_VectorPair d_3 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(c_3), HEXAGON_HVX_GET_V1(c_1), -32);

  // Rows 0-3 from d_0.
  const HVX_Vector d0_v0 = HEXAGON_HVX_GET_V0(d_0);
  const HVX_Vector d0_v1 = HEXAGON_HVX_GET_V1(d_0);
  kernel.packet[0] = Packet16f::Create(d0_v0);
  kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(d0_v0, d0_v0, 64));
  kernel.packet[2] = Packet16f::Create(d0_v1);
  kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(d0_v1, d0_v1, 64));

  // Rows 4-7 from d_1.
  const HVX_Vector d1_v0 = HEXAGON_HVX_GET_V0(d_1);
  const HVX_Vector d1_v1 = HEXAGON_HVX_GET_V1(d_1);
  kernel.packet[4] = Packet16f::Create(d1_v0);
  kernel.packet[5] = Packet16f::Create(Q6_V_valign_VVR(d1_v0, d1_v0, 64));
  kernel.packet[6] = Packet16f::Create(d1_v1);
  kernel.packet[7] = Packet16f::Create(Q6_V_valign_VVR(d1_v1, d1_v1, 64));

  // Rows 8-11 from d_2.
  const HVX_Vector d2_v0 = HEXAGON_HVX_GET_V0(d_2);
  const HVX_Vector d2_v1 = HEXAGON_HVX_GET_V1(d_2);
  kernel.packet[8] = Packet16f::Create(d2_v0);
  kernel.packet[9] = Packet16f::Create(Q6_V_valign_VVR(d2_v0, d2_v0, 64));
  kernel.packet[10] = Packet16f::Create(d2_v1);
  kernel.packet[11] = Packet16f::Create(Q6_V_valign_VVR(d2_v1, d2_v1, 64));

  // Rows 12-15 from d_3.
  const HVX_Vector d3_v0 = HEXAGON_HVX_GET_V0(d_3);
  const HVX_Vector d3_v1 = HEXAGON_HVX_GET_V1(d_3);
  kernel.packet[12] = Packet16f::Create(d3_v0);
  kernel.packet[13] = Packet16f::Create(Q6_V_valign_VVR(d3_v0, d3_v0, 64));
  kernel.packet[14] = Packet16f::Create(d3_v1);
  kernel.packet[15] = Packet16f::Create(Q6_V_valign_VVR(d3_v1, d3_v1, 64));
}
|
|
643
|
+
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 32>& kernel) {
|
|
644
|
+
// Shuffle the 32-bit lanes.
|
|
645
|
+
HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
|
|
646
|
+
HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
|
|
647
|
+
HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
|
|
648
|
+
HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
|
|
649
|
+
HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
|
|
650
|
+
HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
|
|
651
|
+
HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
|
|
652
|
+
HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);
|
|
653
|
+
HVX_VectorPair v_0_17_16 = Q6_W_vshuff_VVR(kernel.packet[17].Get(), kernel.packet[16].Get(), -4);
|
|
654
|
+
HVX_VectorPair v_0_19_18 = Q6_W_vshuff_VVR(kernel.packet[19].Get(), kernel.packet[18].Get(), -4);
|
|
655
|
+
HVX_VectorPair v_0_21_20 = Q6_W_vshuff_VVR(kernel.packet[21].Get(), kernel.packet[20].Get(), -4);
|
|
656
|
+
HVX_VectorPair v_0_23_22 = Q6_W_vshuff_VVR(kernel.packet[23].Get(), kernel.packet[22].Get(), -4);
|
|
657
|
+
HVX_VectorPair v_0_25_24 = Q6_W_vshuff_VVR(kernel.packet[25].Get(), kernel.packet[24].Get(), -4);
|
|
658
|
+
HVX_VectorPair v_0_27_26 = Q6_W_vshuff_VVR(kernel.packet[27].Get(), kernel.packet[26].Get(), -4);
|
|
659
|
+
HVX_VectorPair v_0_29_28 = Q6_W_vshuff_VVR(kernel.packet[29].Get(), kernel.packet[28].Get(), -4);
|
|
660
|
+
HVX_VectorPair v_0_31_30 = Q6_W_vshuff_VVR(kernel.packet[31].Get(), kernel.packet[30].Get(), -4);
|
|
661
|
+
|
|
662
|
+
// Shuffle the 64-bit lanes.
|
|
663
|
+
HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
|
|
664
|
+
HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
|
|
665
|
+
HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
|
|
666
|
+
HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_7_6), HEXAGON_HVX_GET_V1(v_0_5_4), -8);
|
|
667
|
+
HVX_VectorPair v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8);
|
|
668
|
+
HVX_VectorPair v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_9_8), -8);
|
|
669
|
+
HVX_VectorPair v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8);
|
|
670
|
+
HVX_VectorPair v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), HEXAGON_HVX_GET_V1(v_0_13_12), -8);
|
|
671
|
+
HVX_VectorPair v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_19_18), HEXAGON_HVX_GET_V0(v_0_17_16), -8);
|
|
672
|
+
HVX_VectorPair v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_19_18), HEXAGON_HVX_GET_V1(v_0_17_16), -8);
|
|
673
|
+
HVX_VectorPair v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_23_22), HEXAGON_HVX_GET_V0(v_0_21_20), -8);
|
|
674
|
+
HVX_VectorPair v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_23_22), HEXAGON_HVX_GET_V1(v_0_21_20), -8);
|
|
675
|
+
HVX_VectorPair v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), HEXAGON_HVX_GET_V0(v_0_25_24), -8);
|
|
676
|
+
HVX_VectorPair v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), HEXAGON_HVX_GET_V1(v_0_25_24), -8);
|
|
677
|
+
HVX_VectorPair v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), HEXAGON_HVX_GET_V0(v_0_29_28), -8);
|
|
678
|
+
HVX_VectorPair v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), HEXAGON_HVX_GET_V1(v_0_29_28), -8);
|
|
679
|
+
|
|
680
|
+
// Shuffle the 128-bit lanes.
|
|
681
|
+
v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
|
|
682
|
+
v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_1_0), -16);
|
|
683
|
+
v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_3_2), -16);
|
|
684
|
+
v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_3_2), -16);
|
|
685
|
+
v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_13_12), HEXAGON_HVX_GET_V0(v_1_9_8), -16);
|
|
686
|
+
v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_13_12), HEXAGON_HVX_GET_V1(v_1_9_8), -16);
|
|
687
|
+
v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_15_14), HEXAGON_HVX_GET_V0(v_1_11_10), -16);
|
|
688
|
+
v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_15_14), HEXAGON_HVX_GET_V1(v_1_11_10), -16);
|
|
689
|
+
v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), HEXAGON_HVX_GET_V0(v_1_17_16), -16);
|
|
690
|
+
v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), HEXAGON_HVX_GET_V1(v_1_17_16), -16);
|
|
691
|
+
v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), HEXAGON_HVX_GET_V0(v_1_19_18), -16);
|
|
692
|
+
v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), HEXAGON_HVX_GET_V1(v_1_19_18), -16);
|
|
693
|
+
v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), HEXAGON_HVX_GET_V0(v_1_25_24), -16);
|
|
694
|
+
v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), HEXAGON_HVX_GET_V1(v_1_25_24), -16);
|
|
695
|
+
v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), HEXAGON_HVX_GET_V0(v_1_27_26), -16);
|
|
696
|
+
v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), HEXAGON_HVX_GET_V1(v_1_27_26), -16);
|
|
697
|
+
|
|
698
|
+
// Shuffle the 256-bit lanes.
|
|
699
|
+
v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32);
|
|
700
|
+
v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32);
|
|
701
|
+
v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32);
|
|
702
|
+
v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32);
|
|
703
|
+
v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_13_12), HEXAGON_HVX_GET_V0(v_0_5_4), -32);
|
|
704
|
+
v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_13_12), HEXAGON_HVX_GET_V1(v_0_5_4), -32);
|
|
705
|
+
v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_7_6), -32);
|
|
706
|
+
v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), HEXAGON_HVX_GET_V1(v_0_7_6), -32);
|
|
707
|
+
v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_25_24), HEXAGON_HVX_GET_V0(v_0_17_16), -32);
|
|
708
|
+
v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_25_24), HEXAGON_HVX_GET_V1(v_0_17_16), -32);
|
|
709
|
+
v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), HEXAGON_HVX_GET_V0(v_0_19_18), -32);
|
|
710
|
+
v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), HEXAGON_HVX_GET_V1(v_0_19_18), -32);
|
|
711
|
+
v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_29_28), HEXAGON_HVX_GET_V0(v_0_21_20), -32);
|
|
712
|
+
v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_29_28), HEXAGON_HVX_GET_V1(v_0_21_20), -32);
|
|
713
|
+
v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), HEXAGON_HVX_GET_V0(v_0_23_22), -32);
|
|
714
|
+
v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), HEXAGON_HVX_GET_V1(v_0_23_22), -32);
|
|
715
|
+
|
|
716
|
+
// Shuffle the 512-bit lanes.
|
|
717
|
+
v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_17_16), HEXAGON_HVX_GET_V0(v_1_1_0), -64);
|
|
718
|
+
v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_17_16), HEXAGON_HVX_GET_V1(v_1_1_0), -64);
|
|
719
|
+
v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_19_18), HEXAGON_HVX_GET_V0(v_1_3_2), -64);
|
|
720
|
+
v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_19_18), HEXAGON_HVX_GET_V1(v_1_3_2), -64);
|
|
721
|
+
v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), HEXAGON_HVX_GET_V0(v_1_5_4), -64);
|
|
722
|
+
v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), HEXAGON_HVX_GET_V1(v_1_5_4), -64);
|
|
723
|
+
v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), HEXAGON_HVX_GET_V0(v_1_7_6), -64);
|
|
724
|
+
v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), HEXAGON_HVX_GET_V1(v_1_7_6), -64);
|
|
725
|
+
v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_25_24), HEXAGON_HVX_GET_V0(v_1_9_8), -64);
|
|
726
|
+
v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_25_24), HEXAGON_HVX_GET_V1(v_1_9_8), -64);
|
|
727
|
+
v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_27_26), HEXAGON_HVX_GET_V0(v_1_11_10), -64);
|
|
728
|
+
v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_27_26), HEXAGON_HVX_GET_V1(v_1_11_10), -64);
|
|
729
|
+
v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), HEXAGON_HVX_GET_V0(v_1_13_12), -64);
|
|
730
|
+
v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), HEXAGON_HVX_GET_V1(v_1_13_12), -64);
|
|
731
|
+
v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), HEXAGON_HVX_GET_V0(v_1_15_14), -64);
|
|
732
|
+
v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), HEXAGON_HVX_GET_V1(v_1_15_14), -64);
|
|
733
|
+
|
|
734
|
+
kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_1_0));
|
|
735
|
+
kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_1_0));
|
|
736
|
+
kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_3_2));
|
|
737
|
+
kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_3_2));
|
|
738
|
+
kernel.packet[4] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_5_4));
|
|
739
|
+
kernel.packet[5] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_5_4));
|
|
740
|
+
kernel.packet[6] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_7_6));
|
|
741
|
+
kernel.packet[7] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_7_6));
|
|
742
|
+
kernel.packet[8] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_9_8));
|
|
743
|
+
kernel.packet[9] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_9_8));
|
|
744
|
+
kernel.packet[10] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_11_10));
|
|
745
|
+
kernel.packet[11] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_11_10));
|
|
746
|
+
kernel.packet[12] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_13_12));
|
|
747
|
+
kernel.packet[13] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_13_12));
|
|
748
|
+
kernel.packet[14] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_15_14));
|
|
749
|
+
kernel.packet[15] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_15_14));
|
|
750
|
+
kernel.packet[16] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_17_16));
|
|
751
|
+
kernel.packet[17] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_17_16));
|
|
752
|
+
kernel.packet[18] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_19_18));
|
|
753
|
+
kernel.packet[19] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_19_18));
|
|
754
|
+
kernel.packet[20] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_21_20));
|
|
755
|
+
kernel.packet[21] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_21_20));
|
|
756
|
+
kernel.packet[22] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_23_22));
|
|
757
|
+
kernel.packet[23] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_23_22));
|
|
758
|
+
kernel.packet[24] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_25_24));
|
|
759
|
+
kernel.packet[25] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_25_24));
|
|
760
|
+
kernel.packet[26] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_27_26));
|
|
761
|
+
kernel.packet[27] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_27_26));
|
|
762
|
+
kernel.packet[28] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_29_28));
|
|
763
|
+
kernel.packet[29] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_29_28));
|
|
764
|
+
kernel.packet[30] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_31_30));
|
|
765
|
+
kernel.packet[31] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_31_30));
|
|
766
|
+
}
|
|
767
|
+
|
|
768
|
+
template <HVXPacketSize T>
|
|
769
|
+
EIGEN_STRONG_INLINE float predux_hvx(const HVXPacket<T>& a) {
|
|
770
|
+
const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
|
|
771
|
+
HVX_Vector vsum = Q6_Vqf32_vadd_VsfVsf(a.Get(), Q6_V_vror_VR(a.Get(), sizeof(float)));
|
|
772
|
+
for (int i = 2; i < packet_size; i <<= 1) {
|
|
773
|
+
vsum = Q6_Vqf32_vadd_Vqf32Vqf32(vsum, Q6_V_vror_VR(vsum, i * sizeof(float)));
|
|
774
|
+
}
|
|
775
|
+
return pfirst(HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(vsum)));
|
|
776
|
+
}
|
|
777
|
+
template <>
|
|
778
|
+
EIGEN_STRONG_INLINE float predux<Packet32f>(const Packet32f& a) {
|
|
779
|
+
return predux_hvx(a);
|
|
780
|
+
}
|
|
781
|
+
template <>
|
|
782
|
+
EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
|
|
783
|
+
return predux_hvx(a);
|
|
784
|
+
}
|
|
785
|
+
template <>
|
|
786
|
+
EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) {
|
|
787
|
+
return predux_hvx(a);
|
|
788
|
+
}
|
|
789
|
+
|
|
790
|
+
template <HVXPacketSize T>
|
|
791
|
+
EIGEN_STRONG_INLINE HVXPacket<T> ploaddup_hvx(const float* from) {
|
|
792
|
+
constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 2;
|
|
793
|
+
HVX_Vector load = HVX_load_partial<size, 0>(from);
|
|
794
|
+
HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
|
|
795
|
+
return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(dup));
|
|
796
|
+
}
|
|
797
|
+
template <>
|
|
798
|
+
EIGEN_STRONG_INLINE Packet32f ploaddup(const float* from) {
|
|
799
|
+
return ploaddup_hvx<HVXPacketSize::Full>(from);
|
|
800
|
+
}
|
|
801
|
+
template <>
|
|
802
|
+
EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) {
|
|
803
|
+
return ploaddup_hvx<HVXPacketSize::Half>(from);
|
|
804
|
+
}
|
|
805
|
+
template <>
|
|
806
|
+
EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) {
|
|
807
|
+
return ploaddup_hvx<HVXPacketSize::Quarter>(from);
|
|
808
|
+
}
|
|
809
|
+
|
|
810
|
+
template <HVXPacketSize T>
|
|
811
|
+
EIGEN_STRONG_INLINE HVXPacket<T> ploadquad_hvx(const float* from) {
|
|
812
|
+
constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 4;
|
|
813
|
+
HVX_Vector load = HVX_load_partial<size, 0>(from);
|
|
814
|
+
HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
|
|
815
|
+
HVX_VectorPair quad = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(dup), HEXAGON_HVX_GET_V0(dup), -8);
|
|
816
|
+
return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(quad));
|
|
817
|
+
}
|
|
818
|
+
template <>
|
|
819
|
+
EIGEN_STRONG_INLINE Packet32f ploadquad(const float* from) {
|
|
820
|
+
return ploadquad_hvx<HVXPacketSize::Full>(from);
|
|
821
|
+
}
|
|
822
|
+
template <>
|
|
823
|
+
EIGEN_STRONG_INLINE Packet16f ploadquad(const float* from) {
|
|
824
|
+
return ploadquad_hvx<HVXPacketSize::Half>(from);
|
|
825
|
+
}
|
|
826
|
+
template <>
|
|
827
|
+
EIGEN_STRONG_INLINE Packet8f ploadquad(const float* from) {
|
|
828
|
+
return ploadquad_hvx<HVXPacketSize::Quarter>(from);
|
|
829
|
+
}
|
|
830
|
+
|
|
831
|
+
template <>
|
|
832
|
+
EIGEN_STRONG_INLINE Packet32f preverse(const Packet32f& a) {
|
|
833
|
+
HVX_Vector delta = Q6_Vb_vsplat_R(0x7c);
|
|
834
|
+
return Packet32f::Create(Q6_V_vdelta_VV(a.Get(), delta));
|
|
835
|
+
}
|
|
836
|
+
|
|
837
|
+
template <>
|
|
838
|
+
EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a) {
|
|
839
|
+
HVX_Vector delta = Q6_Vb_vsplat_R(0x3c);
|
|
840
|
+
return Packet16f::Create(Q6_V_vdelta_VV(a.Get(), delta));
|
|
841
|
+
}
|
|
842
|
+
|
|
843
|
+
template <>
|
|
844
|
+
EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) {
|
|
845
|
+
HVX_Vector delta = Q6_Vb_vsplat_R(0x1c);
|
|
846
|
+
return Packet8f::Create(Q6_V_vdelta_VV(a.Get(), delta));
|
|
847
|
+
}
|
|
848
|
+
|
|
849
|
+
template <HVXPacketSize T>
|
|
850
|
+
EIGEN_STRONG_INLINE HVXPacket<T> pmin_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
|
|
851
|
+
return HVXPacket<T>::Create(Q6_Vsf_vmin_VsfVsf(a.Get(), b.Get()));
|
|
852
|
+
}
|
|
853
|
+
template <>
|
|
854
|
+
EIGEN_STRONG_INLINE Packet32f pmin(const Packet32f& a, const Packet32f& b) {
|
|
855
|
+
return pmin_hvx(a, b);
|
|
856
|
+
}
|
|
857
|
+
template <>
|
|
858
|
+
EIGEN_STRONG_INLINE Packet16f pmin(const Packet16f& a, const Packet16f& b) {
|
|
859
|
+
return pmin_hvx(a, b);
|
|
860
|
+
}
|
|
861
|
+
template <>
|
|
862
|
+
EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) {
|
|
863
|
+
return pmin_hvx(a, b);
|
|
864
|
+
}
|
|
865
|
+
|
|
866
|
+
template <HVXPacketSize T>
|
|
867
|
+
EIGEN_STRONG_INLINE HVXPacket<T> pmax_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
|
|
868
|
+
return HVXPacket<T>::Create(Q6_Vsf_vmax_VsfVsf(a.Get(), b.Get()));
|
|
869
|
+
}
|
|
870
|
+
template <>
|
|
871
|
+
EIGEN_STRONG_INLINE Packet32f pmax(const Packet32f& a, const Packet32f& b) {
|
|
872
|
+
return pmax_hvx(a, b);
|
|
873
|
+
}
|
|
874
|
+
template <>
|
|
875
|
+
EIGEN_STRONG_INLINE Packet16f pmax(const Packet16f& a, const Packet16f& b) {
|
|
876
|
+
return pmax_hvx(a, b);
|
|
877
|
+
}
|
|
878
|
+
template <>
|
|
879
|
+
EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) {
|
|
880
|
+
return pmax_hvx(a, b);
|
|
881
|
+
}
|
|
882
|
+
|
|
883
|
+
template <HVXPacketSize T>
|
|
884
|
+
EIGEN_STRONG_INLINE HVXPacket<T> pand_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
|
|
885
|
+
return HVXPacket<T>::Create(a.Get() & b.Get());
|
|
886
|
+
}
|
|
887
|
+
template <>
|
|
888
|
+
EIGEN_STRONG_INLINE Packet32f pand(const Packet32f& a, const Packet32f& b) {
|
|
889
|
+
return pand_hvx(a, b);
|
|
890
|
+
}
|
|
891
|
+
template <>
|
|
892
|
+
EIGEN_STRONG_INLINE Packet16f pand(const Packet16f& a, const Packet16f& b) {
|
|
893
|
+
return pand_hvx(a, b);
|
|
894
|
+
}
|
|
895
|
+
template <>
|
|
896
|
+
EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) {
|
|
897
|
+
return pand_hvx(a, b);
|
|
898
|
+
}
|
|
899
|
+
|
|
900
|
+
template <HVXPacketSize T>
|
|
901
|
+
EIGEN_STRONG_INLINE HVXPacket<T> por_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
|
|
902
|
+
return HVXPacket<T>::Create(a.Get() | b.Get());
|
|
903
|
+
}
|
|
904
|
+
template <>
|
|
905
|
+
EIGEN_STRONG_INLINE Packet32f por(const Packet32f& a, const Packet32f& b) {
|
|
906
|
+
return por_hvx(a, b);
|
|
907
|
+
}
|
|
908
|
+
template <>
|
|
909
|
+
EIGEN_STRONG_INLINE Packet16f por(const Packet16f& a, const Packet16f& b) {
|
|
910
|
+
return por_hvx(a, b);
|
|
911
|
+
}
|
|
912
|
+
template <>
|
|
913
|
+
EIGEN_STRONG_INLINE Packet8f por(const Packet8f& a, const Packet8f& b) {
|
|
914
|
+
return por_hvx(a, b);
|
|
915
|
+
}
|
|
916
|
+
|
|
917
|
+
template <HVXPacketSize T>
|
|
918
|
+
EIGEN_STRONG_INLINE HVXPacket<T> pxor_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
|
|
919
|
+
return HVXPacket<T>::Create(a.Get() ^ b.Get());
|
|
920
|
+
}
|
|
921
|
+
template <>
|
|
922
|
+
EIGEN_STRONG_INLINE Packet32f pxor(const Packet32f& a, const Packet32f& b) {
|
|
923
|
+
return pxor_hvx(a, b);
|
|
924
|
+
}
|
|
925
|
+
template <>
|
|
926
|
+
EIGEN_STRONG_INLINE Packet16f pxor(const Packet16f& a, const Packet16f& b) {
|
|
927
|
+
return pxor_hvx(a, b);
|
|
928
|
+
}
|
|
929
|
+
template <>
|
|
930
|
+
EIGEN_STRONG_INLINE Packet8f pxor(const Packet8f& a, const Packet8f& b) {
|
|
931
|
+
return pxor_hvx(a, b);
|
|
932
|
+
}
|
|
933
|
+
|
|
934
|
+
template <HVXPacketSize T>
|
|
935
|
+
EIGEN_STRONG_INLINE HVXPacket<T> pnot_hvx(const HVXPacket<T>& a) {
|
|
936
|
+
return HVXPacket<T>::Create(~a.Get());
|
|
937
|
+
}
|
|
938
|
+
template <>
|
|
939
|
+
EIGEN_STRONG_INLINE Packet32f pnot(const Packet32f& a) {
|
|
940
|
+
return pnot_hvx(a);
|
|
941
|
+
}
|
|
942
|
+
template <>
|
|
943
|
+
EIGEN_STRONG_INLINE Packet16f pnot(const Packet16f& a) {
|
|
944
|
+
return pnot_hvx(a);
|
|
945
|
+
}
|
|
946
|
+
template <>
|
|
947
|
+
EIGEN_STRONG_INLINE Packet8f pnot(const Packet8f& a) {
|
|
948
|
+
return pnot_hvx(a);
|
|
949
|
+
}
|
|
950
|
+
|
|
951
|
+
template <HVXPacketSize T>
|
|
952
|
+
EIGEN_STRONG_INLINE HVXPacket<T> pselect_hvx(const HVXPacket<T>& mask, const HVXPacket<T>& a, const HVXPacket<T>& b) {
|
|
953
|
+
HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(mask.Get(), Q6_V_vzero());
|
|
954
|
+
return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, b.Get(), a.Get()));
|
|
955
|
+
}
|
|
956
|
+
template <>
|
|
957
|
+
EIGEN_STRONG_INLINE Packet32f pselect(const Packet32f& mask, const Packet32f& a, const Packet32f& b) {
|
|
958
|
+
return pselect_hvx(mask, a, b);
|
|
959
|
+
}
|
|
960
|
+
template <>
|
|
961
|
+
EIGEN_STRONG_INLINE Packet16f pselect(const Packet16f& mask, const Packet16f& a, const Packet16f& b) {
|
|
962
|
+
return pselect_hvx(mask, a, b);
|
|
963
|
+
}
|
|
964
|
+
template <>
|
|
965
|
+
EIGEN_STRONG_INLINE Packet8f pselect(const Packet8f& mask, const Packet8f& a, const Packet8f& b) {
|
|
966
|
+
return pselect_hvx(mask, a, b);
|
|
967
|
+
}
|
|
968
|
+
|
|
969
|
+
template <HVXPacketSize T, typename Op>
|
|
970
|
+
EIGEN_STRONG_INLINE float predux_generic(const HVXPacket<T>& a, Op op) {
|
|
971
|
+
const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
|
|
972
|
+
HVXPacket<T> vredux = a;
|
|
973
|
+
for (int i = 1; i < packet_size; i <<= 1) {
|
|
974
|
+
vredux = op(vredux, HVXPacket<T>::Create(Q6_V_vror_VR(vredux.Get(), i * sizeof(float))));
|
|
975
|
+
}
|
|
976
|
+
return pfirst(vredux);
|
|
977
|
+
}
|
|
978
|
+
|
|
979
|
+
template <>
|
|
980
|
+
EIGEN_STRONG_INLINE float predux_max(const Packet32f& a) {
|
|
981
|
+
return predux_generic(a, pmax<Packet32f>);
|
|
982
|
+
}
|
|
983
|
+
template <>
|
|
984
|
+
EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) {
|
|
985
|
+
return predux_generic(a, pmax<Packet16f>);
|
|
986
|
+
}
|
|
987
|
+
template <>
|
|
988
|
+
EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) {
|
|
989
|
+
return predux_generic(a, pmax<Packet8f>);
|
|
990
|
+
}
|
|
991
|
+
|
|
992
|
+
template <>
|
|
993
|
+
EIGEN_STRONG_INLINE float predux_min(const Packet32f& a) {
|
|
994
|
+
return predux_generic(a, pmin<Packet32f>);
|
|
995
|
+
}
|
|
996
|
+
template <>
|
|
997
|
+
EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) {
|
|
998
|
+
return predux_generic(a, pmin<Packet16f>);
|
|
999
|
+
}
|
|
1000
|
+
template <>
|
|
1001
|
+
EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) {
|
|
1002
|
+
return predux_generic(a, pmin<Packet8f>);
|
|
1003
|
+
}
|
|
1004
|
+
|
|
1005
|
+
template <>
|
|
1006
|
+
EIGEN_STRONG_INLINE bool predux_any(const Packet32f& a) {
|
|
1007
|
+
return predux_generic(a, por<Packet32f>) != 0.0f;
|
|
1008
|
+
}
|
|
1009
|
+
template <>
|
|
1010
|
+
EIGEN_STRONG_INLINE bool predux_any(const Packet16f& a) {
|
|
1011
|
+
return predux_generic(a, por<Packet16f>) != 0.0f;
|
|
1012
|
+
}
|
|
1013
|
+
template <>
|
|
1014
|
+
EIGEN_STRONG_INLINE bool predux_any(const Packet8f& a) {
|
|
1015
|
+
return predux_generic(a, por<Packet8f>) != 0.0f;
|
|
1016
|
+
}
|
|
1017
|
+
|
|
1018
|
+
static const float index_vsf[32]
|
|
1019
|
+
__attribute__((aligned(__HVX_LENGTH__))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
1020
|
+
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
|
|
1021
|
+
|
|
1022
|
+
template <HVXPacketSize T>
|
|
1023
|
+
EIGEN_STRONG_INLINE HVXPacket<T> plset_hvx(const float& a) {
|
|
1024
|
+
return padd(pload<HVXPacket<T>>(index_vsf), pset1<HVXPacket<T>>(a));
|
|
1025
|
+
}
|
|
1026
|
+
template <>
|
|
1027
|
+
EIGEN_STRONG_INLINE Packet32f plset(const float& a) {
|
|
1028
|
+
return plset_hvx<HVXPacketSize::Full>(a);
|
|
1029
|
+
}
|
|
1030
|
+
template <>
|
|
1031
|
+
EIGEN_STRONG_INLINE Packet16f plset(const float& a) {
|
|
1032
|
+
return plset_hvx<HVXPacketSize::Half>(a);
|
|
1033
|
+
}
|
|
1034
|
+
template <>
|
|
1035
|
+
EIGEN_STRONG_INLINE Packet8f plset(const float& a) {
|
|
1036
|
+
return plset_hvx<HVXPacketSize::Quarter>(a);
|
|
1037
|
+
}
|
|
1038
|
+
|
|
1039
|
+
template <HVXPacketSize T>
|
|
1040
|
+
EIGEN_STRONG_INLINE void pscatter_hvx(float* to, const HVXPacket<T>& from, Index stride) {
|
|
1041
|
+
const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
|
|
1042
|
+
float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
|
|
1043
|
+
pstore<float>(elements, from);
|
|
1044
|
+
for (Index i = 0; i < packet_size; ++i) {
|
|
1045
|
+
to[i * stride] = elements[i];
|
|
1046
|
+
}
|
|
1047
|
+
}
|
|
1048
|
+
template <>
|
|
1049
|
+
EIGEN_STRONG_INLINE void pscatter<float, Packet32f>(float* to, const Packet32f& from, Index stride) {
|
|
1050
|
+
pscatter_hvx(to, from, stride);
|
|
1051
|
+
}
|
|
1052
|
+
template <>
|
|
1053
|
+
EIGEN_STRONG_INLINE void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride) {
|
|
1054
|
+
pscatter_hvx(to, from, stride);
|
|
1055
|
+
}
|
|
1056
|
+
template <>
|
|
1057
|
+
EIGEN_STRONG_INLINE void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride) {
|
|
1058
|
+
pscatter_hvx(to, from, stride);
|
|
1059
|
+
}
|
|
1060
|
+
|
|
1061
|
+
template <HVXPacketSize T>
|
|
1062
|
+
EIGEN_STRONG_INLINE HVXPacket<T> pgather_hvx(const float* from, Index stride) {
|
|
1063
|
+
const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
|
|
1064
|
+
float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
|
|
1065
|
+
for (Index i = 0; i < packet_size; i++) {
|
|
1066
|
+
elements[i] = from[i * stride];
|
|
1067
|
+
}
|
|
1068
|
+
return pload<HVXPacket<T>>(elements);
|
|
1069
|
+
}
|
|
1070
|
+
template <>
|
|
1071
|
+
EIGEN_STRONG_INLINE Packet32f pgather<float, Packet32f>(const float* from, Index stride) {
|
|
1072
|
+
return pgather_hvx<HVXPacketSize::Full>(from, stride);
|
|
1073
|
+
}
|
|
1074
|
+
template <>
|
|
1075
|
+
EIGEN_STRONG_INLINE Packet16f pgather<float, Packet16f>(const float* from, Index stride) {
|
|
1076
|
+
return pgather_hvx<HVXPacketSize::Half>(from, stride);
|
|
1077
|
+
}
|
|
1078
|
+
template <>
|
|
1079
|
+
EIGEN_STRONG_INLINE Packet8f pgather<float, Packet8f>(const float* from, Index stride) {
|
|
1080
|
+
return pgather_hvx<HVXPacketSize::Quarter>(from, stride);
|
|
1081
|
+
}
|
|
1082
|
+
|
|
1083
|
+
} // end namespace internal
|
|
1084
|
+
} // end namespace Eigen
|
|
1085
|
+
|
|
1086
|
+
#endif // __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68
|
|
1087
|
+
|
|
1088
|
+
#endif // EIGEN_HVX_PACKET_MATH_H
|