@smake/eigen 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/eigen/Eigen/AccelerateSupport +52 -0
- package/eigen/Eigen/Cholesky +18 -20
- package/eigen/Eigen/CholmodSupport +28 -28
- package/eigen/Eigen/Core +187 -120
- package/eigen/Eigen/Eigenvalues +16 -13
- package/eigen/Eigen/Geometry +18 -18
- package/eigen/Eigen/Householder +9 -7
- package/eigen/Eigen/IterativeLinearSolvers +8 -4
- package/eigen/Eigen/Jacobi +14 -13
- package/eigen/Eigen/KLUSupport +23 -21
- package/eigen/Eigen/LU +15 -16
- package/eigen/Eigen/MetisSupport +12 -12
- package/eigen/Eigen/OrderingMethods +54 -51
- package/eigen/Eigen/PaStiXSupport +23 -21
- package/eigen/Eigen/PardisoSupport +17 -14
- package/eigen/Eigen/QR +18 -20
- package/eigen/Eigen/QtAlignedMalloc +5 -12
- package/eigen/Eigen/SPQRSupport +21 -14
- package/eigen/Eigen/SVD +23 -17
- package/eigen/Eigen/Sparse +1 -2
- package/eigen/Eigen/SparseCholesky +18 -15
- package/eigen/Eigen/SparseCore +18 -17
- package/eigen/Eigen/SparseLU +9 -9
- package/eigen/Eigen/SparseQR +16 -14
- package/eigen/Eigen/StdDeque +5 -2
- package/eigen/Eigen/StdList +5 -2
- package/eigen/Eigen/StdVector +5 -2
- package/eigen/Eigen/SuperLUSupport +30 -24
- package/eigen/Eigen/ThreadPool +80 -0
- package/eigen/Eigen/UmfPackSupport +19 -17
- package/eigen/Eigen/Version +14 -0
- package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
- package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/LDLT.h +366 -405
- package/eigen/Eigen/src/Cholesky/LLT.h +323 -367
- package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
- package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +585 -529
- package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/ArithmeticSequence.h +143 -317
- package/eigen/Eigen/src/Core/Array.h +329 -370
- package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
- package/eigen/Eigen/src/Core/ArrayWrapper.h +126 -170
- package/eigen/Eigen/src/Core/Assign.h +30 -40
- package/eigen/Eigen/src/Core/AssignEvaluator.h +651 -604
- package/eigen/Eigen/src/Core/Assign_MKL.h +125 -120
- package/eigen/Eigen/src/Core/BandMatrix.h +267 -282
- package/eigen/Eigen/src/Core/Block.h +371 -390
- package/eigen/Eigen/src/Core/CommaInitializer.h +85 -100
- package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
- package/eigen/Eigen/src/Core/CoreEvaluators.h +1214 -937
- package/eigen/Eigen/src/Core/CoreIterators.h +72 -63
- package/eigen/Eigen/src/Core/CwiseBinaryOp.h +112 -129
- package/eigen/Eigen/src/Core/CwiseNullaryOp.h +676 -702
- package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
- package/eigen/Eigen/src/Core/CwiseUnaryOp.h +55 -67
- package/eigen/Eigen/src/Core/CwiseUnaryView.h +127 -92
- package/eigen/Eigen/src/Core/DenseBase.h +630 -658
- package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -628
- package/eigen/Eigen/src/Core/DenseStorage.h +511 -590
- package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
- package/eigen/Eigen/src/Core/Diagonal.h +168 -207
- package/eigen/Eigen/src/Core/DiagonalMatrix.h +346 -317
- package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
- package/eigen/Eigen/src/Core/Dot.h +167 -217
- package/eigen/Eigen/src/Core/EigenBase.h +74 -85
- package/eigen/Eigen/src/Core/Fill.h +138 -0
- package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
- package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -113
- package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
- package/eigen/Eigen/src/Core/GeneralProduct.h +315 -261
- package/eigen/Eigen/src/Core/GenericPacketMath.h +1182 -520
- package/eigen/Eigen/src/Core/GlobalFunctions.h +193 -157
- package/eigen/Eigen/src/Core/IO.h +131 -156
- package/eigen/Eigen/src/Core/IndexedView.h +209 -125
- package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
- package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/Inverse.h +50 -59
- package/eigen/Eigen/src/Core/Map.h +123 -141
- package/eigen/Eigen/src/Core/MapBase.h +255 -282
- package/eigen/Eigen/src/Core/MathFunctions.h +1247 -1201
- package/eigen/Eigen/src/Core/MathFunctionsImpl.h +162 -99
- package/eigen/Eigen/src/Core/Matrix.h +463 -494
- package/eigen/Eigen/src/Core/MatrixBase.h +468 -470
- package/eigen/Eigen/src/Core/NestByValue.h +58 -52
- package/eigen/Eigen/src/Core/NoAlias.h +79 -86
- package/eigen/Eigen/src/Core/NumTraits.h +206 -206
- package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +163 -142
- package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
- package/eigen/Eigen/src/Core/PlainObjectBase.h +858 -972
- package/eigen/Eigen/src/Core/Product.h +246 -130
- package/eigen/Eigen/src/Core/ProductEvaluators.h +779 -671
- package/eigen/Eigen/src/Core/Random.h +153 -164
- package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
- package/eigen/Eigen/src/Core/RealView.h +250 -0
- package/eigen/Eigen/src/Core/Redux.h +334 -314
- package/eigen/Eigen/src/Core/Ref.h +259 -257
- package/eigen/Eigen/src/Core/Replicate.h +92 -104
- package/eigen/Eigen/src/Core/Reshaped.h +215 -271
- package/eigen/Eigen/src/Core/ReturnByValue.h +47 -55
- package/eigen/Eigen/src/Core/Reverse.h +133 -148
- package/eigen/Eigen/src/Core/Select.h +68 -140
- package/eigen/Eigen/src/Core/SelfAdjointView.h +254 -290
- package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
- package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
- package/eigen/Eigen/src/Core/Solve.h +88 -102
- package/eigen/Eigen/src/Core/SolveTriangular.h +126 -124
- package/eigen/Eigen/src/Core/SolverBase.h +132 -133
- package/eigen/Eigen/src/Core/StableNorm.h +113 -147
- package/eigen/Eigen/src/Core/StlIterators.h +404 -248
- package/eigen/Eigen/src/Core/Stride.h +90 -92
- package/eigen/Eigen/src/Core/Swap.h +70 -39
- package/eigen/Eigen/src/Core/Transpose.h +258 -295
- package/eigen/Eigen/src/Core/Transpositions.h +270 -333
- package/eigen/Eigen/src/Core/TriangularMatrix.h +642 -743
- package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
- package/eigen/Eigen/src/Core/VectorwiseOp.h +653 -704
- package/eigen/Eigen/src/Core/Visitor.h +464 -308
- package/eigen/Eigen/src/Core/arch/AVX/Complex.h +380 -187
- package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +65 -163
- package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2145 -638
- package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
- package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +253 -60
- package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +278 -228
- package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +48 -269
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1597 -754
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
- package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +229 -41
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +420 -184
- package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +40 -49
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2962 -2213
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +196 -212
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +713 -441
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2380 -1362
- package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
- package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +390 -224
- package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +78 -67
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1784 -799
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +167 -50
- package/eigen/Eigen/src/Core/arch/Default/Half.h +528 -379
- package/eigen/Eigen/src/Core/arch/Default/Settings.h +10 -12
- package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
- package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +41 -40
- package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +550 -523
- package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
- package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +27 -30
- package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +8 -8
- package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
- package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
- package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
- package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
- package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
- package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
- package/eigen/Eigen/src/Core/arch/MSA/Complex.h +54 -82
- package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +84 -92
- package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +51 -47
- package/eigen/Eigen/src/Core/arch/NEON/Complex.h +454 -306
- package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +175 -115
- package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +23 -30
- package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4366 -2857
- package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +616 -393
- package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
- package/eigen/Eigen/src/Core/arch/SSE/Complex.h +350 -198
- package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +38 -149
- package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +1791 -912
- package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
- package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +128 -40
- package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +10 -6
- package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +156 -234
- package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +6 -3
- package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +27 -32
- package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +119 -117
- package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +325 -419
- package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +15 -17
- package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +325 -181
- package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +94 -83
- package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +811 -458
- package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +121 -124
- package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +576 -370
- package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +194 -109
- package/eigen/Eigen/src/Core/functors/StlFunctors.h +95 -112
- package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
- package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1038 -749
- package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1883 -1375
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +312 -370
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +189 -176
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +84 -81
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +292 -337
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
- package/eigen/Eigen/src/Core/products/Parallelizer.h +207 -105
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +327 -388
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +138 -147
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
- package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
- package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -47
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -277
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
- package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +68 -94
- package/eigen/Eigen/src/Core/util/Assert.h +158 -0
- package/eigen/Eigen/src/Core/util/BlasUtil.h +342 -303
- package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +348 -317
- package/eigen/Eigen/src/Core/util/Constants.h +297 -262
- package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -90
- package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
- package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +449 -247
- package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
- package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
- package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +417 -116
- package/eigen/Eigen/src/Core/util/IntegralConstant.h +211 -204
- package/eigen/Eigen/src/Core/util/MKL_support.h +39 -37
- package/eigen/Eigen/src/Core/util/Macros.h +655 -773
- package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
- package/eigen/Eigen/src/Core/util/Memory.h +970 -748
- package/eigen/Eigen/src/Core/util/Meta.h +581 -633
- package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
- package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
- package/eigen/Eigen/src/Core/util/ReshapedHelper.h +17 -17
- package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
- package/eigen/Eigen/src/Core/util/StaticAssert.h +50 -166
- package/eigen/Eigen/src/Core/util/SymbolicIndex.h +377 -225
- package/eigen/Eigen/src/Core/util/XprHelper.h +784 -547
- package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
- package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
- package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
- package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
- package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
- package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +89 -105
- package/eigen/Eigen/src/Eigenvalues/RealQZ.h +537 -607
- package/eigen/Eigen/src/Eigenvalues/RealSchur.h +342 -381
- package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +541 -595
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
- package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +430 -462
- package/eigen/Eigen/src/Geometry/AlignedBox.h +226 -227
- package/eigen/Eigen/src/Geometry/AngleAxis.h +131 -133
- package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
- package/eigen/Eigen/src/Geometry/Homogeneous.h +285 -333
- package/eigen/Eigen/src/Geometry/Hyperplane.h +151 -160
- package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -146
- package/eigen/Eigen/src/Geometry/ParametrizedLine.h +127 -127
- package/eigen/Eigen/src/Geometry/Quaternion.h +566 -506
- package/eigen/Eigen/src/Geometry/Rotation2D.h +107 -105
- package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
- package/eigen/Eigen/src/Geometry/Scaling.h +113 -106
- package/eigen/Eigen/src/Geometry/Transform.h +858 -936
- package/eigen/Eigen/src/Geometry/Translation.h +94 -92
- package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
- package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +90 -104
- package/eigen/Eigen/src/Householder/BlockHouseholder.h +51 -46
- package/eigen/Eigen/src/Householder/Householder.h +102 -124
- package/eigen/Eigen/src/Householder/HouseholderSequence.h +412 -453
- package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +149 -162
- package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +124 -119
- package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +92 -104
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +251 -243
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +224 -228
- package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +178 -227
- package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +79 -84
- package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +54 -60
- package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Jacobi/Jacobi.h +252 -308
- package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/KLUSupport/KLUSupport.h +208 -227
- package/eigen/Eigen/src/LU/Determinant.h +50 -69
- package/eigen/Eigen/src/LU/FullPivLU.h +545 -596
- package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/LU/InverseImpl.h +206 -285
- package/eigen/Eigen/src/LU/PartialPivLU.h +390 -428
- package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
- package/eigen/Eigen/src/LU/arch/InverseSize4.h +72 -70
- package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
- package/eigen/Eigen/src/OrderingMethods/Amd.h +243 -265
- package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +831 -1004
- package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/OrderingMethods/Ordering.h +112 -119
- package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
- package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -430
- package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +479 -479
- package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
- package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +166 -153
- package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +495 -475
- package/eigen/Eigen/src/QR/HouseholderQR.h +394 -285
- package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
- package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +244 -264
- package/eigen/Eigen/src/SVD/BDCSVD.h +817 -713
- package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
- package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SVD/JacobiSVD.h +577 -543
- package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
- package/eigen/Eigen/src/SVD/SVDBase.h +242 -182
- package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +200 -235
- package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +765 -594
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +308 -94
- package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
- package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -252
- package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +134 -178
- package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCore/SparseAssign.h +149 -140
- package/eigen/Eigen/src/SparseCore/SparseBlock.h +403 -440
- package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
- package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +525 -303
- package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +555 -339
- package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
- package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +169 -197
- package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
- package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
- package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
- package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
- package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1603 -1245
- package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -350
- package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
- package/eigen/Eigen/src/SparseCore/SparseProduct.h +94 -97
- package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
- package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
- package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +370 -416
- package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
- package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
- package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
- package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
- package/eigen/Eigen/src/SparseCore/SparseUtil.h +138 -115
- package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
- package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
- package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
- package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseLU/SparseLU.h +756 -710
- package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
- package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
- package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
- package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +245 -301
- package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
- package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
- package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +89 -100
- package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
- package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
- package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +124 -132
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
- package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
- package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
- package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
- package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseQR/SparseQR.h +450 -502
- package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -93
- package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
- package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
- package/eigen/Eigen/src/StlSupport/details.h +48 -50
- package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -730
- package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
- package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
- package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
- package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
- package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
- package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
- package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
- package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
- package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
- package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
- package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
- package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
- package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +428 -464
- package/eigen/Eigen/src/misc/Image.h +41 -43
- package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/misc/Kernel.h +39 -41
- package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
- package/eigen/Eigen/src/misc/blas.h +83 -426
- package/eigen/Eigen/src/misc/lapacke.h +9972 -16179
- package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
- package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
- package/eigen/Eigen/src/plugins/{BlockMethods.h → BlockMethods.inc} +434 -506
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
- package/eigen/Eigen/src/plugins/{CommonCwiseUnaryOps.h → CommonCwiseUnaryOps.inc} +58 -68
- package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
- package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
- package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
- package/package.json +1 -1
- package/eigen/COPYING.APACHE +0 -203
- package/eigen/COPYING.BSD +0 -26
- package/eigen/COPYING.GPL +0 -674
- package/eigen/COPYING.LGPL +0 -502
- package/eigen/COPYING.MINPACK +0 -51
- package/eigen/COPYING.MPL2 +0 -373
- package/eigen/COPYING.README +0 -18
- package/eigen/Eigen/src/Core/BooleanRedux.h +0 -162
- package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -258
- package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +0 -120
- package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +0 -694
- package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
- package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
- package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
- package/eigen/Eigen/src/misc/lapack.h +0 -152
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -358
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -696
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
- package/eigen/Eigen/src/plugins/IndexedViewMethods.h +0 -262
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -95
- package/eigen/Eigen/src/plugins/ReshapedMethods.h +0 -149
- package/eigen/README.md +0 -5
|
@@ -10,6 +10,9 @@
|
|
|
10
10
|
#ifndef EIGEN_PACKET_MATH_ZVECTOR_H
|
|
11
11
|
#define EIGEN_PACKET_MATH_ZVECTOR_H
|
|
12
12
|
|
|
13
|
+
// IWYU pragma: private
|
|
14
|
+
#include "../../InternalHeaderCheck.h"
|
|
15
|
+
|
|
13
16
|
namespace Eigen {
|
|
14
17
|
|
|
15
18
|
namespace internal {
|
|
@@ -23,147 +26,147 @@ namespace internal {
|
|
|
23
26
|
#endif
|
|
24
27
|
|
|
25
28
|
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
|
|
26
|
-
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
|
|
29
|
+
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
|
|
27
30
|
#endif
|
|
28
31
|
|
|
29
|
-
typedef __vector int
|
|
30
|
-
typedef __vector unsigned int
|
|
31
|
-
typedef __vector __bool int
|
|
32
|
-
typedef __vector short int
|
|
33
|
-
typedef __vector unsigned char
|
|
34
|
-
typedef __vector double
|
|
35
|
-
typedef __vector unsigned long long
|
|
36
|
-
typedef __vector long long
|
|
32
|
+
typedef __vector int Packet4i;
|
|
33
|
+
typedef __vector unsigned int Packet4ui;
|
|
34
|
+
typedef __vector __bool int Packet4bi;
|
|
35
|
+
typedef __vector short int Packet8i;
|
|
36
|
+
typedef __vector unsigned char Packet16uc;
|
|
37
|
+
typedef __vector double Packet2d;
|
|
38
|
+
typedef __vector unsigned long long Packet2ul;
|
|
39
|
+
typedef __vector long long Packet2l;
|
|
37
40
|
|
|
38
41
|
// Z14 has builtin support for float vectors
|
|
39
42
|
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
|
|
40
|
-
typedef __vector float
|
|
43
|
+
typedef __vector float Packet4f;
|
|
41
44
|
#else
|
|
42
45
|
typedef struct {
|
|
43
|
-
|
|
46
|
+
Packet2d v4f[2];
|
|
44
47
|
} Packet4f;
|
|
45
48
|
#endif
|
|
46
49
|
|
|
47
50
|
typedef union {
|
|
48
|
-
numext::int32_t
|
|
51
|
+
numext::int32_t i[4];
|
|
49
52
|
numext::uint32_t ui[4];
|
|
50
|
-
numext::int64_t
|
|
53
|
+
numext::int64_t l[2];
|
|
51
54
|
numext::uint64_t ul[2];
|
|
52
|
-
double
|
|
53
|
-
float
|
|
54
|
-
Packet4i
|
|
55
|
+
double d[2];
|
|
56
|
+
float f[4];
|
|
57
|
+
Packet4i v4i;
|
|
55
58
|
Packet4ui v4ui;
|
|
56
|
-
Packet2l
|
|
59
|
+
Packet2l v2l;
|
|
57
60
|
Packet2ul v2ul;
|
|
58
|
-
Packet2d
|
|
61
|
+
Packet2d v2d;
|
|
59
62
|
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
|
|
60
|
-
Packet4f
|
|
63
|
+
Packet4f v4f;
|
|
61
64
|
#endif
|
|
62
65
|
} Packet;
|
|
63
66
|
|
|
64
67
|
// We don't want to write the same code all the time, but we need to reuse the constants
|
|
65
68
|
// and it doesn't really work to declare them global, so we define macros instead
|
|
66
69
|
|
|
67
|
-
#define
|
|
68
|
-
Packet4i p4i_##NAME = reinterpret_cast<Packet4i>(vec_splat_s32(X))
|
|
70
|
+
#define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME, X) Packet4i p4i_##NAME = reinterpret_cast<Packet4i>(vec_splat_s32(X))
|
|
69
71
|
|
|
70
|
-
#define
|
|
71
|
-
Packet2d p2d_##NAME = reinterpret_cast<Packet2d>(vec_splat_s64(X))
|
|
72
|
+
#define EIGEN_DECLARE_CONST_FAST_Packet2d(NAME, X) Packet2d p2d_##NAME = reinterpret_cast<Packet2d>(vec_splat_s64(X))
|
|
72
73
|
|
|
73
|
-
#define
|
|
74
|
-
Packet2l p2l_##NAME = reinterpret_cast<Packet2l>(vec_splat_s64(X))
|
|
74
|
+
#define EIGEN_DECLARE_CONST_FAST_Packet2l(NAME, X) Packet2l p2l_##NAME = reinterpret_cast<Packet2l>(vec_splat_s64(X))
|
|
75
75
|
|
|
76
|
-
#define
|
|
77
|
-
Packet4i p4i_##NAME = pset1<Packet4i>(X)
|
|
76
|
+
#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) Packet4i p4i_##NAME = pset1<Packet4i>(X)
|
|
78
77
|
|
|
79
|
-
#define
|
|
80
|
-
Packet2d p2d_##NAME = pset1<Packet2d>(X)
|
|
78
|
+
#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) Packet2d p2d_##NAME = pset1<Packet2d>(X)
|
|
81
79
|
|
|
82
|
-
#define
|
|
83
|
-
Packet2l p2l_##NAME = pset1<Packet2l>(X)
|
|
80
|
+
#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) Packet2l p2l_##NAME = pset1<Packet2l>(X)
|
|
84
81
|
|
|
85
82
|
// These constants are endian-agnostic
|
|
86
|
-
static
|
|
87
|
-
static
|
|
83
|
+
static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
|
|
84
|
+
static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
|
|
88
85
|
|
|
89
|
-
static
|
|
90
|
-
static
|
|
91
|
-
static
|
|
86
|
+
static EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0);
|
|
87
|
+
static EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0);
|
|
88
|
+
static EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1);
|
|
92
89
|
|
|
93
|
-
static Packet2d p2d_ONE = {
|
|
94
|
-
static Packet2d p2d_ZERO_ = {
|
|
95
|
-
|
|
90
|
+
static Packet2d p2d_ONE = {1.0, 1.0};
|
|
91
|
+
static Packet2d p2d_ZERO_ = {numext::bit_cast<double>(0x8000000000000000ull),
|
|
92
|
+
numext::bit_cast<double>(0x8000000000000000ull)};
|
|
96
93
|
|
|
97
94
|
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
|
|
98
|
-
#define
|
|
99
|
-
Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
|
|
95
|
+
#define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME, X) Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
|
|
100
96
|
|
|
101
|
-
#define
|
|
102
|
-
Packet4f p4f_##NAME = pset1<Packet4f>(X)
|
|
97
|
+
#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) Packet4f p4f_##NAME = pset1<Packet4f>(X)
|
|
103
98
|
|
|
104
|
-
#define
|
|
99
|
+
#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
|
|
105
100
|
const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
|
|
106
101
|
|
|
107
|
-
static
|
|
108
|
-
static
|
|
109
|
-
static Packet4f p4f_MZERO = {
|
|
102
|
+
static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
|
|
103
|
+
static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1, -1); //{ -1, -1, -1, -1}
|
|
104
|
+
static Packet4f p4f_MZERO = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
|
|
110
105
|
#endif
|
|
111
106
|
|
|
112
|
-
static Packet4i p4i_COUNTDOWN = {
|
|
113
|
-
static Packet4f p4f_COUNTDOWN = {
|
|
114
|
-
static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(
|
|
107
|
+
static Packet4i p4i_COUNTDOWN = {0, 1, 2, 3};
|
|
108
|
+
static Packet4f p4f_COUNTDOWN = {0.0, 1.0, 2.0, 3.0};
|
|
109
|
+
static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(
|
|
110
|
+
vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8));
|
|
115
111
|
|
|
116
|
-
static Packet16uc p16uc_PSET64_HI = {
|
|
117
|
-
static Packet16uc p16uc_DUPLICATE32_HI = {
|
|
112
|
+
static Packet16uc p16uc_PSET64_HI = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
|
|
113
|
+
static Packet16uc p16uc_DUPLICATE32_HI = {0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7};
|
|
118
114
|
|
|
119
115
|
// Mask alignment
|
|
120
|
-
#define
|
|
116
|
+
#define EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
|
|
121
117
|
|
|
122
|
-
#define
|
|
118
|
+
#define EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & EIGEN_MASK_ALIGNMENT)
|
|
123
119
|
|
|
124
120
|
// Handle endianness properly while loading constants
|
|
125
121
|
// Define global static constants:
|
|
126
122
|
|
|
127
|
-
static Packet16uc p16uc_FORWARD =
|
|
128
|
-
static Packet16uc p16uc_REVERSE32 = {
|
|
129
|
-
static Packet16uc p16uc_REVERSE64 = {
|
|
130
|
-
|
|
131
|
-
static Packet16uc p16uc_PSET32_WODD
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
static Packet16uc
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
static Packet16uc
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
static Packet16uc
|
|
145
|
-
|
|
123
|
+
static Packet16uc p16uc_FORWARD = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
|
124
|
+
static Packet16uc p16uc_REVERSE32 = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
|
|
125
|
+
static Packet16uc p16uc_REVERSE64 = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
|
|
126
|
+
|
|
127
|
+
static Packet16uc p16uc_PSET32_WODD =
|
|
128
|
+
vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
|
|
129
|
+
8); //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
|
|
130
|
+
static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
|
|
131
|
+
8); //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
|
|
132
|
+
/*static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3),
|
|
133
|
+
8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
|
|
134
|
+
|
|
135
|
+
static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD,
|
|
136
|
+
(Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };*/
|
|
137
|
+
static Packet16uc p16uc_PSET64_LO = (Packet16uc)vec_mergel(
|
|
138
|
+
(Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
|
|
139
|
+
/*static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16); //{ 0,1,2,3, 4,5,6,7,
|
|
140
|
+
16,17,18,19, 20,21,22,23}; static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16); //{
|
|
141
|
+
8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};*/
|
|
142
|
+
static Packet16uc p16uc_TRANSPOSE64_HI = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
|
|
143
|
+
static Packet16uc p16uc_TRANSPOSE64_LO = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
|
|
144
|
+
|
|
145
|
+
static Packet16uc p16uc_COMPLEX32_REV =
|
|
146
|
+
vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
|
|
147
|
+
|
|
148
|
+
static Packet16uc p16uc_COMPLEX32_REV2 =
|
|
149
|
+
vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
|
|
146
150
|
|
|
147
151
|
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
|
|
148
|
-
|
|
152
|
+
#define EIGEN_ZVECTOR_PREFETCH(ADDR) __builtin_prefetch(ADDR);
|
|
149
153
|
#else
|
|
150
|
-
|
|
154
|
+
#define EIGEN_ZVECTOR_PREFETCH(ADDR) asm(" pfd [%[addr]]\n" ::[addr] "r"(ADDR) : "cc");
|
|
151
155
|
#endif
|
|
152
156
|
|
|
153
|
-
template<>
|
|
154
|
-
{
|
|
157
|
+
template <>
|
|
158
|
+
struct packet_traits<int> : default_packet_traits {
|
|
155
159
|
typedef Packet4i type;
|
|
156
160
|
typedef Packet4i half;
|
|
157
161
|
enum {
|
|
158
162
|
Vectorizable = 1,
|
|
159
163
|
AlignedOnScalar = 1,
|
|
160
164
|
size = 4,
|
|
161
|
-
HasHalfPacket = 0,
|
|
162
165
|
|
|
163
|
-
HasAdd
|
|
164
|
-
HasSub
|
|
165
|
-
HasMul
|
|
166
|
-
HasDiv
|
|
166
|
+
HasAdd = 1,
|
|
167
|
+
HasSub = 1,
|
|
168
|
+
HasMul = 1,
|
|
169
|
+
HasDiv = 1,
|
|
167
170
|
HasBlend = 1
|
|
168
171
|
};
|
|
169
172
|
};
|
|
@@ -176,8 +179,8 @@ struct packet_traits<float> : default_packet_traits {
|
|
|
176
179
|
Vectorizable = 1,
|
|
177
180
|
AlignedOnScalar = 1,
|
|
178
181
|
size = 4,
|
|
179
|
-
HasHalfPacket = 0,
|
|
180
182
|
|
|
183
|
+
HasCmp = 1,
|
|
181
184
|
HasAdd = 1,
|
|
182
185
|
HasSub = 1,
|
|
183
186
|
HasMul = 1,
|
|
@@ -193,86 +196,109 @@ struct packet_traits<float> : default_packet_traits {
|
|
|
193
196
|
HasRsqrt = 1,
|
|
194
197
|
HasTanh = 1,
|
|
195
198
|
HasErf = 1,
|
|
196
|
-
HasRound = 1,
|
|
197
|
-
HasFloor = 1,
|
|
198
|
-
HasCeil = 1,
|
|
199
199
|
HasNegate = 1,
|
|
200
200
|
HasBlend = 1
|
|
201
201
|
};
|
|
202
202
|
};
|
|
203
203
|
|
|
204
|
-
template<>
|
|
205
|
-
{
|
|
204
|
+
template <>
|
|
205
|
+
struct packet_traits<double> : default_packet_traits {
|
|
206
206
|
typedef Packet2d type;
|
|
207
207
|
typedef Packet2d half;
|
|
208
208
|
enum {
|
|
209
209
|
Vectorizable = 1,
|
|
210
210
|
AlignedOnScalar = 1,
|
|
211
|
-
size=2,
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
HasExp = 1,
|
|
211
|
+
size = 2,
|
|
212
|
+
|
|
213
|
+
HasAdd = 1,
|
|
214
|
+
HasSub = 1,
|
|
215
|
+
HasMul = 1,
|
|
216
|
+
HasDiv = 1,
|
|
217
|
+
HasMin = 1,
|
|
218
|
+
HasMax = 1,
|
|
219
|
+
HasAbs = 1,
|
|
220
|
+
HasSin = 0,
|
|
221
|
+
HasCos = 0,
|
|
222
|
+
HasLog = 0,
|
|
223
|
+
HasExp = 1,
|
|
225
224
|
HasSqrt = 1,
|
|
226
225
|
HasRsqrt = 1,
|
|
227
|
-
HasRound = 1,
|
|
228
|
-
HasFloor = 1,
|
|
229
|
-
HasCeil = 1,
|
|
230
226
|
HasNegate = 1,
|
|
231
227
|
HasBlend = 1
|
|
232
228
|
};
|
|
233
229
|
};
|
|
234
230
|
|
|
235
|
-
template<>
|
|
236
|
-
|
|
237
|
-
|
|
231
|
+
template <>
|
|
232
|
+
struct unpacket_traits<Packet4i> {
|
|
233
|
+
typedef int type;
|
|
234
|
+
enum {
|
|
235
|
+
size = 4,
|
|
236
|
+
alignment = Aligned16,
|
|
237
|
+
vectorizable = true,
|
|
238
|
+
masked_load_available = false,
|
|
239
|
+
masked_store_available = false
|
|
240
|
+
};
|
|
241
|
+
typedef Packet4i half;
|
|
242
|
+
};
|
|
243
|
+
template <>
|
|
244
|
+
struct unpacket_traits<Packet4f> {
|
|
245
|
+
typedef float type;
|
|
246
|
+
enum {
|
|
247
|
+
size = 4,
|
|
248
|
+
alignment = Aligned16,
|
|
249
|
+
vectorizable = true,
|
|
250
|
+
masked_load_available = false,
|
|
251
|
+
masked_store_available = false
|
|
252
|
+
};
|
|
253
|
+
typedef Packet4f half;
|
|
254
|
+
typedef Packet4i integer_packet;
|
|
255
|
+
};
|
|
256
|
+
template <>
|
|
257
|
+
struct unpacket_traits<Packet2d> {
|
|
258
|
+
typedef double type;
|
|
259
|
+
enum {
|
|
260
|
+
size = 2,
|
|
261
|
+
alignment = Aligned16,
|
|
262
|
+
vectorizable = true,
|
|
263
|
+
masked_load_available = false,
|
|
264
|
+
masked_store_available = false
|
|
265
|
+
};
|
|
266
|
+
typedef Packet2d half;
|
|
267
|
+
typedef Packet2l integer_packet;
|
|
268
|
+
};
|
|
238
269
|
|
|
239
270
|
/* Forward declaration */
|
|
240
|
-
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel);
|
|
241
|
-
|
|
242
|
-
inline std::ostream
|
|
243
|
-
{
|
|
271
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel);
|
|
272
|
+
|
|
273
|
+
inline std::ostream& operator<<(std::ostream& s, const Packet4i& v) {
|
|
244
274
|
Packet vt;
|
|
245
275
|
vt.v4i = v;
|
|
246
276
|
s << vt.i[0] << ", " << vt.i[1] << ", " << vt.i[2] << ", " << vt.i[3];
|
|
247
277
|
return s;
|
|
248
278
|
}
|
|
249
279
|
|
|
250
|
-
inline std::ostream
|
|
251
|
-
{
|
|
280
|
+
inline std::ostream& operator<<(std::ostream& s, const Packet4ui& v) {
|
|
252
281
|
Packet vt;
|
|
253
282
|
vt.v4ui = v;
|
|
254
283
|
s << vt.ui[0] << ", " << vt.ui[1] << ", " << vt.ui[2] << ", " << vt.ui[3];
|
|
255
284
|
return s;
|
|
256
285
|
}
|
|
257
286
|
|
|
258
|
-
inline std::ostream
|
|
259
|
-
{
|
|
287
|
+
inline std::ostream& operator<<(std::ostream& s, const Packet2l& v) {
|
|
260
288
|
Packet vt;
|
|
261
289
|
vt.v2l = v;
|
|
262
290
|
s << vt.l[0] << ", " << vt.l[1];
|
|
263
291
|
return s;
|
|
264
292
|
}
|
|
265
293
|
|
|
266
|
-
inline std::ostream
|
|
267
|
-
{
|
|
294
|
+
inline std::ostream& operator<<(std::ostream& s, const Packet2ul& v) {
|
|
268
295
|
Packet vt;
|
|
269
296
|
vt.v2ul = v;
|
|
270
|
-
s << vt.ul[0] << ", " << vt.ul[1]
|
|
297
|
+
s << vt.ul[0] << ", " << vt.ul[1];
|
|
271
298
|
return s;
|
|
272
299
|
}
|
|
273
300
|
|
|
274
|
-
inline std::ostream
|
|
275
|
-
{
|
|
301
|
+
inline std::ostream& operator<<(std::ostream& s, const Packet2d& v) {
|
|
276
302
|
Packet vt;
|
|
277
303
|
vt.v2d = v;
|
|
278
304
|
s << vt.d[0] << ", " << vt.d[1];
|
|
@@ -280,8 +306,7 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
|
|
|
280
306
|
}
|
|
281
307
|
|
|
282
308
|
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
|
|
283
|
-
inline std::ostream
|
|
284
|
-
{
|
|
309
|
+
inline std::ostream& operator<<(std::ostream& s, const Packet4f& v) {
|
|
285
310
|
Packet vt;
|
|
286
311
|
vt.v4f = v;
|
|
287
312
|
s << vt.f[0] << ", " << vt.f[1] << ", " << vt.f[2] << ", " << vt.f[3];
|
|
@@ -289,54 +314,51 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
|
|
|
289
314
|
}
|
|
290
315
|
#endif
|
|
291
316
|
|
|
292
|
-
template<>
|
|
293
|
-
{
|
|
294
|
-
// FIXME: No intrinsic yet
|
|
317
|
+
template <>
|
|
318
|
+
EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
|
|
295
319
|
EIGEN_DEBUG_ALIGNED_LOAD
|
|
296
|
-
|
|
297
|
-
vfrom = (Packet *) from;
|
|
298
|
-
return vfrom->v4i;
|
|
320
|
+
return vec_xl(0, from);
|
|
299
321
|
}
|
|
300
322
|
|
|
301
|
-
template<>
|
|
302
|
-
{
|
|
303
|
-
// FIXME: No intrinsic yet
|
|
323
|
+
template <>
|
|
324
|
+
EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
|
|
304
325
|
EIGEN_DEBUG_ALIGNED_LOAD
|
|
305
|
-
|
|
306
|
-
vfrom = (Packet *) from;
|
|
307
|
-
return vfrom->v2d;
|
|
326
|
+
return vec_xl(0, from);
|
|
308
327
|
}
|
|
309
328
|
|
|
310
|
-
template<>
|
|
311
|
-
{
|
|
312
|
-
// FIXME: No intrinsic yet
|
|
329
|
+
template <>
|
|
330
|
+
EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
|
|
313
331
|
EIGEN_DEBUG_ALIGNED_STORE
|
|
314
|
-
|
|
315
|
-
vto = (Packet *) to;
|
|
316
|
-
vto->v4i = from;
|
|
332
|
+
vec_xst(from, 0, to);
|
|
317
333
|
}
|
|
318
334
|
|
|
319
|
-
template<>
|
|
320
|
-
{
|
|
321
|
-
// FIXME: No intrinsic yet
|
|
335
|
+
template <>
|
|
336
|
+
EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
|
|
322
337
|
EIGEN_DEBUG_ALIGNED_STORE
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
338
|
+
vec_xst(from, 0, to);
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
template <>
|
|
342
|
+
EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
|
|
343
|
+
return pfrexp_generic(a, exponent);
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
template <>
|
|
347
|
+
EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
|
|
348
|
+
return pfrexp_generic(a, exponent);
|
|
326
349
|
}
|
|
327
350
|
|
|
328
|
-
template<>
|
|
329
|
-
{
|
|
351
|
+
template <>
|
|
352
|
+
EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
|
|
330
353
|
return vec_splats(from);
|
|
331
354
|
}
|
|
332
|
-
template<>
|
|
355
|
+
template <>
|
|
356
|
+
EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
|
|
333
357
|
return vec_splats(from);
|
|
334
358
|
}
|
|
335
359
|
|
|
336
|
-
template<>
|
|
337
|
-
pbroadcast4<Packet4i>(const int
|
|
338
|
-
Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
|
|
339
|
-
{
|
|
360
|
+
template <>
|
|
361
|
+
EIGEN_STRONG_INLINE void pbroadcast4<Packet4i>(const int* a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) {
|
|
340
362
|
a3 = pload<Packet4i>(a);
|
|
341
363
|
a0 = vec_splat(a3, 0);
|
|
342
364
|
a1 = vec_splat(a3, 1);
|
|
@@ -344,187 +366,356 @@ pbroadcast4<Packet4i>(const int *a,
|
|
|
344
366
|
a3 = vec_splat(a3, 3);
|
|
345
367
|
}
|
|
346
368
|
|
|
347
|
-
template<>
|
|
348
|
-
pbroadcast4<Packet2d>(const double
|
|
349
|
-
|
|
350
|
-
{
|
|
369
|
+
template <>
|
|
370
|
+
EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
|
|
371
|
+
Packet2d& a3) {
|
|
351
372
|
a1 = pload<Packet2d>(a);
|
|
352
373
|
a0 = vec_splat(a1, 0);
|
|
353
374
|
a1 = vec_splat(a1, 1);
|
|
354
|
-
a3 = pload<Packet2d>(a+2);
|
|
375
|
+
a3 = pload<Packet2d>(a + 2);
|
|
355
376
|
a2 = vec_splat(a3, 0);
|
|
356
377
|
a3 = vec_splat(a3, 1);
|
|
357
378
|
}
|
|
358
379
|
|
|
359
|
-
template<>
|
|
360
|
-
{
|
|
361
|
-
int
|
|
362
|
-
ai[0] = from[0*stride];
|
|
363
|
-
ai[1] = from[1*stride];
|
|
364
|
-
ai[2] = from[2*stride];
|
|
365
|
-
ai[3] = from[3*stride];
|
|
366
|
-
|
|
380
|
+
template <>
|
|
381
|
+
EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
|
|
382
|
+
EIGEN_ALIGN16 int ai[4];
|
|
383
|
+
ai[0] = from[0 * stride];
|
|
384
|
+
ai[1] = from[1 * stride];
|
|
385
|
+
ai[2] = from[2 * stride];
|
|
386
|
+
ai[3] = from[3 * stride];
|
|
387
|
+
return pload<Packet4i>(ai);
|
|
367
388
|
}
|
|
368
389
|
|
|
369
|
-
template<>
|
|
370
|
-
{
|
|
371
|
-
double
|
|
372
|
-
af[0] = from[0*stride];
|
|
373
|
-
af[1] = from[1*stride];
|
|
374
|
-
|
|
390
|
+
template <>
|
|
391
|
+
EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
|
|
392
|
+
EIGEN_ALIGN16 double af[2];
|
|
393
|
+
af[0] = from[0 * stride];
|
|
394
|
+
af[1] = from[1 * stride];
|
|
395
|
+
return pload<Packet2d>(af);
|
|
375
396
|
}
|
|
376
397
|
|
|
377
|
-
template<>
|
|
378
|
-
{
|
|
379
|
-
int
|
|
380
|
-
pstore<int>((int
|
|
381
|
-
to[0*stride] = ai[0];
|
|
382
|
-
to[1*stride] = ai[1];
|
|
383
|
-
to[2*stride] = ai[2];
|
|
384
|
-
to[3*stride] = ai[3];
|
|
398
|
+
template <>
|
|
399
|
+
EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
|
|
400
|
+
EIGEN_ALIGN16 int ai[4];
|
|
401
|
+
pstore<int>((int*)ai, from);
|
|
402
|
+
to[0 * stride] = ai[0];
|
|
403
|
+
to[1 * stride] = ai[1];
|
|
404
|
+
to[2 * stride] = ai[2];
|
|
405
|
+
to[3 * stride] = ai[3];
|
|
385
406
|
}
|
|
386
407
|
|
|
387
|
-
template<>
|
|
388
|
-
{
|
|
389
|
-
double
|
|
408
|
+
template <>
|
|
409
|
+
EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
|
|
410
|
+
EIGEN_ALIGN16 double af[2];
|
|
390
411
|
pstore<double>(af, from);
|
|
391
|
-
to[0*stride] = af[0];
|
|
392
|
-
to[1*stride] = af[1];
|
|
412
|
+
to[0 * stride] = af[0];
|
|
413
|
+
to[1 * stride] = af[1];
|
|
393
414
|
}
|
|
394
415
|
|
|
395
|
-
template<>
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
template<>
|
|
416
|
+
template <>
|
|
417
|
+
EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
418
|
+
return (a + b);
|
|
419
|
+
}
|
|
420
|
+
template <>
|
|
421
|
+
EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
422
|
+
return (a + b);
|
|
423
|
+
}
|
|
400
424
|
|
|
401
|
-
template<>
|
|
402
|
-
|
|
425
|
+
template <>
|
|
426
|
+
EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
427
|
+
return (a - b);
|
|
428
|
+
}
|
|
429
|
+
template <>
|
|
430
|
+
EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
431
|
+
return (a - b);
|
|
432
|
+
}
|
|
403
433
|
|
|
404
|
-
template<>
|
|
405
|
-
|
|
434
|
+
template <>
|
|
435
|
+
EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
436
|
+
return (a * b);
|
|
437
|
+
}
|
|
438
|
+
template <>
|
|
439
|
+
EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
440
|
+
return (a * b);
|
|
441
|
+
}
|
|
406
442
|
|
|
407
|
-
template<>
|
|
408
|
-
|
|
443
|
+
template <>
|
|
444
|
+
EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
445
|
+
return (a / b);
|
|
446
|
+
}
|
|
447
|
+
template <>
|
|
448
|
+
EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
449
|
+
return (a / b);
|
|
450
|
+
}
|
|
409
451
|
|
|
410
|
-
template<>
|
|
411
|
-
|
|
452
|
+
template <>
|
|
453
|
+
EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
|
|
454
|
+
return (-a);
|
|
455
|
+
}
|
|
456
|
+
template <>
|
|
457
|
+
EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
|
|
458
|
+
return (-a);
|
|
459
|
+
}
|
|
412
460
|
|
|
413
|
-
template<>
|
|
414
|
-
|
|
461
|
+
template <>
|
|
462
|
+
EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
|
|
463
|
+
return a;
|
|
464
|
+
}
|
|
465
|
+
template <>
|
|
466
|
+
EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
|
|
467
|
+
return a;
|
|
468
|
+
}
|
|
415
469
|
|
|
416
|
-
template<>
|
|
417
|
-
|
|
470
|
+
template <>
|
|
471
|
+
EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
|
|
472
|
+
return padd<Packet4i>(pmul<Packet4i>(a, b), c);
|
|
473
|
+
}
|
|
474
|
+
template <>
|
|
475
|
+
EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
|
|
476
|
+
return vec_madd(a, b, c);
|
|
477
|
+
}
|
|
418
478
|
|
|
419
|
-
template<>
|
|
420
|
-
|
|
479
|
+
template <>
|
|
480
|
+
EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
|
|
481
|
+
return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN);
|
|
482
|
+
}
|
|
483
|
+
template <>
|
|
484
|
+
EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
|
|
485
|
+
return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN);
|
|
486
|
+
}
|
|
421
487
|
|
|
422
|
-
template<>
|
|
423
|
-
|
|
488
|
+
template <>
|
|
489
|
+
EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
490
|
+
return vec_min(a, b);
|
|
491
|
+
}
|
|
492
|
+
template <>
|
|
493
|
+
EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
494
|
+
return vec_min(a, b);
|
|
495
|
+
}
|
|
424
496
|
|
|
425
|
-
template<>
|
|
426
|
-
|
|
497
|
+
template <>
|
|
498
|
+
EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
499
|
+
return vec_max(a, b);
|
|
500
|
+
}
|
|
501
|
+
template <>
|
|
502
|
+
EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
503
|
+
return vec_max(a, b);
|
|
504
|
+
}
|
|
427
505
|
|
|
428
|
-
template<>
|
|
429
|
-
|
|
506
|
+
template <>
|
|
507
|
+
EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
508
|
+
return vec_and(a, b);
|
|
509
|
+
}
|
|
510
|
+
template <>
|
|
511
|
+
EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
512
|
+
return vec_and(a, b);
|
|
513
|
+
}
|
|
430
514
|
|
|
431
|
-
template<>
|
|
432
|
-
|
|
515
|
+
template <>
|
|
516
|
+
EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
517
|
+
return vec_or(a, b);
|
|
518
|
+
}
|
|
519
|
+
template <>
|
|
520
|
+
EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
521
|
+
return vec_or(a, b);
|
|
522
|
+
}
|
|
433
523
|
|
|
434
|
-
template<>
|
|
435
|
-
|
|
524
|
+
template <>
|
|
525
|
+
EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
526
|
+
return vec_xor(a, b);
|
|
527
|
+
}
|
|
528
|
+
template <>
|
|
529
|
+
EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
530
|
+
return vec_xor(a, b);
|
|
531
|
+
}
|
|
436
532
|
|
|
437
|
-
template<>
|
|
438
|
-
|
|
439
|
-
|
|
533
|
+
template <>
|
|
534
|
+
EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
535
|
+
return pand<Packet4i>(a, vec_nor(b, b));
|
|
536
|
+
}
|
|
537
|
+
template <>
|
|
538
|
+
EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
539
|
+
return vec_and(a, vec_nor(b, b));
|
|
540
|
+
}
|
|
440
541
|
|
|
441
|
-
template<>
|
|
442
|
-
|
|
542
|
+
template <>
|
|
543
|
+
EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
|
|
544
|
+
/* Uses non-default rounding for vec_round */
|
|
545
|
+
return __builtin_s390_vfidb(a, 0, 1);
|
|
546
|
+
}
|
|
547
|
+
template <>
|
|
548
|
+
EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
|
|
549
|
+
return vec_ceil(a);
|
|
550
|
+
}
|
|
551
|
+
template <>
|
|
552
|
+
EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
|
|
553
|
+
return vec_floor(a);
|
|
554
|
+
}
|
|
443
555
|
|
|
556
|
+
template <>
|
|
557
|
+
EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
|
|
558
|
+
return pload<Packet4i>(from);
|
|
559
|
+
}
|
|
560
|
+
template <>
|
|
561
|
+
EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
|
|
562
|
+
return pload<Packet2d>(from);
|
|
563
|
+
}
|
|
444
564
|
|
|
445
|
-
template<>
|
|
446
|
-
{
|
|
565
|
+
template <>
|
|
566
|
+
EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
|
|
447
567
|
Packet4i p = pload<Packet4i>(from);
|
|
448
568
|
return vec_perm(p, p, p16uc_DUPLICATE32_HI);
|
|
449
569
|
}
|
|
450
570
|
|
|
451
|
-
template<>
|
|
452
|
-
{
|
|
571
|
+
template <>
|
|
572
|
+
EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
|
|
453
573
|
Packet2d p = pload<Packet2d>(from);
|
|
454
574
|
return vec_perm(p, p, p16uc_PSET64_HI);
|
|
455
575
|
}
|
|
456
576
|
|
|
457
|
-
template<>
|
|
458
|
-
|
|
577
|
+
template <>
|
|
578
|
+
EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
|
|
579
|
+
pstore<int>(to, from);
|
|
580
|
+
}
|
|
581
|
+
template <>
|
|
582
|
+
EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
|
|
583
|
+
pstore<double>(to, from);
|
|
584
|
+
}
|
|
459
585
|
|
|
460
|
-
template<>
|
|
461
|
-
|
|
586
|
+
template <>
|
|
587
|
+
EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
|
|
588
|
+
EIGEN_ZVECTOR_PREFETCH(addr);
|
|
589
|
+
}
|
|
590
|
+
template <>
|
|
591
|
+
EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
|
|
592
|
+
EIGEN_ZVECTOR_PREFETCH(addr);
|
|
593
|
+
}
|
|
462
594
|
|
|
463
|
-
template
|
|
464
|
-
|
|
595
|
+
template <int N>
|
|
596
|
+
EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) {
|
|
597
|
+
return Packet2l { parithmetic_shift_right<N>(a[0]), parithmetic_shift_right<N>(a[1]) };
|
|
598
|
+
}
|
|
599
|
+
template <int N>
|
|
600
|
+
EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
|
|
601
|
+
return Packet4i {
|
|
602
|
+
parithmetic_shift_right<N>(a[0]),
|
|
603
|
+
parithmetic_shift_right<N>(a[1]),
|
|
604
|
+
parithmetic_shift_right<N>(a[2]),
|
|
605
|
+
parithmetic_shift_right<N>(a[3]) };
|
|
606
|
+
}
|
|
465
607
|
|
|
466
|
-
template
|
|
467
|
-
{
|
|
468
|
-
return
|
|
608
|
+
template <int N>
|
|
609
|
+
EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
|
|
610
|
+
return Packet2l { plogical_shift_right<N>(a[0]), plogical_shift_right<N>(a[1]) };
|
|
611
|
+
}
|
|
612
|
+
template <int N>
|
|
613
|
+
EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
|
|
614
|
+
return Packet4i {
|
|
615
|
+
plogical_shift_right<N>(a[0]),
|
|
616
|
+
plogical_shift_right<N>(a[1]),
|
|
617
|
+
plogical_shift_right<N>(a[2]),
|
|
618
|
+
plogical_shift_right<N>(a[3]) };
|
|
469
619
|
}
|
|
470
620
|
|
|
471
|
-
template
|
|
472
|
-
{
|
|
473
|
-
return
|
|
621
|
+
template <int N>
|
|
622
|
+
EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
|
|
623
|
+
return Packet2l { plogical_shift_left<N>(a[0]), plogical_shift_left<N>(a[1]) };
|
|
624
|
+
}
|
|
625
|
+
template <int N>
|
|
626
|
+
EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
|
|
627
|
+
return Packet4i {
|
|
628
|
+
plogical_shift_left<N>(a[0]),
|
|
629
|
+
plogical_shift_left<N>(a[1]),
|
|
630
|
+
plogical_shift_left<N>(a[2]),
|
|
631
|
+
plogical_shift_left<N>(a[3]) };
|
|
474
632
|
}
|
|
475
633
|
|
|
476
|
-
template<>
|
|
477
|
-
|
|
634
|
+
template <>
|
|
635
|
+
EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
|
|
636
|
+
EIGEN_ALIGN16 int x[4];
|
|
637
|
+
pstore(x, a);
|
|
638
|
+
return x[0];
|
|
639
|
+
}
|
|
640
|
+
template <>
|
|
641
|
+
EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
|
|
642
|
+
EIGEN_ALIGN16 double x[2];
|
|
643
|
+
pstore(x, a);
|
|
644
|
+
return x[0];
|
|
645
|
+
}
|
|
646
|
+
|
|
647
|
+
template <>
|
|
648
|
+
EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
|
|
649
|
+
return reinterpret_cast<Packet4i>(
|
|
650
|
+
vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
|
|
651
|
+
}
|
|
478
652
|
|
|
479
|
-
template<>
|
|
480
|
-
{
|
|
653
|
+
template <>
|
|
654
|
+
EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
|
|
655
|
+
return reinterpret_cast<Packet2d>(
|
|
656
|
+
vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
|
|
657
|
+
}
|
|
658
|
+
|
|
659
|
+
template <>
|
|
660
|
+
EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) {
|
|
661
|
+
return vec_abs(a);
|
|
662
|
+
}
|
|
663
|
+
template <>
|
|
664
|
+
EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) {
|
|
665
|
+
return vec_abs(a);
|
|
666
|
+
}
|
|
667
|
+
|
|
668
|
+
template <>
|
|
669
|
+
EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
|
|
481
670
|
Packet4i b, sum;
|
|
482
|
-
b
|
|
671
|
+
b = vec_sld(a, a, 8);
|
|
483
672
|
sum = padd<Packet4i>(a, b);
|
|
484
|
-
b
|
|
673
|
+
b = vec_sld(sum, sum, 4);
|
|
485
674
|
sum = padd<Packet4i>(sum, b);
|
|
486
675
|
return pfirst(sum);
|
|
487
676
|
}
|
|
488
677
|
|
|
489
|
-
template<>
|
|
490
|
-
{
|
|
678
|
+
template <>
|
|
679
|
+
EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
|
|
491
680
|
Packet2d b, sum;
|
|
492
|
-
b
|
|
681
|
+
b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8));
|
|
493
682
|
sum = padd<Packet2d>(a, b);
|
|
494
683
|
return pfirst(sum);
|
|
495
684
|
}
|
|
496
685
|
|
|
497
686
|
// Other reduction functions:
|
|
498
687
|
// mul
|
|
499
|
-
template<>
|
|
500
|
-
{
|
|
688
|
+
template <>
|
|
689
|
+
EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
|
|
501
690
|
EIGEN_ALIGN16 int aux[4];
|
|
502
691
|
pstore(aux, a);
|
|
503
692
|
return aux[0] * aux[1] * aux[2] * aux[3];
|
|
504
693
|
}
|
|
505
694
|
|
|
506
|
-
template<>
|
|
507
|
-
{
|
|
508
|
-
return pfirst(
|
|
695
|
+
template <>
|
|
696
|
+
EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
|
|
697
|
+
return pfirst(
|
|
698
|
+
pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
|
|
509
699
|
}
|
|
510
700
|
|
|
511
701
|
// min
|
|
512
|
-
template<>
|
|
513
|
-
{
|
|
702
|
+
template <>
|
|
703
|
+
EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
|
|
514
704
|
Packet4i b, res;
|
|
515
|
-
b
|
|
705
|
+
b = pmin<Packet4i>(a, vec_sld(a, a, 8));
|
|
516
706
|
res = pmin<Packet4i>(b, vec_sld(b, b, 4));
|
|
517
707
|
return pfirst(res);
|
|
518
708
|
}
|
|
519
709
|
|
|
520
|
-
template<>
|
|
521
|
-
{
|
|
522
|
-
return pfirst(pmin<Packet2d>(
|
|
710
|
+
template <>
|
|
711
|
+
EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
|
|
712
|
+
return pfirst(pmin<Packet2d>(
|
|
713
|
+
a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
|
|
523
714
|
}
|
|
524
715
|
|
|
525
716
|
// max
|
|
526
|
-
template<>
|
|
527
|
-
{
|
|
717
|
+
template <>
|
|
718
|
+
EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
|
|
528
719
|
Packet4i b, res;
|
|
529
720
|
b = pmax<Packet4i>(a, vec_sld(a, a, 8));
|
|
530
721
|
res = pmax<Packet4i>(b, vec_sld(b, b, 4));
|
|
@@ -532,13 +723,13 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
|
|
|
532
723
|
}
|
|
533
724
|
|
|
534
725
|
// max
|
|
535
|
-
template<>
|
|
536
|
-
{
|
|
537
|
-
return pfirst(pmax<Packet2d>(
|
|
726
|
+
template <>
|
|
727
|
+
EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
|
|
728
|
+
return pfirst(pmax<Packet2d>(
|
|
729
|
+
a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
|
|
538
730
|
}
|
|
539
731
|
|
|
540
|
-
EIGEN_DEVICE_FUNC inline void
|
|
541
|
-
ptranspose(PacketBlock<Packet4i,4>& kernel) {
|
|
732
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
|
|
542
733
|
Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
|
|
543
734
|
Packet4i t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
|
|
544
735
|
Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
|
|
@@ -549,23 +740,25 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
|
|
|
549
740
|
kernel.packet[3] = vec_mergel(t1, t3);
|
|
550
741
|
}
|
|
551
742
|
|
|
552
|
-
EIGEN_DEVICE_FUNC inline void
|
|
553
|
-
ptranspose(PacketBlock<Packet2d,2>& kernel) {
|
|
743
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
|
|
554
744
|
Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
|
|
555
745
|
Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
|
|
556
746
|
kernel.packet[0] = t0;
|
|
557
747
|
kernel.packet[1] = t1;
|
|
558
748
|
}
|
|
559
749
|
|
|
560
|
-
template<>
|
|
561
|
-
|
|
750
|
+
template <>
|
|
751
|
+
EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
|
|
752
|
+
const Packet4i& elsePacket) {
|
|
753
|
+
Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
|
|
562
754
|
Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
|
|
563
755
|
return vec_sel(elsePacket, thenPacket, mask);
|
|
564
756
|
}
|
|
565
757
|
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
758
|
+
template <>
|
|
759
|
+
EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
|
|
760
|
+
const Packet2d& elsePacket) {
|
|
761
|
+
Packet2ul select = {ifPacket.select[0], ifPacket.select[1]};
|
|
569
762
|
Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
|
|
570
763
|
return vec_sel(elsePacket, thenPacket, mask);
|
|
571
764
|
}
|
|
@@ -576,32 +769,32 @@ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, cons
|
|
|
576
769
|
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
|
|
577
770
|
/* Helper function to simulate a vec_splat_packet4f
|
|
578
771
|
*/
|
|
579
|
-
template<int element>
|
|
580
|
-
{
|
|
772
|
+
template <int element>
|
|
773
|
+
EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) {
|
|
581
774
|
Packet4f splat;
|
|
582
775
|
switch (element) {
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
776
|
+
case 0:
|
|
777
|
+
splat.v4f[0] = vec_splat(from.v4f[0], 0);
|
|
778
|
+
splat.v4f[1] = splat.v4f[0];
|
|
779
|
+
break;
|
|
780
|
+
case 1:
|
|
781
|
+
splat.v4f[0] = vec_splat(from.v4f[0], 1);
|
|
782
|
+
splat.v4f[1] = splat.v4f[0];
|
|
783
|
+
break;
|
|
784
|
+
case 2:
|
|
785
|
+
splat.v4f[0] = vec_splat(from.v4f[1], 0);
|
|
786
|
+
splat.v4f[1] = splat.v4f[0];
|
|
787
|
+
break;
|
|
788
|
+
case 3:
|
|
789
|
+
splat.v4f[0] = vec_splat(from.v4f[1], 1);
|
|
790
|
+
splat.v4f[1] = splat.v4f[0];
|
|
791
|
+
break;
|
|
599
792
|
}
|
|
600
793
|
return splat;
|
|
601
794
|
}
|
|
602
795
|
|
|
603
|
-
template<>
|
|
604
|
-
{
|
|
796
|
+
template <>
|
|
797
|
+
EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
|
|
605
798
|
// FIXME: No intrinsic yet
|
|
606
799
|
EIGEN_DEBUG_ALIGNED_LOAD
|
|
607
800
|
Packet4f vfrom;
|
|
@@ -610,26 +803,24 @@ template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
|
|
|
610
803
|
return vfrom;
|
|
611
804
|
}
|
|
612
805
|
|
|
613
|
-
template<>
|
|
614
|
-
{
|
|
806
|
+
template <>
|
|
807
|
+
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
|
|
615
808
|
// FIXME: No intrinsic yet
|
|
616
809
|
EIGEN_DEBUG_ALIGNED_STORE
|
|
617
810
|
vec_st2f(from.v4f[0], &to[0]);
|
|
618
811
|
vec_st2f(from.v4f[1], &to[2]);
|
|
619
812
|
}
|
|
620
813
|
|
|
621
|
-
template<>
|
|
622
|
-
{
|
|
814
|
+
template <>
|
|
815
|
+
EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
|
|
623
816
|
Packet4f to;
|
|
624
817
|
to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
|
|
625
818
|
to.v4f[1] = to.v4f[0];
|
|
626
819
|
return to;
|
|
627
820
|
}
|
|
628
821
|
|
|
629
|
-
template<>
|
|
630
|
-
pbroadcast4<Packet4f>(const float
|
|
631
|
-
Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
|
|
632
|
-
{
|
|
822
|
+
template <>
|
|
823
|
+
EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
|
|
633
824
|
a3 = pload<Packet4f>(a);
|
|
634
825
|
a0 = vec_splat_packet4f<0>(a3);
|
|
635
826
|
a1 = vec_splat_packet4f<1>(a3);
|
|
@@ -637,207 +828,213 @@ pbroadcast4<Packet4f>(const float *a,
|
|
|
637
828
|
a3 = vec_splat_packet4f<3>(a3);
|
|
638
829
|
}
|
|
639
830
|
|
|
640
|
-
template<>
|
|
641
|
-
{
|
|
642
|
-
float
|
|
643
|
-
ai[0] = from[0*stride];
|
|
644
|
-
ai[1] = from[1*stride];
|
|
645
|
-
ai[2] = from[2*stride];
|
|
646
|
-
ai[3] = from[3*stride];
|
|
647
|
-
|
|
831
|
+
template <>
|
|
832
|
+
EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
|
|
833
|
+
EIGEN_ALIGN16 float ai[4];
|
|
834
|
+
ai[0] = from[0 * stride];
|
|
835
|
+
ai[1] = from[1 * stride];
|
|
836
|
+
ai[2] = from[2 * stride];
|
|
837
|
+
ai[3] = from[3 * stride];
|
|
838
|
+
return pload<Packet4f>(ai);
|
|
648
839
|
}
|
|
649
840
|
|
|
650
|
-
template<>
|
|
651
|
-
{
|
|
652
|
-
float
|
|
653
|
-
pstore<float>((float
|
|
654
|
-
to[0*stride] = ai[0];
|
|
655
|
-
to[1*stride] = ai[1];
|
|
656
|
-
to[2*stride] = ai[2];
|
|
657
|
-
to[3*stride] = ai[3];
|
|
841
|
+
template <>
|
|
842
|
+
EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
|
|
843
|
+
EIGEN_ALIGN16 float ai[4];
|
|
844
|
+
pstore<float>((float*)ai, from);
|
|
845
|
+
to[0 * stride] = ai[0];
|
|
846
|
+
to[1 * stride] = ai[1];
|
|
847
|
+
to[2 * stride] = ai[2];
|
|
848
|
+
to[3 * stride] = ai[3];
|
|
658
849
|
}
|
|
659
850
|
|
|
660
|
-
template<>
|
|
661
|
-
{
|
|
851
|
+
template <>
|
|
852
|
+
EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
662
853
|
Packet4f c;
|
|
663
854
|
c.v4f[0] = a.v4f[0] + b.v4f[0];
|
|
664
855
|
c.v4f[1] = a.v4f[1] + b.v4f[1];
|
|
665
856
|
return c;
|
|
666
857
|
}
|
|
667
858
|
|
|
668
|
-
template<>
|
|
669
|
-
{
|
|
859
|
+
template <>
|
|
860
|
+
EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
670
861
|
Packet4f c;
|
|
671
862
|
c.v4f[0] = a.v4f[0] - b.v4f[0];
|
|
672
863
|
c.v4f[1] = a.v4f[1] - b.v4f[1];
|
|
673
864
|
return c;
|
|
674
865
|
}
|
|
675
866
|
|
|
676
|
-
template<>
|
|
677
|
-
{
|
|
867
|
+
template <>
|
|
868
|
+
EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
678
869
|
Packet4f c;
|
|
679
870
|
c.v4f[0] = a.v4f[0] * b.v4f[0];
|
|
680
871
|
c.v4f[1] = a.v4f[1] * b.v4f[1];
|
|
681
872
|
return c;
|
|
682
873
|
}
|
|
683
874
|
|
|
684
|
-
template<>
|
|
685
|
-
{
|
|
875
|
+
template <>
|
|
876
|
+
EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
686
877
|
Packet4f c;
|
|
687
878
|
c.v4f[0] = a.v4f[0] / b.v4f[0];
|
|
688
879
|
c.v4f[1] = a.v4f[1] / b.v4f[1];
|
|
689
880
|
return c;
|
|
690
881
|
}
|
|
691
882
|
|
|
692
|
-
template<>
|
|
693
|
-
{
|
|
883
|
+
template <>
|
|
884
|
+
EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
|
|
694
885
|
Packet4f c;
|
|
695
886
|
c.v4f[0] = -a.v4f[0];
|
|
696
887
|
c.v4f[1] = -a.v4f[1];
|
|
697
888
|
return c;
|
|
698
889
|
}
|
|
699
890
|
|
|
700
|
-
template<>
|
|
701
|
-
{
|
|
891
|
+
template <>
|
|
892
|
+
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
|
|
702
893
|
Packet4f res;
|
|
703
894
|
res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
|
|
704
895
|
res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
|
|
705
896
|
return res;
|
|
706
897
|
}
|
|
707
898
|
|
|
708
|
-
template<>
|
|
709
|
-
{
|
|
899
|
+
template <>
|
|
900
|
+
EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
710
901
|
Packet4f res;
|
|
711
902
|
res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
|
|
712
903
|
res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
|
|
713
904
|
return res;
|
|
714
905
|
}
|
|
715
906
|
|
|
716
|
-
template<>
|
|
717
|
-
{
|
|
907
|
+
template <>
|
|
908
|
+
EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
718
909
|
Packet4f res;
|
|
719
910
|
res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
|
|
720
911
|
res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
|
|
721
912
|
return res;
|
|
722
913
|
}
|
|
723
914
|
|
|
724
|
-
template<>
|
|
725
|
-
{
|
|
915
|
+
template <>
|
|
916
|
+
EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
726
917
|
Packet4f res;
|
|
727
918
|
res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
|
|
728
919
|
res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
|
|
729
920
|
return res;
|
|
730
921
|
}
|
|
731
922
|
|
|
732
|
-
template<>
|
|
733
|
-
{
|
|
923
|
+
template <>
|
|
924
|
+
EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
734
925
|
Packet4f res;
|
|
735
926
|
res.v4f[0] = por(a.v4f[0], b.v4f[0]);
|
|
736
927
|
res.v4f[1] = por(a.v4f[1], b.v4f[1]);
|
|
737
928
|
return res;
|
|
738
929
|
}
|
|
739
930
|
|
|
740
|
-
template<>
|
|
741
|
-
{
|
|
931
|
+
template <>
|
|
932
|
+
EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
742
933
|
Packet4f res;
|
|
743
934
|
res.v4f[0] = pxor(a.v4f[0], b.v4f[0]);
|
|
744
935
|
res.v4f[1] = pxor(a.v4f[1], b.v4f[1]);
|
|
745
936
|
return res;
|
|
746
937
|
}
|
|
747
938
|
|
|
748
|
-
template<>
|
|
749
|
-
{
|
|
939
|
+
template <>
|
|
940
|
+
EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
750
941
|
Packet4f res;
|
|
751
942
|
res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
|
|
752
943
|
res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
|
|
753
944
|
return res;
|
|
754
945
|
}
|
|
755
946
|
|
|
756
|
-
template<>
|
|
757
|
-
{
|
|
947
|
+
template <>
|
|
948
|
+
EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
|
|
758
949
|
Packet4f res;
|
|
759
|
-
res.v4f[0] =
|
|
760
|
-
res.v4f[1] =
|
|
950
|
+
res.v4f[0] = generic_round(a.v4f[0]);
|
|
951
|
+
res.v4f[1] = generic_round(a.v4f[1]);
|
|
761
952
|
return res;
|
|
762
953
|
}
|
|
763
954
|
|
|
764
|
-
template<>
|
|
765
|
-
{
|
|
955
|
+
template <>
|
|
956
|
+
EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
|
|
766
957
|
Packet4f res;
|
|
767
958
|
res.v4f[0] = vec_ceil(a.v4f[0]);
|
|
768
959
|
res.v4f[1] = vec_ceil(a.v4f[1]);
|
|
769
960
|
return res;
|
|
770
961
|
}
|
|
771
962
|
|
|
772
|
-
template<>
|
|
773
|
-
{
|
|
963
|
+
template <>
|
|
964
|
+
EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
|
|
774
965
|
Packet4f res;
|
|
775
966
|
res.v4f[0] = vec_floor(a.v4f[0]);
|
|
776
967
|
res.v4f[1] = vec_floor(a.v4f[1]);
|
|
777
968
|
return res;
|
|
778
969
|
}
|
|
779
970
|
|
|
780
|
-
template<>
|
|
781
|
-
{
|
|
971
|
+
template <>
|
|
972
|
+
EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
|
|
782
973
|
Packet4f p = pload<Packet4f>(from);
|
|
783
974
|
p.v4f[1] = vec_splat(p.v4f[0], 1);
|
|
784
975
|
p.v4f[0] = vec_splat(p.v4f[0], 0);
|
|
785
976
|
return p;
|
|
786
977
|
}
|
|
787
978
|
|
|
788
|
-
template<>
|
|
979
|
+
template <>
|
|
980
|
+
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
|
|
981
|
+
EIGEN_ALIGN16 float x[2];
|
|
982
|
+
vec_st2f(a.v4f[0], &x[0]);
|
|
983
|
+
return x[0];
|
|
984
|
+
}
|
|
789
985
|
|
|
790
|
-
template<>
|
|
791
|
-
{
|
|
986
|
+
template <>
|
|
987
|
+
EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
|
|
792
988
|
Packet4f rev;
|
|
793
989
|
rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
|
|
794
990
|
rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
|
|
795
991
|
return rev;
|
|
796
992
|
}
|
|
797
993
|
|
|
798
|
-
template<>
|
|
799
|
-
{
|
|
994
|
+
template <>
|
|
995
|
+
EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a) {
|
|
800
996
|
Packet4f res;
|
|
801
997
|
res.v4f[0] = pabs(a.v4f[0]);
|
|
802
998
|
res.v4f[1] = pabs(a.v4f[1]);
|
|
803
999
|
return res;
|
|
804
1000
|
}
|
|
805
1001
|
|
|
806
|
-
template<>
|
|
807
|
-
{
|
|
1002
|
+
template <>
|
|
1003
|
+
EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
|
|
808
1004
|
Packet2d sum;
|
|
809
1005
|
sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
|
|
810
1006
|
double first = predux<Packet2d>(sum);
|
|
811
1007
|
return static_cast<float>(first);
|
|
812
1008
|
}
|
|
813
1009
|
|
|
814
|
-
template<>
|
|
815
|
-
{
|
|
1010
|
+
template <>
|
|
1011
|
+
EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
|
|
816
1012
|
// Return predux_mul<Packet2d> of the subvectors product
|
|
817
1013
|
return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
|
|
818
1014
|
}
|
|
819
1015
|
|
|
820
|
-
template<>
|
|
821
|
-
{
|
|
1016
|
+
template <>
|
|
1017
|
+
EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
|
|
822
1018
|
Packet2d b, res;
|
|
823
|
-
b
|
|
824
|
-
res = pmin<Packet2d>(
|
|
1019
|
+
b = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
|
|
1020
|
+
res = pmin<Packet2d>(
|
|
1021
|
+
b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
|
|
825
1022
|
return static_cast<float>(pfirst(res));
|
|
826
1023
|
}
|
|
827
1024
|
|
|
828
|
-
template<>
|
|
829
|
-
{
|
|
1025
|
+
template <>
|
|
1026
|
+
EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
|
|
830
1027
|
Packet2d b, res;
|
|
831
|
-
b
|
|
832
|
-
res = pmax<Packet2d>(
|
|
1028
|
+
b = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
|
|
1029
|
+
res = pmax<Packet2d>(
|
|
1030
|
+
b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
|
|
833
1031
|
return static_cast<float>(pfirst(res));
|
|
834
1032
|
}
|
|
835
1033
|
|
|
836
1034
|
/* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one
|
|
837
1035
|
*/
|
|
838
|
-
EIGEN_DEVICE_FUNC inline void
|
|
839
|
-
|
|
840
|
-
PacketBlock<Packet2d,2> t0,t1,t2,t3;
|
|
1036
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
|
|
1037
|
+
PacketBlock<Packet2d, 2> t0, t1, t2, t3;
|
|
841
1038
|
// copy top-left 2x2 Packet2d block
|
|
842
1039
|
t0.packet[0] = kernel.packet[0].v4f[0];
|
|
843
1040
|
t0.packet[1] = kernel.packet[1].v4f[0];
|
|
@@ -871,9 +1068,11 @@ ptranspose(PacketBlock<Packet4f,4>& kernel) {
|
|
|
871
1068
|
kernel.packet[3].v4f[1] = t3.packet[1];
|
|
872
1069
|
}
|
|
873
1070
|
|
|
874
|
-
template<>
|
|
875
|
-
|
|
876
|
-
|
|
1071
|
+
template <>
|
|
1072
|
+
EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
|
|
1073
|
+
const Packet4f& elsePacket) {
|
|
1074
|
+
Packet2ul select_hi = {ifPacket.select[0], ifPacket.select[1]};
|
|
1075
|
+
Packet2ul select_lo = {ifPacket.select[2], ifPacket.select[3]};
|
|
877
1076
|
Packet2ul mask_hi = vec_cmpeq(select_hi, reinterpret_cast<Packet2ul>(p2l_ONE));
|
|
878
1077
|
Packet2ul mask_lo = vec_cmpeq(select_lo, reinterpret_cast<Packet2ul>(p2l_ONE));
|
|
879
1078
|
Packet4f result;
|
|
@@ -882,24 +1081,24 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons
|
|
|
882
1081
|
return result;
|
|
883
1082
|
}
|
|
884
1083
|
|
|
885
|
-
template<>
|
|
886
|
-
{
|
|
1084
|
+
template <>
|
|
1085
|
+
Packet4f EIGEN_STRONG_INLINE pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
887
1086
|
Packet4f res;
|
|
888
1087
|
res.v4f[0] = pcmp_le(a.v4f[0], b.v4f[0]);
|
|
889
1088
|
res.v4f[1] = pcmp_le(a.v4f[1], b.v4f[1]);
|
|
890
1089
|
return res;
|
|
891
1090
|
}
|
|
892
1091
|
|
|
893
|
-
template<>
|
|
894
|
-
{
|
|
1092
|
+
template <>
|
|
1093
|
+
Packet4f EIGEN_STRONG_INLINE pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
895
1094
|
Packet4f res;
|
|
896
1095
|
res.v4f[0] = pcmp_lt(a.v4f[0], b.v4f[0]);
|
|
897
1096
|
res.v4f[1] = pcmp_lt(a.v4f[1], b.v4f[1]);
|
|
898
1097
|
return res;
|
|
899
1098
|
}
|
|
900
1099
|
|
|
901
|
-
template<>
|
|
902
|
-
{
|
|
1100
|
+
template <>
|
|
1101
|
+
Packet4f EIGEN_STRONG_INLINE pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
903
1102
|
Packet4f res;
|
|
904
1103
|
res.v4f[0] = pcmp_eq(a.v4f[0], b.v4f[0]);
|
|
905
1104
|
res.v4f[1] = pcmp_eq(a.v4f[1], b.v4f[1]);
|
|
@@ -907,33 +1106,25 @@ template<> Packet4f EIGEN_STRONG_INLINE pcmp_eq<Packet4f>(const Packet4f& a, con
|
|
|
907
1106
|
}
|
|
908
1107
|
|
|
909
1108
|
#else
|
|
910
|
-
template<>
|
|
911
|
-
{
|
|
912
|
-
// FIXME: No intrinsic yet
|
|
1109
|
+
template <>
|
|
1110
|
+
EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
|
|
913
1111
|
EIGEN_DEBUG_ALIGNED_LOAD
|
|
914
|
-
|
|
915
|
-
vfrom = (Packet *) from;
|
|
916
|
-
return vfrom->v4f;
|
|
1112
|
+
return vec_xl(0, from);
|
|
917
1113
|
}
|
|
918
1114
|
|
|
919
|
-
template<>
|
|
920
|
-
{
|
|
921
|
-
// FIXME: No intrinsic yet
|
|
1115
|
+
template <>
|
|
1116
|
+
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
|
|
922
1117
|
EIGEN_DEBUG_ALIGNED_STORE
|
|
923
|
-
|
|
924
|
-
vto = (Packet *) to;
|
|
925
|
-
vto->v4f = from;
|
|
1118
|
+
vec_xst(from, 0, to);
|
|
926
1119
|
}
|
|
927
1120
|
|
|
928
|
-
template<>
|
|
929
|
-
{
|
|
1121
|
+
template <>
|
|
1122
|
+
EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
|
|
930
1123
|
return vec_splats(from);
|
|
931
1124
|
}
|
|
932
1125
|
|
|
933
|
-
template<>
|
|
934
|
-
pbroadcast4<Packet4f>(const float
|
|
935
|
-
Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
|
|
936
|
-
{
|
|
1126
|
+
template <>
|
|
1127
|
+
EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
|
|
937
1128
|
a3 = pload<Packet4f>(a);
|
|
938
1129
|
a0 = vec_splat(a3, 0);
|
|
939
1130
|
a1 = vec_splat(a3, 1);
|
|
@@ -941,95 +1132,152 @@ pbroadcast4<Packet4f>(const float *a,
|
|
|
941
1132
|
a3 = vec_splat(a3, 3);
|
|
942
1133
|
}
|
|
943
1134
|
|
|
944
|
-
template<>
|
|
945
|
-
{
|
|
946
|
-
float
|
|
947
|
-
af[0] = from[0*stride];
|
|
948
|
-
af[1] = from[1*stride];
|
|
949
|
-
af[2] = from[2*stride];
|
|
950
|
-
af[3] = from[3*stride];
|
|
951
|
-
|
|
1135
|
+
template <>
|
|
1136
|
+
EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
|
|
1137
|
+
EIGEN_ALIGN16 float af[4];
|
|
1138
|
+
af[0] = from[0 * stride];
|
|
1139
|
+
af[1] = from[1 * stride];
|
|
1140
|
+
af[2] = from[2 * stride];
|
|
1141
|
+
af[3] = from[3 * stride];
|
|
1142
|
+
return pload<Packet4f>(af);
|
|
952
1143
|
}
|
|
953
1144
|
|
|
954
|
-
template<>
|
|
955
|
-
{
|
|
956
|
-
float
|
|
1145
|
+
template <>
|
|
1146
|
+
EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
|
|
1147
|
+
EIGEN_ALIGN16 float af[4];
|
|
957
1148
|
pstore<float>((float*)af, from);
|
|
958
|
-
to[0*stride] = af[0];
|
|
959
|
-
to[1*stride] = af[1];
|
|
960
|
-
to[2*stride] = af[2];
|
|
961
|
-
to[3*stride] = af[3];
|
|
962
|
-
}
|
|
963
|
-
|
|
964
|
-
template<>
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
template<>
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
template<>
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
template<>
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
template<>
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
1149
|
+
to[0 * stride] = af[0];
|
|
1150
|
+
to[1 * stride] = af[1];
|
|
1151
|
+
to[2 * stride] = af[2];
|
|
1152
|
+
to[3 * stride] = af[3];
|
|
1153
|
+
}
|
|
1154
|
+
|
|
1155
|
+
template <>
|
|
1156
|
+
EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1157
|
+
return (a + b);
|
|
1158
|
+
}
|
|
1159
|
+
template <>
|
|
1160
|
+
EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1161
|
+
return (a - b);
|
|
1162
|
+
}
|
|
1163
|
+
template <>
|
|
1164
|
+
EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1165
|
+
return (a * b);
|
|
1166
|
+
}
|
|
1167
|
+
template <>
|
|
1168
|
+
EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1169
|
+
return (a / b);
|
|
1170
|
+
}
|
|
1171
|
+
template <>
|
|
1172
|
+
EIGEN_STRONG_INLINE Packet4f pnegate<Packet4f>(const Packet4f& a) {
|
|
1173
|
+
return (-a);
|
|
1174
|
+
}
|
|
1175
|
+
template <>
|
|
1176
|
+
EIGEN_STRONG_INLINE Packet4f pconj<Packet4f>(const Packet4f& a) {
|
|
1177
|
+
return a;
|
|
1178
|
+
}
|
|
1179
|
+
template <>
|
|
1180
|
+
EIGEN_STRONG_INLINE Packet4f pmadd<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
|
|
1181
|
+
return vec_madd(a, b, c);
|
|
1182
|
+
}
|
|
1183
|
+
template <>
|
|
1184
|
+
EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1185
|
+
return vec_min(a, b);
|
|
1186
|
+
}
|
|
1187
|
+
template <>
|
|
1188
|
+
EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1189
|
+
return vec_max(a, b);
|
|
1190
|
+
}
|
|
1191
|
+
template <>
|
|
1192
|
+
EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1193
|
+
return vec_and(a, b);
|
|
1194
|
+
}
|
|
1195
|
+
template <>
|
|
1196
|
+
EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1197
|
+
return vec_or(a, b);
|
|
1198
|
+
}
|
|
1199
|
+
template <>
|
|
1200
|
+
EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1201
|
+
return vec_xor(a, b);
|
|
1202
|
+
}
|
|
1203
|
+
template <>
|
|
1204
|
+
EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1205
|
+
return vec_and(a, vec_nor(b, b));
|
|
1206
|
+
}
|
|
1207
|
+
template <>
|
|
1208
|
+
EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
|
|
1209
|
+
/* Uses non-default rounding for vec_round */
|
|
1210
|
+
return __builtin_s390_vfisb(a, 0, 1);
|
|
1211
|
+
}
|
|
1212
|
+
template <>
|
|
1213
|
+
EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
|
|
1214
|
+
return vec_ceil(a);
|
|
1215
|
+
}
|
|
1216
|
+
template <>
|
|
1217
|
+
EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
|
|
1218
|
+
return vec_floor(a);
|
|
1219
|
+
}
|
|
1220
|
+
template <>
|
|
1221
|
+
EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a) {
|
|
1222
|
+
return vec_abs(a);
|
|
1223
|
+
}
|
|
1224
|
+
template <>
|
|
1225
|
+
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
|
|
1226
|
+
EIGEN_ALIGN16 float x[4];
|
|
1227
|
+
pstore(x, a);
|
|
1228
|
+
return x[0];
|
|
1229
|
+
}
|
|
1230
|
+
|
|
1231
|
+
template <>
|
|
1232
|
+
EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
|
|
985
1233
|
Packet4f p = pload<Packet4f>(from);
|
|
986
1234
|
return vec_perm(p, p, p16uc_DUPLICATE32_HI);
|
|
987
1235
|
}
|
|
988
1236
|
|
|
989
|
-
template<>
|
|
990
|
-
{
|
|
991
|
-
return reinterpret_cast<Packet4f>(
|
|
1237
|
+
template <>
|
|
1238
|
+
EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
|
|
1239
|
+
return reinterpret_cast<Packet4f>(
|
|
1240
|
+
vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
|
|
992
1241
|
}
|
|
993
1242
|
|
|
994
|
-
template<>
|
|
995
|
-
{
|
|
1243
|
+
template <>
|
|
1244
|
+
EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
|
|
996
1245
|
Packet4f b, sum;
|
|
997
|
-
b
|
|
1246
|
+
b = vec_sld(a, a, 8);
|
|
998
1247
|
sum = padd<Packet4f>(a, b);
|
|
999
|
-
b
|
|
1248
|
+
b = vec_sld(sum, sum, 4);
|
|
1000
1249
|
sum = padd<Packet4f>(sum, b);
|
|
1001
1250
|
return pfirst(sum);
|
|
1002
1251
|
}
|
|
1003
1252
|
|
|
1004
1253
|
// Other reduction functions:
|
|
1005
1254
|
// mul
|
|
1006
|
-
template<>
|
|
1007
|
-
{
|
|
1255
|
+
template <>
|
|
1256
|
+
EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
|
|
1008
1257
|
Packet4f prod;
|
|
1009
1258
|
prod = pmul(a, vec_sld(a, a, 8));
|
|
1010
1259
|
return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
|
|
1011
1260
|
}
|
|
1012
1261
|
|
|
1013
1262
|
// min
|
|
1014
|
-
template<>
|
|
1015
|
-
{
|
|
1263
|
+
template <>
|
|
1264
|
+
EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
|
|
1016
1265
|
Packet4f b, res;
|
|
1017
|
-
b
|
|
1266
|
+
b = pmin<Packet4f>(a, vec_sld(a, a, 8));
|
|
1018
1267
|
res = pmin<Packet4f>(b, vec_sld(b, b, 4));
|
|
1019
1268
|
return pfirst(res);
|
|
1020
1269
|
}
|
|
1021
1270
|
|
|
1022
1271
|
// max
|
|
1023
|
-
template<>
|
|
1024
|
-
{
|
|
1272
|
+
template <>
|
|
1273
|
+
EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
|
|
1025
1274
|
Packet4f b, res;
|
|
1026
1275
|
b = pmax<Packet4f>(a, vec_sld(a, a, 8));
|
|
1027
1276
|
res = pmax<Packet4f>(b, vec_sld(b, b, 4));
|
|
1028
1277
|
return pfirst(res);
|
|
1029
1278
|
}
|
|
1030
1279
|
|
|
1031
|
-
EIGEN_DEVICE_FUNC inline void
|
|
1032
|
-
ptranspose(PacketBlock<Packet4f,4>& kernel) {
|
|
1280
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
|
|
1033
1281
|
Packet4f t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
|
|
1034
1282
|
Packet4f t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
|
|
1035
1283
|
Packet4f t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
|
|
@@ -1040,21 +1288,126 @@ ptranspose(PacketBlock<Packet4f,4>& kernel) {
|
|
|
1040
1288
|
kernel.packet[3] = vec_mergel(t1, t3);
|
|
1041
1289
|
}
|
|
1042
1290
|
|
|
1043
|
-
template<>
|
|
1044
|
-
|
|
1291
|
+
template <>
|
|
1292
|
+
EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
|
|
1293
|
+
const Packet4f& elsePacket) {
|
|
1294
|
+
Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
|
|
1045
1295
|
Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
|
|
1046
1296
|
return vec_sel(elsePacket, thenPacket, mask);
|
|
1047
1297
|
}
|
|
1048
1298
|
|
|
1049
1299
|
#endif
|
|
1050
1300
|
|
|
1051
|
-
template<>
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1301
|
+
template <>
|
|
1302
|
+
EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
|
|
1303
|
+
return pldexp_generic(a, exponent);
|
|
1304
|
+
}
|
|
1305
|
+
|
|
1306
|
+
template <>
|
|
1307
|
+
EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
|
|
1308
|
+
// Clamp exponent to [-2099, 2099]
|
|
1309
|
+
const Packet2d max_exponent = pset1<Packet2d>(2099.0);
|
|
1310
|
+
const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
|
|
1311
|
+
|
|
1312
|
+
// Split 2^e into four factors and multiply:
|
|
1313
|
+
const Packet2l bias = {1023, 1023};
|
|
1314
|
+
Packet2l b = plogical_shift_right<2>(e); // floor(e/4)
|
|
1315
|
+
Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
|
|
1316
|
+
Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
|
|
1317
|
+
b = psub(psub(psub(e, b), b), b); // e - 3b
|
|
1318
|
+
c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); // 2^(e - 3b)
|
|
1319
|
+
out = pmul(out, c); // a * 2^e
|
|
1320
|
+
return out;
|
|
1321
|
+
}
|
|
1322
|
+
|
|
1323
|
+
template <>
|
|
1324
|
+
EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
|
|
1325
|
+
EIGEN_ZVECTOR_PREFETCH(addr);
|
|
1326
|
+
}
|
|
1327
|
+
template <>
|
|
1328
|
+
EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
|
|
1329
|
+
return pload<Packet4f>(from);
|
|
1330
|
+
}
|
|
1331
|
+
template <>
|
|
1332
|
+
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
|
|
1333
|
+
pstore<float>(to, from);
|
|
1334
|
+
}
|
|
1335
|
+
template <>
|
|
1336
|
+
EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
|
|
1337
|
+
return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN);
|
|
1338
|
+
}
|
|
1339
|
+
|
|
1340
|
+
#if !defined(vec_float) || !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 13)
|
|
1341
|
+
#pragma GCC warning \
|
|
1342
|
+
"float->int and int->float conversion is simulated. compile for z15 for improved performance"
|
|
1343
|
+
template <>
|
|
1344
|
+
struct cast_impl<Packet4i, Packet4f> {
|
|
1345
|
+
EIGEN_DEVICE_FUNC static inline Packet4f run(const Packet4i& a) {
|
|
1346
|
+
return Packet4f{float(a[0]), float(a[1]), float(a[2]), float(a[3]) };
|
|
1347
|
+
}
|
|
1348
|
+
};
|
|
1349
|
+
|
|
1350
|
+
template <>
|
|
1351
|
+
struct cast_impl<Packet4f, Packet4i> {
|
|
1352
|
+
EIGEN_DEVICE_FUNC static inline Packet4i run(const Packet4f& a) {
|
|
1353
|
+
return Packet4i{int(a[0]), int(a[1]), int(a[2]), int(a[3]) };
|
|
1354
|
+
}
|
|
1355
|
+
};
|
|
1356
|
+
|
|
1357
|
+
template <>
|
|
1358
|
+
struct cast_impl<Packet2l, Packet2d> {
|
|
1359
|
+
EIGEN_DEVICE_FUNC static inline Packet2d run(const Packet2l& a) {
|
|
1360
|
+
return Packet2d{double(a[0]), double(a[1]) };
|
|
1361
|
+
}
|
|
1362
|
+
};
|
|
1363
|
+
|
|
1364
|
+
template <>
|
|
1365
|
+
struct cast_impl<Packet2d, Packet2l> {
|
|
1366
|
+
EIGEN_DEVICE_FUNC static inline Packet2l run(const Packet2d& a) {
|
|
1367
|
+
return Packet2l{(long long)(a[0]), (long long)(a[1]) };
|
|
1368
|
+
}
|
|
1369
|
+
};
|
|
1370
|
+
#else
|
|
1371
|
+
template <>
|
|
1372
|
+
struct cast_impl<Packet4i, Packet4f> {
|
|
1373
|
+
EIGEN_DEVICE_FUNC static inline Packet4f run(const Packet4i& a) {
|
|
1374
|
+
return vec_float(a);
|
|
1375
|
+
}
|
|
1376
|
+
};
|
|
1377
|
+
|
|
1378
|
+
template <>
|
|
1379
|
+
struct cast_impl<Packet4f, Packet4i> {
|
|
1380
|
+
EIGEN_DEVICE_FUNC static inline Packet4i run(const Packet4f& a) {
|
|
1381
|
+
return vec_signed(a);
|
|
1382
|
+
}
|
|
1383
|
+
};
|
|
1384
|
+
|
|
1385
|
+
template <>
|
|
1386
|
+
struct cast_impl<Packet2l, Packet2d> {
|
|
1387
|
+
EIGEN_DEVICE_FUNC static inline Packet2d run(const Packet2l& a) {
|
|
1388
|
+
return vec_double(a);
|
|
1389
|
+
}
|
|
1390
|
+
};
|
|
1391
|
+
|
|
1392
|
+
template <>
|
|
1393
|
+
struct cast_impl<Packet2d, Packet2l> {
|
|
1394
|
+
EIGEN_DEVICE_FUNC static inline Packet2l run(const Packet2d& a) {
|
|
1395
|
+
return vec_signed(a);
|
|
1396
|
+
}
|
|
1397
|
+
};
|
|
1398
|
+
#endif
|
|
1399
|
+
|
|
1400
|
+
template <>
|
|
1401
|
+
EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
|
|
1402
|
+
return pset1<Packet4f>(Eigen::numext::bit_cast<float>(from));
|
|
1403
|
+
}
|
|
1404
|
+
template <>
|
|
1405
|
+
EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
|
|
1406
|
+
return pset1<Packet2d>(Eigen::numext::bit_cast<double>(from));
|
|
1407
|
+
}
|
|
1055
1408
|
|
|
1056
|
-
}
|
|
1409
|
+
} // end namespace internal
|
|
1057
1410
|
|
|
1058
|
-
}
|
|
1411
|
+
} // end namespace Eigen
|
|
1059
1412
|
|
|
1060
|
-
#endif
|
|
1413
|
+
#endif // EIGEN_PACKET_MATH_ZVECTOR_H
|