@smake/eigen 1.0.2 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/eigen/Eigen/AccelerateSupport +52 -0
- package/eigen/Eigen/Cholesky +18 -21
- package/eigen/Eigen/CholmodSupport +28 -28
- package/eigen/Eigen/Core +235 -326
- package/eigen/Eigen/Eigenvalues +16 -14
- package/eigen/Eigen/Geometry +21 -24
- package/eigen/Eigen/Householder +9 -8
- package/eigen/Eigen/IterativeLinearSolvers +8 -4
- package/eigen/Eigen/Jacobi +14 -14
- package/eigen/Eigen/KLUSupport +43 -0
- package/eigen/Eigen/LU +16 -20
- package/eigen/Eigen/MetisSupport +12 -12
- package/eigen/Eigen/OrderingMethods +54 -54
- package/eigen/Eigen/PaStiXSupport +23 -20
- package/eigen/Eigen/PardisoSupport +17 -14
- package/eigen/Eigen/QR +18 -21
- package/eigen/Eigen/QtAlignedMalloc +5 -13
- package/eigen/Eigen/SPQRSupport +21 -14
- package/eigen/Eigen/SVD +23 -18
- package/eigen/Eigen/Sparse +1 -4
- package/eigen/Eigen/SparseCholesky +18 -23
- package/eigen/Eigen/SparseCore +18 -17
- package/eigen/Eigen/SparseLU +12 -8
- package/eigen/Eigen/SparseQR +16 -14
- package/eigen/Eigen/StdDeque +5 -2
- package/eigen/Eigen/StdList +5 -2
- package/eigen/Eigen/StdVector +5 -2
- package/eigen/Eigen/SuperLUSupport +30 -24
- package/eigen/Eigen/ThreadPool +80 -0
- package/eigen/Eigen/UmfPackSupport +19 -17
- package/eigen/Eigen/Version +14 -0
- package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
- package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Cholesky/LDLT.h +377 -401
- package/eigen/Eigen/src/Cholesky/LLT.h +332 -360
- package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
- package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +620 -521
- package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/ArithmeticSequence.h +239 -0
- package/eigen/Eigen/src/Core/Array.h +341 -294
- package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
- package/eigen/Eigen/src/Core/ArrayWrapper.h +127 -171
- package/eigen/Eigen/src/Core/Assign.h +30 -40
- package/eigen/Eigen/src/Core/AssignEvaluator.h +711 -589
- package/eigen/Eigen/src/Core/Assign_MKL.h +130 -125
- package/eigen/Eigen/src/Core/BandMatrix.h +268 -283
- package/eigen/Eigen/src/Core/Block.h +375 -398
- package/eigen/Eigen/src/Core/CommaInitializer.h +86 -97
- package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
- package/eigen/Eigen/src/Core/CoreEvaluators.h +1356 -1026
- package/eigen/Eigen/src/Core/CoreIterators.h +73 -59
- package/eigen/Eigen/src/Core/CwiseBinaryOp.h +114 -132
- package/eigen/Eigen/src/Core/CwiseNullaryOp.h +726 -617
- package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
- package/eigen/Eigen/src/Core/CwiseUnaryOp.h +56 -68
- package/eigen/Eigen/src/Core/CwiseUnaryView.h +132 -95
- package/eigen/Eigen/src/Core/DenseBase.h +632 -571
- package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -624
- package/eigen/Eigen/src/Core/DenseStorage.h +512 -509
- package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
- package/eigen/Eigen/src/Core/Diagonal.h +169 -210
- package/eigen/Eigen/src/Core/DiagonalMatrix.h +351 -274
- package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
- package/eigen/Eigen/src/Core/Dot.h +172 -222
- package/eigen/Eigen/src/Core/EigenBase.h +75 -85
- package/eigen/Eigen/src/Core/Fill.h +138 -0
- package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
- package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -109
- package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
- package/eigen/Eigen/src/Core/GeneralProduct.h +327 -263
- package/eigen/Eigen/src/Core/GenericPacketMath.h +1472 -360
- package/eigen/Eigen/src/Core/GlobalFunctions.h +194 -151
- package/eigen/Eigen/src/Core/IO.h +147 -139
- package/eigen/Eigen/src/Core/IndexedView.h +321 -0
- package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
- package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Core/Inverse.h +56 -66
- package/eigen/Eigen/src/Core/Map.h +124 -142
- package/eigen/Eigen/src/Core/MapBase.h +256 -281
- package/eigen/Eigen/src/Core/MathFunctions.h +1620 -938
- package/eigen/Eigen/src/Core/MathFunctionsImpl.h +233 -71
- package/eigen/Eigen/src/Core/Matrix.h +491 -416
- package/eigen/Eigen/src/Core/MatrixBase.h +468 -453
- package/eigen/Eigen/src/Core/NestByValue.h +66 -85
- package/eigen/Eigen/src/Core/NoAlias.h +79 -85
- package/eigen/Eigen/src/Core/NumTraits.h +235 -148
- package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +253 -0
- package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
- package/eigen/Eigen/src/Core/PlainObjectBase.h +871 -894
- package/eigen/Eigen/src/Core/Product.h +260 -139
- package/eigen/Eigen/src/Core/ProductEvaluators.h +863 -714
- package/eigen/Eigen/src/Core/Random.h +161 -136
- package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
- package/eigen/Eigen/src/Core/RealView.h +250 -0
- package/eigen/Eigen/src/Core/Redux.h +366 -336
- package/eigen/Eigen/src/Core/Ref.h +308 -209
- package/eigen/Eigen/src/Core/Replicate.h +94 -106
- package/eigen/Eigen/src/Core/Reshaped.h +398 -0
- package/eigen/Eigen/src/Core/ReturnByValue.h +49 -55
- package/eigen/Eigen/src/Core/Reverse.h +136 -145
- package/eigen/Eigen/src/Core/Select.h +70 -140
- package/eigen/Eigen/src/Core/SelfAdjointView.h +262 -285
- package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
- package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
- package/eigen/Eigen/src/Core/Solve.h +97 -111
- package/eigen/Eigen/src/Core/SolveTriangular.h +131 -129
- package/eigen/Eigen/src/Core/SolverBase.h +138 -101
- package/eigen/Eigen/src/Core/StableNorm.h +156 -160
- package/eigen/Eigen/src/Core/StlIterators.h +619 -0
- package/eigen/Eigen/src/Core/Stride.h +91 -88
- package/eigen/Eigen/src/Core/Swap.h +70 -38
- package/eigen/Eigen/src/Core/Transpose.h +295 -273
- package/eigen/Eigen/src/Core/Transpositions.h +272 -317
- package/eigen/Eigen/src/Core/TriangularMatrix.h +670 -755
- package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
- package/eigen/Eigen/src/Core/VectorwiseOp.h +668 -630
- package/eigen/Eigen/src/Core/Visitor.h +480 -216
- package/eigen/Eigen/src/Core/arch/AVX/Complex.h +407 -293
- package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +79 -388
- package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2935 -491
- package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
- package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +279 -22
- package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +472 -0
- package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +85 -333
- package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +2490 -649
- package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
- package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +277 -0
- package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +521 -298
- package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +39 -280
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +3686 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +205 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +901 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
- package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +3391 -723
- package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
- package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +866 -0
- package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +113 -14
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +2634 -0
- package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +227 -0
- package/eigen/Eigen/src/Core/arch/Default/Half.h +1091 -0
- package/eigen/Eigen/src/Core/arch/Default/Settings.h +11 -13
- package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
- package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +104 -0
- package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1712 -0
- package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
- package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +77 -0
- package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
- package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
- package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
- package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
- package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
- package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
- package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
- package/eigen/Eigen/src/Core/arch/MSA/Complex.h +620 -0
- package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +379 -0
- package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1237 -0
- package/eigen/Eigen/src/Core/arch/NEON/Complex.h +531 -289
- package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +243 -0
- package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +50 -73
- package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +5915 -579
- package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1642 -0
- package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
- package/eigen/Eigen/src/Core/arch/SSE/Complex.h +366 -334
- package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +40 -514
- package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +2164 -675
- package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
- package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +188 -35
- package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +48 -0
- package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +674 -0
- package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +52 -0
- package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +227 -0
- package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +303 -0
- package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +576 -0
- package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +83 -0
- package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +434 -261
- package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +160 -53
- package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +1073 -605
- package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +123 -117
- package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +594 -322
- package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +204 -118
- package/eigen/Eigen/src/Core/functors/StlFunctors.h +110 -97
- package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
- package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1158 -530
- package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2329 -1333
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +328 -364
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +191 -178
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +85 -82
- package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +396 -542
- package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
- package/eigen/Eigen/src/Core/products/Parallelizer.h +208 -92
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +331 -375
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +139 -146
- package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
- package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
- package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -46
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
- package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
- package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -275
- package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
- package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +70 -93
- package/eigen/Eigen/src/Core/util/Assert.h +158 -0
- package/eigen/Eigen/src/Core/util/BlasUtil.h +413 -290
- package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +543 -0
- package/eigen/Eigen/src/Core/util/Constants.h +314 -263
- package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -78
- package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
- package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +450 -224
- package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
- package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
- package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +487 -0
- package/eigen/Eigen/src/Core/util/IntegralConstant.h +279 -0
- package/eigen/Eigen/src/Core/util/MKL_support.h +39 -30
- package/eigen/Eigen/src/Core/util/Macros.h +939 -646
- package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
- package/eigen/Eigen/src/Core/util/Memory.h +1042 -650
- package/eigen/Eigen/src/Core/util/Meta.h +618 -426
- package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
- package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
- package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
- package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
- package/eigen/Eigen/src/Core/util/StaticAssert.h +51 -164
- package/eigen/Eigen/src/Core/util/SymbolicIndex.h +445 -0
- package/eigen/Eigen/src/Core/util/XprHelper.h +793 -538
- package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
- package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
- package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
- package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
- package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
- package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
- package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +91 -107
- package/eigen/Eigen/src/Eigenvalues/RealQZ.h +539 -606
- package/eigen/Eigen/src/Eigenvalues/RealSchur.h +348 -382
- package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +579 -600
- package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
- package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +434 -461
- package/eigen/Eigen/src/Geometry/AlignedBox.h +307 -214
- package/eigen/Eigen/src/Geometry/AngleAxis.h +135 -137
- package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
- package/eigen/Eigen/src/Geometry/Homogeneous.h +289 -333
- package/eigen/Eigen/src/Geometry/Hyperplane.h +152 -161
- package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -145
- package/eigen/Eigen/src/Geometry/ParametrizedLine.h +141 -104
- package/eigen/Eigen/src/Geometry/Quaternion.h +595 -497
- package/eigen/Eigen/src/Geometry/Rotation2D.h +110 -108
- package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
- package/eigen/Eigen/src/Geometry/Scaling.h +115 -90
- package/eigen/Eigen/src/Geometry/Transform.h +896 -953
- package/eigen/Eigen/src/Geometry/Translation.h +100 -98
- package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
- package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +154 -0
- package/eigen/Eigen/src/Householder/BlockHouseholder.h +54 -42
- package/eigen/Eigen/src/Householder/Householder.h +104 -122
- package/eigen/Eigen/src/Householder/HouseholderSequence.h +416 -382
- package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +153 -166
- package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +127 -138
- package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +95 -124
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +269 -267
- package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +246 -259
- package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +218 -217
- package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +80 -103
- package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +59 -63
- package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/Jacobi/Jacobi.h +256 -291
- package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/KLUSupport/KLUSupport.h +339 -0
- package/eigen/Eigen/src/LU/Determinant.h +60 -63
- package/eigen/Eigen/src/LU/FullPivLU.h +561 -626
- package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/LU/InverseImpl.h +213 -275
- package/eigen/Eigen/src/LU/PartialPivLU.h +407 -435
- package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
- package/eigen/Eigen/src/LU/arch/InverseSize4.h +353 -0
- package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
- package/eigen/Eigen/src/OrderingMethods/Amd.h +250 -282
- package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +950 -1103
- package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/OrderingMethods/Ordering.h +111 -122
- package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
- package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -429
- package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +494 -473
- package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
- package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +223 -137
- package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +517 -460
- package/eigen/Eigen/src/QR/HouseholderQR.h +412 -278
- package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
- package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +263 -261
- package/eigen/Eigen/src/SVD/BDCSVD.h +872 -679
- package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
- package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SVD/JacobiSVD.h +585 -543
- package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
- package/eigen/Eigen/src/SVD/SVDBase.h +281 -160
- package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +202 -237
- package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +769 -590
- package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +318 -129
- package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
- package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -236
- package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +140 -184
- package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseCore/SparseAssign.h +174 -111
- package/eigen/Eigen/src/SparseCore/SparseBlock.h +408 -477
- package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
- package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +531 -280
- package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +559 -347
- package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
- package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +185 -191
- package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
- package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
- package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
- package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
- package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1614 -1142
- package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -357
- package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
- package/eigen/Eigen/src/SparseCore/SparseProduct.h +100 -91
- package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
- package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
- package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +371 -414
- package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
- package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
- package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
- package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
- package/eigen/Eigen/src/SparseCore/SparseUtil.h +146 -115
- package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
- package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
- package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
- package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseLU/SparseLU.h +814 -618
- package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
- package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
- package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
- package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +273 -255
- package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
- package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
- package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +90 -101
- package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
- package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
- package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +125 -133
- package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
- package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
- package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
- package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
- package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SparseQR/SparseQR.h +451 -490
- package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -105
- package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
- package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
- package/eigen/Eigen/src/StlSupport/details.h +48 -50
- package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -732
- package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
- package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
- package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
- package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
- package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
- package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
- package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
- package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
- package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
- package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
- package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
- package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
- package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +480 -380
- package/eigen/Eigen/src/misc/Image.h +41 -43
- package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
- package/eigen/Eigen/src/misc/Kernel.h +39 -41
- package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
- package/eigen/Eigen/src/misc/blas.h +83 -426
- package/eigen/Eigen/src/misc/lapacke.h +9976 -16182
- package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
- package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
- package/eigen/Eigen/src/plugins/BlockMethods.inc +1370 -0
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
- package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.inc +167 -0
- package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
- package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
- package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
- package/lib/LibEigen.d.ts +4 -0
- package/lib/LibEigen.js +14 -0
- package/lib/index.d.ts +1 -1
- package/lib/index.js +7 -3
- package/package.json +2 -10
- package/eigen/Eigen/CMakeLists.txt +0 -19
- package/eigen/Eigen/src/Core/BooleanRedux.h +0 -164
- package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -103
- package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
- package/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +0 -91
- package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
- package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
- package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
- package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
- package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
- package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
- package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
- package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
- package/eigen/Eigen/src/misc/lapack.h +0 -152
- package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -332
- package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -552
- package/eigen/Eigen/src/plugins/BlockMethods.h +0 -1058
- package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
- package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -163
- package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
- package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -85
- package/lib/eigen.d.ts +0 -2
- package/lib/eigen.js +0 -15
|
@@ -10,342 +10,791 @@
|
|
|
10
10
|
#ifndef EIGEN_PACKET_MATH_ZVECTOR_H
|
|
11
11
|
#define EIGEN_PACKET_MATH_ZVECTOR_H
|
|
12
12
|
|
|
13
|
-
|
|
13
|
+
// IWYU pragma: private
|
|
14
|
+
#include "../../InternalHeaderCheck.h"
|
|
14
15
|
|
|
15
16
|
namespace Eigen {
|
|
16
17
|
|
|
17
18
|
namespace internal {
|
|
18
19
|
|
|
19
20
|
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
|
|
20
|
-
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
|
|
21
|
+
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
|
|
21
22
|
#endif
|
|
22
23
|
|
|
23
24
|
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
|
24
25
|
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
|
25
26
|
#endif
|
|
26
27
|
|
|
27
|
-
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
|
|
28
|
-
#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
|
|
29
|
-
#endif
|
|
30
|
-
|
|
31
28
|
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
|
|
32
|
-
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
|
|
29
|
+
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
|
|
33
30
|
#endif
|
|
34
31
|
|
|
35
|
-
typedef __vector int
|
|
36
|
-
typedef __vector unsigned int
|
|
37
|
-
typedef __vector __bool int
|
|
38
|
-
typedef __vector short int
|
|
39
|
-
typedef __vector unsigned char
|
|
40
|
-
typedef __vector double
|
|
41
|
-
typedef __vector unsigned long long
|
|
42
|
-
typedef __vector long long
|
|
43
|
-
|
|
32
|
+
typedef __vector int Packet4i;
|
|
33
|
+
typedef __vector unsigned int Packet4ui;
|
|
34
|
+
typedef __vector __bool int Packet4bi;
|
|
35
|
+
typedef __vector short int Packet8i;
|
|
36
|
+
typedef __vector unsigned char Packet16uc;
|
|
37
|
+
typedef __vector double Packet2d;
|
|
38
|
+
typedef __vector unsigned long long Packet2ul;
|
|
39
|
+
typedef __vector long long Packet2l;
|
|
40
|
+
|
|
41
|
+
// Z14 has builtin support for float vectors
|
|
42
|
+
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
|
|
43
|
+
typedef __vector float Packet4f;
|
|
44
|
+
#else
|
|
44
45
|
typedef struct {
|
|
45
|
-
|
|
46
|
+
Packet2d v4f[2];
|
|
46
47
|
} Packet4f;
|
|
48
|
+
#endif
|
|
47
49
|
|
|
48
50
|
typedef union {
|
|
49
|
-
int32_t
|
|
50
|
-
uint32_t ui[4];
|
|
51
|
-
int64_t
|
|
52
|
-
uint64_t ul[2];
|
|
53
|
-
double
|
|
54
|
-
|
|
51
|
+
numext::int32_t i[4];
|
|
52
|
+
numext::uint32_t ui[4];
|
|
53
|
+
numext::int64_t l[2];
|
|
54
|
+
numext::uint64_t ul[2];
|
|
55
|
+
double d[2];
|
|
56
|
+
float f[4];
|
|
57
|
+
Packet4i v4i;
|
|
55
58
|
Packet4ui v4ui;
|
|
56
|
-
Packet2l
|
|
59
|
+
Packet2l v2l;
|
|
57
60
|
Packet2ul v2ul;
|
|
58
|
-
Packet2d
|
|
61
|
+
Packet2d v2d;
|
|
62
|
+
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
|
|
63
|
+
Packet4f v4f;
|
|
64
|
+
#endif
|
|
59
65
|
} Packet;
|
|
60
66
|
|
|
61
67
|
// We don't want to write the same code all the time, but we need to reuse the constants
|
|
62
68
|
// and it doesn't really work to declare them global, so we define macros instead
|
|
63
69
|
|
|
64
|
-
#define
|
|
65
|
-
Packet4i p4i_##NAME = reinterpret_cast<Packet4i>(vec_splat_s32(X))
|
|
70
|
+
#define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME, X) Packet4i p4i_##NAME = reinterpret_cast<Packet4i>(vec_splat_s32(X))
|
|
66
71
|
|
|
67
|
-
#define
|
|
68
|
-
Packet2d p2d_##NAME = reinterpret_cast<Packet2d>(vec_splat_s64(X))
|
|
72
|
+
#define EIGEN_DECLARE_CONST_FAST_Packet2d(NAME, X) Packet2d p2d_##NAME = reinterpret_cast<Packet2d>(vec_splat_s64(X))
|
|
69
73
|
|
|
70
|
-
#define
|
|
71
|
-
Packet2l p2l_##NAME = reinterpret_cast<Packet2l>(vec_splat_s64(X))
|
|
74
|
+
#define EIGEN_DECLARE_CONST_FAST_Packet2l(NAME, X) Packet2l p2l_##NAME = reinterpret_cast<Packet2l>(vec_splat_s64(X))
|
|
72
75
|
|
|
73
|
-
#define
|
|
74
|
-
Packet4i p4i_##NAME = pset1<Packet4i>(X)
|
|
76
|
+
#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) Packet4i p4i_##NAME = pset1<Packet4i>(X)
|
|
75
77
|
|
|
76
|
-
#define
|
|
77
|
-
Packet2d p2d_##NAME = pset1<Packet2d>(X)
|
|
78
|
+
#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) Packet2d p2d_##NAME = pset1<Packet2d>(X)
|
|
78
79
|
|
|
79
|
-
#define
|
|
80
|
-
Packet2l p2l_##NAME = pset1<Packet2l>(X)
|
|
80
|
+
#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) Packet2l p2l_##NAME = pset1<Packet2l>(X)
|
|
81
81
|
|
|
82
82
|
// These constants are endian-agnostic
|
|
83
|
-
|
|
84
|
-
static
|
|
83
|
+
static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
|
|
84
|
+
static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
|
|
85
85
|
|
|
86
|
-
static
|
|
87
|
-
static
|
|
88
|
-
static
|
|
86
|
+
static EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0);
|
|
87
|
+
static EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0);
|
|
88
|
+
static EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1);
|
|
89
89
|
|
|
90
|
-
static Packet2d p2d_ONE = {
|
|
91
|
-
static Packet2d p2d_ZERO_ = {
|
|
90
|
+
static Packet2d p2d_ONE = {1.0, 1.0};
|
|
91
|
+
static Packet2d p2d_ZERO_ = {numext::bit_cast<double>(0x8000000000000000ull),
|
|
92
|
+
numext::bit_cast<double>(0x8000000000000000ull)};
|
|
92
93
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8));
|
|
94
|
+
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
|
|
95
|
+
#define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME, X) Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
|
|
96
96
|
|
|
97
|
-
|
|
98
|
-
static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
|
|
99
|
-
|
|
100
|
-
// Mask alignment
|
|
101
|
-
#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
|
|
97
|
+
#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) Packet4f p4f_##NAME = pset1<Packet4f>(X)
|
|
102
98
|
|
|
103
|
-
#define
|
|
99
|
+
#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
|
|
100
|
+
const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
|
|
104
101
|
|
|
105
|
-
|
|
106
|
-
//
|
|
102
|
+
static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
|
|
103
|
+
static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1, -1); //{ -1, -1, -1, -1}
|
|
104
|
+
static Packet4f p4f_MZERO = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
|
|
105
|
+
#endif
|
|
107
106
|
|
|
108
|
-
static
|
|
109
|
-
static
|
|
110
|
-
static
|
|
107
|
+
static Packet4i p4i_COUNTDOWN = {0, 1, 2, 3};
|
|
108
|
+
static Packet4f p4f_COUNTDOWN = {0.0, 1.0, 2.0, 3.0};
|
|
109
|
+
static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(
|
|
110
|
+
vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8));
|
|
111
111
|
|
|
112
|
-
static Packet16uc
|
|
113
|
-
static Packet16uc
|
|
114
|
-
/*static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
|
|
112
|
+
static Packet16uc p16uc_PSET64_HI = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
|
|
113
|
+
static Packet16uc p16uc_DUPLICATE32_HI = {0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7};
|
|
115
114
|
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
/*static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16); //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
|
|
119
|
-
static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16); //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};*/
|
|
120
|
-
static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
|
|
121
|
-
static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
|
|
115
|
+
// Mask alignment
|
|
116
|
+
#define EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
|
|
122
117
|
|
|
123
|
-
|
|
118
|
+
#define EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & EIGEN_MASK_ALIGNMENT)
|
|
124
119
|
|
|
125
|
-
//
|
|
120
|
+
// Handle endianness properly while loading constants
|
|
121
|
+
// Define global static constants:
|
|
126
122
|
|
|
123
|
+
static Packet16uc p16uc_FORWARD = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
|
124
|
+
static Packet16uc p16uc_REVERSE32 = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
|
|
125
|
+
static Packet16uc p16uc_REVERSE64 = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
|
|
126
|
+
|
|
127
|
+
static Packet16uc p16uc_PSET32_WODD =
|
|
128
|
+
vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
|
|
129
|
+
8); //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
|
|
130
|
+
static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
|
|
131
|
+
8); //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
|
|
132
|
+
/*static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3),
|
|
133
|
+
8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
|
|
134
|
+
|
|
135
|
+
static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD,
|
|
136
|
+
(Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };*/
|
|
137
|
+
static Packet16uc p16uc_PSET64_LO = (Packet16uc)vec_mergel(
|
|
138
|
+
(Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
|
|
139
|
+
/*static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16); //{ 0,1,2,3, 4,5,6,7,
|
|
140
|
+
16,17,18,19, 20,21,22,23}; static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16); //{
|
|
141
|
+
8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};*/
|
|
142
|
+
static Packet16uc p16uc_TRANSPOSE64_HI = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
|
|
143
|
+
static Packet16uc p16uc_TRANSPOSE64_LO = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
|
|
144
|
+
|
|
145
|
+
static Packet16uc p16uc_COMPLEX32_REV =
|
|
146
|
+
vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
|
|
147
|
+
|
|
148
|
+
static Packet16uc p16uc_COMPLEX32_REV2 =
|
|
149
|
+
vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
|
|
127
150
|
|
|
128
151
|
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
|
|
129
|
-
|
|
152
|
+
#define EIGEN_ZVECTOR_PREFETCH(ADDR) __builtin_prefetch(ADDR);
|
|
130
153
|
#else
|
|
131
|
-
|
|
154
|
+
#define EIGEN_ZVECTOR_PREFETCH(ADDR) asm(" pfd [%[addr]]\n" ::[addr] "r"(ADDR) : "cc");
|
|
132
155
|
#endif
|
|
133
156
|
|
|
134
|
-
template<>
|
|
135
|
-
{
|
|
157
|
+
template <>
|
|
158
|
+
struct packet_traits<int> : default_packet_traits {
|
|
136
159
|
typedef Packet4i type;
|
|
137
160
|
typedef Packet4i half;
|
|
138
161
|
enum {
|
|
139
162
|
Vectorizable = 1,
|
|
140
163
|
AlignedOnScalar = 1,
|
|
141
164
|
size = 4,
|
|
142
|
-
HasHalfPacket = 0,
|
|
143
165
|
|
|
144
|
-
HasAdd
|
|
145
|
-
HasSub
|
|
146
|
-
HasMul
|
|
147
|
-
HasDiv
|
|
166
|
+
HasAdd = 1,
|
|
167
|
+
HasSub = 1,
|
|
168
|
+
HasMul = 1,
|
|
169
|
+
HasDiv = 1,
|
|
148
170
|
HasBlend = 1
|
|
149
171
|
};
|
|
150
172
|
};
|
|
151
173
|
|
|
152
|
-
template<>
|
|
153
|
-
{
|
|
174
|
+
template <>
|
|
175
|
+
struct packet_traits<float> : default_packet_traits {
|
|
154
176
|
typedef Packet4f type;
|
|
155
177
|
typedef Packet4f half;
|
|
156
178
|
enum {
|
|
157
179
|
Vectorizable = 1,
|
|
158
180
|
AlignedOnScalar = 1,
|
|
159
|
-
size=4,
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
HasAdd
|
|
163
|
-
HasSub
|
|
164
|
-
HasMul
|
|
165
|
-
HasDiv
|
|
166
|
-
HasMin
|
|
167
|
-
HasMax
|
|
168
|
-
HasAbs
|
|
169
|
-
HasSin
|
|
170
|
-
HasCos
|
|
171
|
-
HasLog
|
|
172
|
-
HasExp
|
|
181
|
+
size = 4,
|
|
182
|
+
|
|
183
|
+
HasCmp = 1,
|
|
184
|
+
HasAdd = 1,
|
|
185
|
+
HasSub = 1,
|
|
186
|
+
HasMul = 1,
|
|
187
|
+
HasDiv = 1,
|
|
188
|
+
HasMin = 1,
|
|
189
|
+
HasMax = 1,
|
|
190
|
+
HasAbs = 1,
|
|
191
|
+
HasSin = 0,
|
|
192
|
+
HasCos = 0,
|
|
193
|
+
HasLog = 0,
|
|
194
|
+
HasExp = 1,
|
|
173
195
|
HasSqrt = 1,
|
|
174
196
|
HasRsqrt = 1,
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
HasCeil = 1,
|
|
197
|
+
HasTanh = 1,
|
|
198
|
+
HasErf = 1,
|
|
178
199
|
HasNegate = 1,
|
|
179
200
|
HasBlend = 1
|
|
180
201
|
};
|
|
181
202
|
};
|
|
182
203
|
|
|
183
|
-
template<>
|
|
184
|
-
{
|
|
204
|
+
template <>
|
|
205
|
+
struct packet_traits<double> : default_packet_traits {
|
|
185
206
|
typedef Packet2d type;
|
|
186
207
|
typedef Packet2d half;
|
|
187
208
|
enum {
|
|
188
209
|
Vectorizable = 1,
|
|
189
210
|
AlignedOnScalar = 1,
|
|
190
|
-
size=2,
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
HasExp = 1,
|
|
211
|
+
size = 2,
|
|
212
|
+
|
|
213
|
+
HasAdd = 1,
|
|
214
|
+
HasSub = 1,
|
|
215
|
+
HasMul = 1,
|
|
216
|
+
HasDiv = 1,
|
|
217
|
+
HasMin = 1,
|
|
218
|
+
HasMax = 1,
|
|
219
|
+
HasAbs = 1,
|
|
220
|
+
HasSin = 0,
|
|
221
|
+
HasCos = 0,
|
|
222
|
+
HasLog = 0,
|
|
223
|
+
HasExp = 1,
|
|
204
224
|
HasSqrt = 1,
|
|
205
225
|
HasRsqrt = 1,
|
|
206
|
-
HasRound = 1,
|
|
207
|
-
HasFloor = 1,
|
|
208
|
-
HasCeil = 1,
|
|
209
226
|
HasNegate = 1,
|
|
210
227
|
HasBlend = 1
|
|
211
228
|
};
|
|
212
229
|
};
|
|
213
230
|
|
|
214
|
-
template<>
|
|
215
|
-
|
|
216
|
-
|
|
231
|
+
template <>
|
|
232
|
+
struct unpacket_traits<Packet4i> {
|
|
233
|
+
typedef int type;
|
|
234
|
+
enum {
|
|
235
|
+
size = 4,
|
|
236
|
+
alignment = Aligned16,
|
|
237
|
+
vectorizable = true,
|
|
238
|
+
masked_load_available = false,
|
|
239
|
+
masked_store_available = false
|
|
240
|
+
};
|
|
241
|
+
typedef Packet4i half;
|
|
242
|
+
};
|
|
243
|
+
template <>
|
|
244
|
+
struct unpacket_traits<Packet4f> {
|
|
245
|
+
typedef float type;
|
|
246
|
+
enum {
|
|
247
|
+
size = 4,
|
|
248
|
+
alignment = Aligned16,
|
|
249
|
+
vectorizable = true,
|
|
250
|
+
masked_load_available = false,
|
|
251
|
+
masked_store_available = false
|
|
252
|
+
};
|
|
253
|
+
typedef Packet4f half;
|
|
254
|
+
typedef Packet4i integer_packet;
|
|
255
|
+
};
|
|
256
|
+
template <>
|
|
257
|
+
struct unpacket_traits<Packet2d> {
|
|
258
|
+
typedef double type;
|
|
259
|
+
enum {
|
|
260
|
+
size = 2,
|
|
261
|
+
alignment = Aligned16,
|
|
262
|
+
vectorizable = true,
|
|
263
|
+
masked_load_available = false,
|
|
264
|
+
masked_store_available = false
|
|
265
|
+
};
|
|
266
|
+
typedef Packet2d half;
|
|
267
|
+
typedef Packet2l integer_packet;
|
|
268
|
+
};
|
|
217
269
|
|
|
218
270
|
/* Forward declaration */
|
|
219
|
-
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel);
|
|
220
|
-
|
|
221
|
-
inline std::ostream
|
|
222
|
-
{
|
|
271
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel);
|
|
272
|
+
|
|
273
|
+
inline std::ostream& operator<<(std::ostream& s, const Packet4i& v) {
|
|
223
274
|
Packet vt;
|
|
224
275
|
vt.v4i = v;
|
|
225
276
|
s << vt.i[0] << ", " << vt.i[1] << ", " << vt.i[2] << ", " << vt.i[3];
|
|
226
277
|
return s;
|
|
227
278
|
}
|
|
228
279
|
|
|
229
|
-
inline std::ostream
|
|
230
|
-
{
|
|
280
|
+
inline std::ostream& operator<<(std::ostream& s, const Packet4ui& v) {
|
|
231
281
|
Packet vt;
|
|
232
282
|
vt.v4ui = v;
|
|
233
283
|
s << vt.ui[0] << ", " << vt.ui[1] << ", " << vt.ui[2] << ", " << vt.ui[3];
|
|
234
284
|
return s;
|
|
235
285
|
}
|
|
236
286
|
|
|
237
|
-
inline std::ostream
|
|
238
|
-
{
|
|
287
|
+
inline std::ostream& operator<<(std::ostream& s, const Packet2l& v) {
|
|
239
288
|
Packet vt;
|
|
240
289
|
vt.v2l = v;
|
|
241
290
|
s << vt.l[0] << ", " << vt.l[1];
|
|
242
291
|
return s;
|
|
243
292
|
}
|
|
244
293
|
|
|
245
|
-
inline std::ostream
|
|
246
|
-
{
|
|
294
|
+
inline std::ostream& operator<<(std::ostream& s, const Packet2ul& v) {
|
|
247
295
|
Packet vt;
|
|
248
296
|
vt.v2ul = v;
|
|
249
|
-
s << vt.ul[0] << ", " << vt.ul[1]
|
|
297
|
+
s << vt.ul[0] << ", " << vt.ul[1];
|
|
250
298
|
return s;
|
|
251
299
|
}
|
|
252
300
|
|
|
253
|
-
inline std::ostream
|
|
254
|
-
{
|
|
301
|
+
inline std::ostream& operator<<(std::ostream& s, const Packet2d& v) {
|
|
255
302
|
Packet vt;
|
|
256
303
|
vt.v2d = v;
|
|
257
304
|
s << vt.d[0] << ", " << vt.d[1];
|
|
258
305
|
return s;
|
|
259
306
|
}
|
|
260
307
|
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
case 0:
|
|
268
|
-
splat.v4f[0] = vec_splat(from.v4f[0], 0);
|
|
269
|
-
splat.v4f[1] = splat.v4f[0];
|
|
270
|
-
break;
|
|
271
|
-
case 1:
|
|
272
|
-
splat.v4f[0] = vec_splat(from.v4f[0], 1);
|
|
273
|
-
splat.v4f[1] = splat.v4f[0];
|
|
274
|
-
break;
|
|
275
|
-
case 2:
|
|
276
|
-
splat.v4f[0] = vec_splat(from.v4f[1], 0);
|
|
277
|
-
splat.v4f[1] = splat.v4f[0];
|
|
278
|
-
break;
|
|
279
|
-
case 3:
|
|
280
|
-
splat.v4f[0] = vec_splat(from.v4f[1], 1);
|
|
281
|
-
splat.v4f[1] = splat.v4f[0];
|
|
282
|
-
break;
|
|
283
|
-
}
|
|
284
|
-
return splat;
|
|
308
|
+
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
|
|
309
|
+
inline std::ostream& operator<<(std::ostream& s, const Packet4f& v) {
|
|
310
|
+
Packet vt;
|
|
311
|
+
vt.v4f = v;
|
|
312
|
+
s << vt.f[0] << ", " << vt.f[1] << ", " << vt.f[2] << ", " << vt.f[3];
|
|
313
|
+
return s;
|
|
285
314
|
}
|
|
315
|
+
#endif
|
|
286
316
|
|
|
287
|
-
template
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
317
|
+
template <>
|
|
318
|
+
EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
|
|
319
|
+
EIGEN_DEBUG_ALIGNED_LOAD
|
|
320
|
+
return vec_xl(0, from);
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
template <>
|
|
324
|
+
EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
|
|
325
|
+
EIGEN_DEBUG_ALIGNED_LOAD
|
|
326
|
+
return vec_xl(0, from);
|
|
327
|
+
}
|
|
328
|
+
|
|
329
|
+
template <>
|
|
330
|
+
EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
|
|
331
|
+
EIGEN_DEBUG_ALIGNED_STORE
|
|
332
|
+
vec_xst(from, 0, to);
|
|
333
|
+
}
|
|
334
|
+
|
|
335
|
+
template <>
|
|
336
|
+
EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
|
|
337
|
+
EIGEN_DEBUG_ALIGNED_STORE
|
|
338
|
+
vec_xst(from, 0, to);
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
template <>
|
|
342
|
+
EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
|
|
343
|
+
return pfrexp_generic(a, exponent);
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
template <>
|
|
347
|
+
EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
|
|
348
|
+
return pfrexp_generic(a, exponent);
|
|
349
|
+
}
|
|
302
350
|
|
|
303
|
-
|
|
351
|
+
template <>
|
|
352
|
+
EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
|
|
353
|
+
return vec_splats(from);
|
|
354
|
+
}
|
|
355
|
+
template <>
|
|
356
|
+
EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
|
|
357
|
+
return vec_splats(from);
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
template <>
|
|
361
|
+
EIGEN_STRONG_INLINE void pbroadcast4<Packet4i>(const int* a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) {
|
|
362
|
+
a3 = pload<Packet4i>(a);
|
|
363
|
+
a0 = vec_splat(a3, 0);
|
|
364
|
+
a1 = vec_splat(a3, 1);
|
|
365
|
+
a2 = vec_splat(a3, 2);
|
|
366
|
+
a3 = vec_splat(a3, 3);
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
template <>
|
|
370
|
+
EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
|
|
371
|
+
Packet2d& a3) {
|
|
372
|
+
a1 = pload<Packet2d>(a);
|
|
373
|
+
a0 = vec_splat(a1, 0);
|
|
374
|
+
a1 = vec_splat(a1, 1);
|
|
375
|
+
a3 = pload<Packet2d>(a + 2);
|
|
376
|
+
a2 = vec_splat(a3, 0);
|
|
377
|
+
a3 = vec_splat(a3, 1);
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
template <>
|
|
381
|
+
EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
|
|
382
|
+
EIGEN_ALIGN16 int ai[4];
|
|
383
|
+
ai[0] = from[0 * stride];
|
|
384
|
+
ai[1] = from[1 * stride];
|
|
385
|
+
ai[2] = from[2 * stride];
|
|
386
|
+
ai[3] = from[3 * stride];
|
|
387
|
+
return pload<Packet4i>(ai);
|
|
388
|
+
}
|
|
389
|
+
|
|
390
|
+
template <>
|
|
391
|
+
EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
|
|
392
|
+
EIGEN_ALIGN16 double af[2];
|
|
393
|
+
af[0] = from[0 * stride];
|
|
394
|
+
af[1] = from[1 * stride];
|
|
395
|
+
return pload<Packet2d>(af);
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
template <>
|
|
399
|
+
EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
|
|
400
|
+
EIGEN_ALIGN16 int ai[4];
|
|
401
|
+
pstore<int>((int*)ai, from);
|
|
402
|
+
to[0 * stride] = ai[0];
|
|
403
|
+
to[1 * stride] = ai[1];
|
|
404
|
+
to[2 * stride] = ai[2];
|
|
405
|
+
to[3 * stride] = ai[3];
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
template <>
|
|
409
|
+
EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
|
|
410
|
+
EIGEN_ALIGN16 double af[2];
|
|
411
|
+
pstore<double>(af, from);
|
|
412
|
+
to[0 * stride] = af[0];
|
|
413
|
+
to[1 * stride] = af[1];
|
|
414
|
+
}
|
|
415
|
+
|
|
416
|
+
template <>
|
|
417
|
+
EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
418
|
+
return (a + b);
|
|
419
|
+
}
|
|
420
|
+
template <>
|
|
421
|
+
EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
422
|
+
return (a + b);
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
template <>
|
|
426
|
+
EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
427
|
+
return (a - b);
|
|
428
|
+
}
|
|
429
|
+
template <>
|
|
430
|
+
EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
431
|
+
return (a - b);
|
|
432
|
+
}
|
|
433
|
+
|
|
434
|
+
template <>
|
|
435
|
+
EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
436
|
+
return (a * b);
|
|
437
|
+
}
|
|
438
|
+
template <>
|
|
439
|
+
EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
440
|
+
return (a * b);
|
|
441
|
+
}
|
|
442
|
+
|
|
443
|
+
template <>
|
|
444
|
+
EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
445
|
+
return (a / b);
|
|
446
|
+
}
|
|
447
|
+
template <>
|
|
448
|
+
EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
449
|
+
return (a / b);
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
template <>
|
|
453
|
+
EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
|
|
454
|
+
return (-a);
|
|
455
|
+
}
|
|
456
|
+
template <>
|
|
457
|
+
EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
|
|
458
|
+
return (-a);
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
template <>
|
|
462
|
+
EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
|
|
463
|
+
return a;
|
|
464
|
+
}
|
|
465
|
+
template <>
|
|
466
|
+
EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
|
|
467
|
+
return a;
|
|
468
|
+
}
|
|
469
|
+
|
|
470
|
+
template <>
|
|
471
|
+
EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
|
|
472
|
+
return padd<Packet4i>(pmul<Packet4i>(a, b), c);
|
|
473
|
+
}
|
|
474
|
+
template <>
|
|
475
|
+
EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
|
|
476
|
+
return vec_madd(a, b, c);
|
|
477
|
+
}
|
|
478
|
+
|
|
479
|
+
template <>
|
|
480
|
+
EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
|
|
481
|
+
return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN);
|
|
482
|
+
}
|
|
483
|
+
template <>
|
|
484
|
+
EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
|
|
485
|
+
return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN);
|
|
486
|
+
}
|
|
487
|
+
|
|
488
|
+
template <>
|
|
489
|
+
EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
490
|
+
return vec_min(a, b);
|
|
491
|
+
}
|
|
492
|
+
template <>
|
|
493
|
+
EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
494
|
+
return vec_min(a, b);
|
|
495
|
+
}
|
|
496
|
+
|
|
497
|
+
template <>
|
|
498
|
+
EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
499
|
+
return vec_max(a, b);
|
|
500
|
+
}
|
|
501
|
+
template <>
|
|
502
|
+
EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
503
|
+
return vec_max(a, b);
|
|
504
|
+
}
|
|
505
|
+
|
|
506
|
+
template <>
|
|
507
|
+
EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
508
|
+
return vec_and(a, b);
|
|
509
|
+
}
|
|
510
|
+
template <>
|
|
511
|
+
EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
512
|
+
return vec_and(a, b);
|
|
513
|
+
}
|
|
514
|
+
|
|
515
|
+
template <>
|
|
516
|
+
EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
517
|
+
return vec_or(a, b);
|
|
518
|
+
}
|
|
519
|
+
template <>
|
|
520
|
+
EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
521
|
+
return vec_or(a, b);
|
|
522
|
+
}
|
|
523
|
+
|
|
524
|
+
template <>
|
|
525
|
+
EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
526
|
+
return vec_xor(a, b);
|
|
527
|
+
}
|
|
528
|
+
template <>
|
|
529
|
+
EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
530
|
+
return vec_xor(a, b);
|
|
531
|
+
}
|
|
532
|
+
|
|
533
|
+
template <>
|
|
534
|
+
EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
|
535
|
+
return pand<Packet4i>(a, vec_nor(b, b));
|
|
536
|
+
}
|
|
537
|
+
template <>
|
|
538
|
+
EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
|
|
539
|
+
return vec_and(a, vec_nor(b, b));
|
|
540
|
+
}
|
|
541
|
+
|
|
542
|
+
template <>
|
|
543
|
+
EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
|
|
544
|
+
/* Uses non-default rounding for vec_round */
|
|
545
|
+
return __builtin_s390_vfidb(a, 0, 1);
|
|
546
|
+
}
|
|
547
|
+
template <>
|
|
548
|
+
EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
|
|
549
|
+
return vec_ceil(a);
|
|
550
|
+
}
|
|
551
|
+
template <>
|
|
552
|
+
EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
|
|
553
|
+
return vec_floor(a);
|
|
554
|
+
}
|
|
555
|
+
|
|
556
|
+
template <>
|
|
557
|
+
EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
|
|
558
|
+
return pload<Packet4i>(from);
|
|
559
|
+
}
|
|
560
|
+
template <>
|
|
561
|
+
EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
|
|
562
|
+
return pload<Packet2d>(from);
|
|
563
|
+
}
|
|
564
|
+
|
|
565
|
+
template <>
|
|
566
|
+
EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
|
|
567
|
+
Packet4i p = pload<Packet4i>(from);
|
|
568
|
+
return vec_perm(p, p, p16uc_DUPLICATE32_HI);
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
template <>
|
|
572
|
+
EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
|
|
573
|
+
Packet2d p = pload<Packet2d>(from);
|
|
574
|
+
return vec_perm(p, p, p16uc_PSET64_HI);
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
template <>
|
|
578
|
+
EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
|
|
579
|
+
pstore<int>(to, from);
|
|
580
|
+
}
|
|
581
|
+
template <>
|
|
582
|
+
EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
|
|
583
|
+
pstore<double>(to, from);
|
|
584
|
+
}
|
|
585
|
+
|
|
586
|
+
template <>
|
|
587
|
+
EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
|
|
588
|
+
EIGEN_ZVECTOR_PREFETCH(addr);
|
|
589
|
+
}
|
|
590
|
+
template <>
|
|
591
|
+
EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
|
|
592
|
+
EIGEN_ZVECTOR_PREFETCH(addr);
|
|
593
|
+
}
|
|
594
|
+
|
|
595
|
+
template <int N>
|
|
596
|
+
EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) {
|
|
597
|
+
return Packet2l { parithmetic_shift_right<N>(a[0]), parithmetic_shift_right<N>(a[1]) };
|
|
598
|
+
}
|
|
599
|
+
template <int N>
|
|
600
|
+
EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
|
|
601
|
+
return Packet4i {
|
|
602
|
+
parithmetic_shift_right<N>(a[0]),
|
|
603
|
+
parithmetic_shift_right<N>(a[1]),
|
|
604
|
+
parithmetic_shift_right<N>(a[2]),
|
|
605
|
+
parithmetic_shift_right<N>(a[3]) };
|
|
606
|
+
}
|
|
607
|
+
|
|
608
|
+
template <int N>
|
|
609
|
+
EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
|
|
610
|
+
return Packet2l { plogical_shift_right<N>(a[0]), plogical_shift_right<N>(a[1]) };
|
|
611
|
+
}
|
|
612
|
+
template <int N>
|
|
613
|
+
EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
|
|
614
|
+
return Packet4i {
|
|
615
|
+
plogical_shift_right<N>(a[0]),
|
|
616
|
+
plogical_shift_right<N>(a[1]),
|
|
617
|
+
plogical_shift_right<N>(a[2]),
|
|
618
|
+
plogical_shift_right<N>(a[3]) };
|
|
619
|
+
}
|
|
620
|
+
|
|
621
|
+
template <int N>
|
|
622
|
+
EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
|
|
623
|
+
return Packet2l { plogical_shift_left<N>(a[0]), plogical_shift_left<N>(a[1]) };
|
|
624
|
+
}
|
|
625
|
+
template <int N>
|
|
626
|
+
EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
|
|
627
|
+
return Packet4i {
|
|
628
|
+
plogical_shift_left<N>(a[0]),
|
|
629
|
+
plogical_shift_left<N>(a[1]),
|
|
630
|
+
plogical_shift_left<N>(a[2]),
|
|
631
|
+
plogical_shift_left<N>(a[3]) };
|
|
632
|
+
}
|
|
633
|
+
|
|
634
|
+
template <>
|
|
635
|
+
EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
|
|
636
|
+
EIGEN_ALIGN16 int x[4];
|
|
637
|
+
pstore(x, a);
|
|
638
|
+
return x[0];
|
|
639
|
+
}
|
|
640
|
+
template <>
|
|
641
|
+
EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
|
|
642
|
+
EIGEN_ALIGN16 double x[2];
|
|
643
|
+
pstore(x, a);
|
|
644
|
+
return x[0];
|
|
645
|
+
}
|
|
646
|
+
|
|
647
|
+
template <>
|
|
648
|
+
EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
|
|
649
|
+
return reinterpret_cast<Packet4i>(
|
|
650
|
+
vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
|
|
651
|
+
}
|
|
652
|
+
|
|
653
|
+
template <>
|
|
654
|
+
EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
|
|
655
|
+
return reinterpret_cast<Packet2d>(
|
|
656
|
+
vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
|
|
657
|
+
}
|
|
658
|
+
|
|
659
|
+
template <>
|
|
660
|
+
EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) {
|
|
661
|
+
return vec_abs(a);
|
|
662
|
+
}
|
|
663
|
+
template <>
|
|
664
|
+
EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) {
|
|
665
|
+
return vec_abs(a);
|
|
666
|
+
}
|
|
667
|
+
|
|
668
|
+
template <>
|
|
669
|
+
EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
|
|
670
|
+
Packet4i b, sum;
|
|
671
|
+
b = vec_sld(a, a, 8);
|
|
672
|
+
sum = padd<Packet4i>(a, b);
|
|
673
|
+
b = vec_sld(sum, sum, 4);
|
|
674
|
+
sum = padd<Packet4i>(sum, b);
|
|
675
|
+
return pfirst(sum);
|
|
676
|
+
}
|
|
677
|
+
|
|
678
|
+
template <>
|
|
679
|
+
EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
|
|
680
|
+
Packet2d b, sum;
|
|
681
|
+
b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8));
|
|
682
|
+
sum = padd<Packet2d>(a, b);
|
|
683
|
+
return pfirst(sum);
|
|
684
|
+
}
|
|
685
|
+
|
|
686
|
+
// Other reduction functions:
|
|
687
|
+
// mul
|
|
688
|
+
template <>
|
|
689
|
+
EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
|
|
690
|
+
EIGEN_ALIGN16 int aux[4];
|
|
691
|
+
pstore(aux, a);
|
|
692
|
+
return aux[0] * aux[1] * aux[2] * aux[3];
|
|
693
|
+
}
|
|
694
|
+
|
|
695
|
+
template <>
|
|
696
|
+
EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
|
|
697
|
+
return pfirst(
|
|
698
|
+
pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
|
|
699
|
+
}
|
|
700
|
+
|
|
701
|
+
// min
|
|
702
|
+
template <>
|
|
703
|
+
EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
|
|
704
|
+
Packet4i b, res;
|
|
705
|
+
b = pmin<Packet4i>(a, vec_sld(a, a, 8));
|
|
706
|
+
res = pmin<Packet4i>(b, vec_sld(b, b, 4));
|
|
707
|
+
return pfirst(res);
|
|
708
|
+
}
|
|
709
|
+
|
|
710
|
+
template <>
|
|
711
|
+
EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
|
|
712
|
+
return pfirst(pmin<Packet2d>(
|
|
713
|
+
a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
|
|
714
|
+
}
|
|
715
|
+
|
|
716
|
+
// max
|
|
717
|
+
template <>
|
|
718
|
+
EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
|
|
719
|
+
Packet4i b, res;
|
|
720
|
+
b = pmax<Packet4i>(a, vec_sld(a, a, 8));
|
|
721
|
+
res = pmax<Packet4i>(b, vec_sld(b, b, 4));
|
|
722
|
+
return pfirst(res);
|
|
723
|
+
}
|
|
724
|
+
|
|
725
|
+
// max
|
|
726
|
+
template <>
|
|
727
|
+
EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
|
|
728
|
+
return pfirst(pmax<Packet2d>(
|
|
729
|
+
a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
|
|
730
|
+
}
|
|
731
|
+
|
|
732
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
|
|
733
|
+
Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
|
|
734
|
+
Packet4i t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
|
|
735
|
+
Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
|
|
736
|
+
Packet4i t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
|
|
737
|
+
kernel.packet[0] = vec_mergeh(t0, t2);
|
|
738
|
+
kernel.packet[1] = vec_mergel(t0, t2);
|
|
739
|
+
kernel.packet[2] = vec_mergeh(t1, t3);
|
|
740
|
+
kernel.packet[3] = vec_mergel(t1, t3);
|
|
741
|
+
}
|
|
742
|
+
|
|
743
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
|
|
744
|
+
Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
|
|
745
|
+
Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
|
|
746
|
+
kernel.packet[0] = t0;
|
|
747
|
+
kernel.packet[1] = t1;
|
|
748
|
+
}
|
|
749
|
+
|
|
750
|
+
template <>
|
|
751
|
+
EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
|
|
752
|
+
const Packet4i& elsePacket) {
|
|
753
|
+
Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
|
|
754
|
+
Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
|
|
755
|
+
return vec_sel(elsePacket, thenPacket, mask);
|
|
756
|
+
}
|
|
757
|
+
|
|
758
|
+
template <>
|
|
759
|
+
EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
|
|
760
|
+
const Packet2d& elsePacket) {
|
|
761
|
+
Packet2ul select = {ifPacket.select[0], ifPacket.select[1]};
|
|
762
|
+
Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
|
|
763
|
+
return vec_sel(elsePacket, thenPacket, mask);
|
|
764
|
+
}
|
|
765
|
+
|
|
766
|
+
/* z13 has no vector float support so we emulate that with double
|
|
767
|
+
z14 has proper vector float support.
|
|
768
|
+
*/
|
|
769
|
+
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
|
|
770
|
+
/* Helper function to simulate a vec_splat_packet4f
|
|
304
771
|
*/
|
|
305
|
-
template<int
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
772
|
+
template <int element>
|
|
773
|
+
EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) {
|
|
774
|
+
Packet4f splat;
|
|
775
|
+
switch (element) {
|
|
776
|
+
case 0:
|
|
777
|
+
splat.v4f[0] = vec_splat(from.v4f[0], 0);
|
|
778
|
+
splat.v4f[1] = splat.v4f[0];
|
|
779
|
+
break;
|
|
311
780
|
case 1:
|
|
312
|
-
|
|
313
|
-
|
|
781
|
+
splat.v4f[0] = vec_splat(from.v4f[0], 1);
|
|
782
|
+
splat.v4f[1] = splat.v4f[0];
|
|
314
783
|
break;
|
|
315
784
|
case 2:
|
|
316
|
-
|
|
317
|
-
|
|
785
|
+
splat.v4f[0] = vec_splat(from.v4f[1], 0);
|
|
786
|
+
splat.v4f[1] = splat.v4f[0];
|
|
318
787
|
break;
|
|
319
788
|
case 3:
|
|
320
|
-
|
|
321
|
-
|
|
789
|
+
splat.v4f[0] = vec_splat(from.v4f[1], 1);
|
|
790
|
+
splat.v4f[1] = splat.v4f[0];
|
|
322
791
|
break;
|
|
323
|
-
}
|
|
324
|
-
}
|
|
325
|
-
};
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
template<int Offset>
|
|
329
|
-
struct palign_impl<Offset,Packet2d>
|
|
330
|
-
{
|
|
331
|
-
static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
|
|
332
|
-
{
|
|
333
|
-
if (Offset == 1)
|
|
334
|
-
first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(first), reinterpret_cast<Packet4i>(second), 8));
|
|
335
792
|
}
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
|
|
339
|
-
{
|
|
340
|
-
// FIXME: No intrinsic yet
|
|
341
|
-
EIGEN_DEBUG_ALIGNED_LOAD
|
|
342
|
-
Packet *vfrom;
|
|
343
|
-
vfrom = (Packet *) from;
|
|
344
|
-
return vfrom->v4i;
|
|
793
|
+
return splat;
|
|
345
794
|
}
|
|
346
795
|
|
|
347
|
-
template<>
|
|
348
|
-
{
|
|
796
|
+
template <>
|
|
797
|
+
EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
|
|
349
798
|
// FIXME: No intrinsic yet
|
|
350
799
|
EIGEN_DEBUG_ALIGNED_LOAD
|
|
351
800
|
Packet4f vfrom;
|
|
@@ -354,72 +803,24 @@ template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
|
|
|
354
803
|
return vfrom;
|
|
355
804
|
}
|
|
356
805
|
|
|
357
|
-
template<>
|
|
358
|
-
{
|
|
359
|
-
// FIXME: No intrinsic yet
|
|
360
|
-
EIGEN_DEBUG_ALIGNED_LOAD
|
|
361
|
-
Packet *vfrom;
|
|
362
|
-
vfrom = (Packet *) from;
|
|
363
|
-
return vfrom->v2d;
|
|
364
|
-
}
|
|
365
|
-
|
|
366
|
-
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from)
|
|
367
|
-
{
|
|
368
|
-
// FIXME: No intrinsic yet
|
|
369
|
-
EIGEN_DEBUG_ALIGNED_STORE
|
|
370
|
-
Packet *vto;
|
|
371
|
-
vto = (Packet *) to;
|
|
372
|
-
vto->v4i = from;
|
|
373
|
-
}
|
|
374
|
-
|
|
375
|
-
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
|
|
376
|
-
{
|
|
806
|
+
template <>
|
|
807
|
+
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
|
|
377
808
|
// FIXME: No intrinsic yet
|
|
378
809
|
EIGEN_DEBUG_ALIGNED_STORE
|
|
379
810
|
vec_st2f(from.v4f[0], &to[0]);
|
|
380
811
|
vec_st2f(from.v4f[1], &to[2]);
|
|
381
812
|
}
|
|
382
813
|
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
{
|
|
386
|
-
// FIXME: No intrinsic yet
|
|
387
|
-
EIGEN_DEBUG_ALIGNED_STORE
|
|
388
|
-
Packet *vto;
|
|
389
|
-
vto = (Packet *) to;
|
|
390
|
-
vto->v2d = from;
|
|
391
|
-
}
|
|
392
|
-
|
|
393
|
-
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from)
|
|
394
|
-
{
|
|
395
|
-
return vec_splats(from);
|
|
396
|
-
}
|
|
397
|
-
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
|
|
398
|
-
return vec_splats(from);
|
|
399
|
-
}
|
|
400
|
-
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
|
|
401
|
-
{
|
|
814
|
+
template <>
|
|
815
|
+
EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
|
|
402
816
|
Packet4f to;
|
|
403
817
|
to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
|
|
404
818
|
to.v4f[1] = to.v4f[0];
|
|
405
819
|
return to;
|
|
406
820
|
}
|
|
407
821
|
|
|
408
|
-
template<>
|
|
409
|
-
pbroadcast4<
|
|
410
|
-
Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
|
|
411
|
-
{
|
|
412
|
-
a3 = pload<Packet4i>(a);
|
|
413
|
-
a0 = vec_splat(a3, 0);
|
|
414
|
-
a1 = vec_splat(a3, 1);
|
|
415
|
-
a2 = vec_splat(a3, 2);
|
|
416
|
-
a3 = vec_splat(a3, 3);
|
|
417
|
-
}
|
|
418
|
-
|
|
419
|
-
template<> EIGEN_STRONG_INLINE void
|
|
420
|
-
pbroadcast4<Packet4f>(const float *a,
|
|
421
|
-
Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
|
|
422
|
-
{
|
|
822
|
+
template <>
|
|
823
|
+
EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
|
|
423
824
|
a3 = pload<Packet4f>(a);
|
|
424
825
|
a0 = vec_splat_packet4f<0>(a3);
|
|
425
826
|
a1 = vec_splat_packet4f<1>(a3);
|
|
@@ -427,461 +828,213 @@ pbroadcast4<Packet4f>(const float *a,
|
|
|
427
828
|
a3 = vec_splat_packet4f<3>(a3);
|
|
428
829
|
}
|
|
429
830
|
|
|
430
|
-
template<>
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
a2 = vec_splat(a3, 0);
|
|
439
|
-
a3 = vec_splat(a3, 1);
|
|
831
|
+
template <>
|
|
832
|
+
EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
|
|
833
|
+
EIGEN_ALIGN16 float ai[4];
|
|
834
|
+
ai[0] = from[0 * stride];
|
|
835
|
+
ai[1] = from[1 * stride];
|
|
836
|
+
ai[2] = from[2 * stride];
|
|
837
|
+
ai[3] = from[3 * stride];
|
|
838
|
+
return pload<Packet4f>(ai);
|
|
440
839
|
}
|
|
441
840
|
|
|
442
|
-
template<>
|
|
443
|
-
{
|
|
444
|
-
|
|
445
|
-
ai
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
}
|
|
451
|
-
|
|
452
|
-
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
|
|
453
|
-
{
|
|
454
|
-
float EIGEN_ALIGN16 ai[4];
|
|
455
|
-
ai[0] = from[0*stride];
|
|
456
|
-
ai[1] = from[1*stride];
|
|
457
|
-
ai[2] = from[2*stride];
|
|
458
|
-
ai[3] = from[3*stride];
|
|
459
|
-
return pload<Packet4f>(ai);
|
|
460
|
-
}
|
|
461
|
-
|
|
462
|
-
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
|
|
463
|
-
{
|
|
464
|
-
double EIGEN_ALIGN16 af[2];
|
|
465
|
-
af[0] = from[0*stride];
|
|
466
|
-
af[1] = from[1*stride];
|
|
467
|
-
return pload<Packet2d>(af);
|
|
468
|
-
}
|
|
469
|
-
|
|
470
|
-
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
|
|
471
|
-
{
|
|
472
|
-
int EIGEN_ALIGN16 ai[4];
|
|
473
|
-
pstore<int>((int *)ai, from);
|
|
474
|
-
to[0*stride] = ai[0];
|
|
475
|
-
to[1*stride] = ai[1];
|
|
476
|
-
to[2*stride] = ai[2];
|
|
477
|
-
to[3*stride] = ai[3];
|
|
478
|
-
}
|
|
479
|
-
|
|
480
|
-
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
|
|
481
|
-
{
|
|
482
|
-
float EIGEN_ALIGN16 ai[4];
|
|
483
|
-
pstore<float>((float *)ai, from);
|
|
484
|
-
to[0*stride] = ai[0];
|
|
485
|
-
to[1*stride] = ai[1];
|
|
486
|
-
to[2*stride] = ai[2];
|
|
487
|
-
to[3*stride] = ai[3];
|
|
488
|
-
}
|
|
489
|
-
|
|
490
|
-
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
|
|
491
|
-
{
|
|
492
|
-
double EIGEN_ALIGN16 af[2];
|
|
493
|
-
pstore<double>(af, from);
|
|
494
|
-
to[0*stride] = af[0];
|
|
495
|
-
to[1*stride] = af[1];
|
|
841
|
+
template <>
|
|
842
|
+
EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
|
|
843
|
+
EIGEN_ALIGN16 float ai[4];
|
|
844
|
+
pstore<float>((float*)ai, from);
|
|
845
|
+
to[0 * stride] = ai[0];
|
|
846
|
+
to[1 * stride] = ai[1];
|
|
847
|
+
to[2 * stride] = ai[2];
|
|
848
|
+
to[3 * stride] = ai[3];
|
|
496
849
|
}
|
|
497
850
|
|
|
498
|
-
template<>
|
|
499
|
-
|
|
500
|
-
{
|
|
851
|
+
template <>
|
|
852
|
+
EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
501
853
|
Packet4f c;
|
|
502
854
|
c.v4f[0] = a.v4f[0] + b.v4f[0];
|
|
503
855
|
c.v4f[1] = a.v4f[1] + b.v4f[1];
|
|
504
856
|
return c;
|
|
505
857
|
}
|
|
506
|
-
template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a + b); }
|
|
507
858
|
|
|
508
|
-
template<>
|
|
509
|
-
|
|
510
|
-
{
|
|
859
|
+
template <>
|
|
860
|
+
EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
511
861
|
Packet4f c;
|
|
512
862
|
c.v4f[0] = a.v4f[0] - b.v4f[0];
|
|
513
863
|
c.v4f[1] = a.v4f[1] - b.v4f[1];
|
|
514
864
|
return c;
|
|
515
865
|
}
|
|
516
|
-
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a - b); }
|
|
517
866
|
|
|
518
|
-
template<>
|
|
519
|
-
|
|
520
|
-
{
|
|
867
|
+
template <>
|
|
868
|
+
EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
521
869
|
Packet4f c;
|
|
522
870
|
c.v4f[0] = a.v4f[0] * b.v4f[0];
|
|
523
871
|
c.v4f[1] = a.v4f[1] * b.v4f[1];
|
|
524
872
|
return c;
|
|
525
873
|
}
|
|
526
|
-
template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a * b); }
|
|
527
874
|
|
|
528
|
-
template<>
|
|
529
|
-
|
|
530
|
-
{
|
|
875
|
+
template <>
|
|
876
|
+
EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
531
877
|
Packet4f c;
|
|
532
878
|
c.v4f[0] = a.v4f[0] / b.v4f[0];
|
|
533
879
|
c.v4f[1] = a.v4f[1] / b.v4f[1];
|
|
534
880
|
return c;
|
|
535
881
|
}
|
|
536
|
-
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a / b); }
|
|
537
882
|
|
|
538
|
-
template<>
|
|
539
|
-
|
|
540
|
-
{
|
|
883
|
+
template <>
|
|
884
|
+
EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
|
|
541
885
|
Packet4f c;
|
|
542
886
|
c.v4f[0] = -a.v4f[0];
|
|
543
887
|
c.v4f[1] = -a.v4f[1];
|
|
544
888
|
return c;
|
|
545
889
|
}
|
|
546
|
-
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); }
|
|
547
890
|
|
|
548
|
-
template<>
|
|
549
|
-
|
|
550
|
-
template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
|
|
551
|
-
|
|
552
|
-
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd<Packet4i>(pmul<Packet4i>(a, b), c); }
|
|
553
|
-
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
|
|
554
|
-
{
|
|
891
|
+
template <>
|
|
892
|
+
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
|
|
555
893
|
Packet4f res;
|
|
556
894
|
res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
|
|
557
895
|
res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
|
|
558
896
|
return res;
|
|
559
897
|
}
|
|
560
|
-
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
|
|
561
|
-
|
|
562
|
-
template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN); }
|
|
563
|
-
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
|
|
564
|
-
template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN); }
|
|
565
898
|
|
|
566
|
-
template<>
|
|
567
|
-
|
|
568
|
-
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
|
|
569
|
-
{
|
|
899
|
+
template <>
|
|
900
|
+
EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
570
901
|
Packet4f res;
|
|
571
902
|
res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
|
|
572
903
|
res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
|
|
573
904
|
return res;
|
|
574
905
|
}
|
|
575
906
|
|
|
576
|
-
template<>
|
|
577
|
-
|
|
578
|
-
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
|
|
579
|
-
{
|
|
907
|
+
template <>
|
|
908
|
+
EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
580
909
|
Packet4f res;
|
|
581
910
|
res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
|
|
582
911
|
res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
|
|
583
912
|
return res;
|
|
584
913
|
}
|
|
585
914
|
|
|
586
|
-
template<>
|
|
587
|
-
|
|
588
|
-
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
|
|
589
|
-
{
|
|
915
|
+
template <>
|
|
916
|
+
EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
590
917
|
Packet4f res;
|
|
591
918
|
res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
|
|
592
919
|
res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
|
|
593
920
|
return res;
|
|
594
921
|
}
|
|
595
922
|
|
|
596
|
-
template<>
|
|
597
|
-
|
|
598
|
-
template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
|
|
599
|
-
{
|
|
923
|
+
template <>
|
|
924
|
+
EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
600
925
|
Packet4f res;
|
|
601
|
-
res.v4f[0] =
|
|
602
|
-
res.v4f[1] =
|
|
926
|
+
res.v4f[0] = por(a.v4f[0], b.v4f[0]);
|
|
927
|
+
res.v4f[1] = por(a.v4f[1], b.v4f[1]);
|
|
603
928
|
return res;
|
|
604
929
|
}
|
|
605
930
|
|
|
606
|
-
template<>
|
|
607
|
-
|
|
608
|
-
template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
|
|
609
|
-
{
|
|
931
|
+
template <>
|
|
932
|
+
EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
610
933
|
Packet4f res;
|
|
611
|
-
res.v4f[0] =
|
|
612
|
-
res.v4f[1] =
|
|
934
|
+
res.v4f[0] = pxor(a.v4f[0], b.v4f[0]);
|
|
935
|
+
res.v4f[1] = pxor(a.v4f[1], b.v4f[1]);
|
|
613
936
|
return res;
|
|
614
937
|
}
|
|
615
938
|
|
|
616
|
-
template<>
|
|
617
|
-
|
|
618
|
-
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
|
|
619
|
-
{
|
|
939
|
+
template <>
|
|
940
|
+
EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
620
941
|
Packet4f res;
|
|
621
942
|
res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
|
|
622
943
|
res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
|
|
623
944
|
return res;
|
|
624
945
|
}
|
|
625
946
|
|
|
626
|
-
template<>
|
|
627
|
-
{
|
|
947
|
+
template <>
|
|
948
|
+
EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
|
|
628
949
|
Packet4f res;
|
|
629
|
-
res.v4f[0] =
|
|
630
|
-
res.v4f[1] =
|
|
950
|
+
res.v4f[0] = generic_round(a.v4f[0]);
|
|
951
|
+
res.v4f[1] = generic_round(a.v4f[1]);
|
|
631
952
|
return res;
|
|
632
953
|
}
|
|
633
|
-
|
|
634
|
-
template<>
|
|
635
|
-
{
|
|
954
|
+
|
|
955
|
+
template <>
|
|
956
|
+
EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
|
|
636
957
|
Packet4f res;
|
|
637
958
|
res.v4f[0] = vec_ceil(a.v4f[0]);
|
|
638
959
|
res.v4f[1] = vec_ceil(a.v4f[1]);
|
|
639
960
|
return res;
|
|
640
961
|
}
|
|
641
|
-
|
|
642
|
-
template<>
|
|
643
|
-
{
|
|
962
|
+
|
|
963
|
+
template <>
|
|
964
|
+
EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
|
|
644
965
|
Packet4f res;
|
|
645
966
|
res.v4f[0] = vec_floor(a.v4f[0]);
|
|
646
967
|
res.v4f[1] = vec_floor(a.v4f[1]);
|
|
647
968
|
return res;
|
|
648
969
|
}
|
|
649
|
-
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
|
|
650
|
-
|
|
651
|
-
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { return pload<Packet4i>(from); }
|
|
652
|
-
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { return pload<Packet4f>(from); }
|
|
653
|
-
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { return pload<Packet2d>(from); }
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
|
|
657
|
-
{
|
|
658
|
-
Packet4i p = pload<Packet4i>(from);
|
|
659
|
-
return vec_perm(p, p, p16uc_DUPLICATE32_HI);
|
|
660
|
-
}
|
|
661
970
|
|
|
662
|
-
template<>
|
|
663
|
-
{
|
|
971
|
+
template <>
|
|
972
|
+
EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
|
|
664
973
|
Packet4f p = pload<Packet4f>(from);
|
|
665
974
|
p.v4f[1] = vec_splat(p.v4f[0], 1);
|
|
666
975
|
p.v4f[0] = vec_splat(p.v4f[0], 0);
|
|
667
976
|
return p;
|
|
668
977
|
}
|
|
669
978
|
|
|
670
|
-
template<>
|
|
671
|
-
{
|
|
672
|
-
|
|
673
|
-
|
|
979
|
+
template <>
|
|
980
|
+
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
|
|
981
|
+
EIGEN_ALIGN16 float x[2];
|
|
982
|
+
vec_st2f(a.v4f[0], &x[0]);
|
|
983
|
+
return x[0];
|
|
674
984
|
}
|
|
675
985
|
|
|
676
|
-
template<>
|
|
677
|
-
|
|
678
|
-
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { pstore<double>(to, from); }
|
|
679
|
-
|
|
680
|
-
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
|
|
681
|
-
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
|
|
682
|
-
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
|
|
683
|
-
|
|
684
|
-
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
|
|
685
|
-
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
|
|
686
|
-
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
|
|
687
|
-
|
|
688
|
-
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
|
|
689
|
-
{
|
|
690
|
-
return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
|
|
691
|
-
}
|
|
692
|
-
|
|
693
|
-
template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
|
|
694
|
-
{
|
|
695
|
-
return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
|
|
696
|
-
}
|
|
697
|
-
|
|
698
|
-
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
|
|
699
|
-
{
|
|
986
|
+
template <>
|
|
987
|
+
EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
|
|
700
988
|
Packet4f rev;
|
|
701
989
|
rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
|
|
702
990
|
rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
|
|
703
991
|
return rev;
|
|
704
992
|
}
|
|
705
993
|
|
|
706
|
-
template<>
|
|
707
|
-
|
|
708
|
-
template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
|
|
709
|
-
{
|
|
994
|
+
template <>
|
|
995
|
+
EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a) {
|
|
710
996
|
Packet4f res;
|
|
711
997
|
res.v4f[0] = pabs(a.v4f[0]);
|
|
712
998
|
res.v4f[1] = pabs(a.v4f[1]);
|
|
713
999
|
return res;
|
|
714
1000
|
}
|
|
715
1001
|
|
|
716
|
-
template<>
|
|
717
|
-
{
|
|
718
|
-
Packet4i b, sum;
|
|
719
|
-
b = vec_sld(a, a, 8);
|
|
720
|
-
sum = padd<Packet4i>(a, b);
|
|
721
|
-
b = vec_sld(sum, sum, 4);
|
|
722
|
-
sum = padd<Packet4i>(sum, b);
|
|
723
|
-
return pfirst(sum);
|
|
724
|
-
}
|
|
725
|
-
|
|
726
|
-
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
|
|
727
|
-
{
|
|
728
|
-
Packet2d b, sum;
|
|
729
|
-
b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8));
|
|
730
|
-
sum = padd<Packet2d>(a, b);
|
|
731
|
-
return pfirst(sum);
|
|
732
|
-
}
|
|
733
|
-
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
|
|
734
|
-
{
|
|
1002
|
+
template <>
|
|
1003
|
+
EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
|
|
735
1004
|
Packet2d sum;
|
|
736
1005
|
sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
|
|
737
1006
|
double first = predux<Packet2d>(sum);
|
|
738
1007
|
return static_cast<float>(first);
|
|
739
1008
|
}
|
|
740
1009
|
|
|
741
|
-
template<>
|
|
742
|
-
{
|
|
743
|
-
Packet4i v[4], sum[4];
|
|
744
|
-
|
|
745
|
-
// It's easier and faster to transpose then add as columns
|
|
746
|
-
// Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
|
|
747
|
-
// Do the transpose, first set of moves
|
|
748
|
-
v[0] = vec_mergeh(vecs[0], vecs[2]);
|
|
749
|
-
v[1] = vec_mergel(vecs[0], vecs[2]);
|
|
750
|
-
v[2] = vec_mergeh(vecs[1], vecs[3]);
|
|
751
|
-
v[3] = vec_mergel(vecs[1], vecs[3]);
|
|
752
|
-
// Get the resulting vectors
|
|
753
|
-
sum[0] = vec_mergeh(v[0], v[2]);
|
|
754
|
-
sum[1] = vec_mergel(v[0], v[2]);
|
|
755
|
-
sum[2] = vec_mergeh(v[1], v[3]);
|
|
756
|
-
sum[3] = vec_mergel(v[1], v[3]);
|
|
757
|
-
|
|
758
|
-
// Now do the summation:
|
|
759
|
-
// Lines 0+1
|
|
760
|
-
sum[0] = padd<Packet4i>(sum[0], sum[1]);
|
|
761
|
-
// Lines 2+3
|
|
762
|
-
sum[1] = padd<Packet4i>(sum[2], sum[3]);
|
|
763
|
-
// Add the results
|
|
764
|
-
sum[0] = padd<Packet4i>(sum[0], sum[1]);
|
|
765
|
-
|
|
766
|
-
return sum[0];
|
|
767
|
-
}
|
|
768
|
-
|
|
769
|
-
template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
|
|
770
|
-
{
|
|
771
|
-
Packet2d v[2], sum;
|
|
772
|
-
v[0] = padd<Packet2d>(vecs[0], reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(vecs[0]), reinterpret_cast<Packet4ui>(vecs[0]), 8)));
|
|
773
|
-
v[1] = padd<Packet2d>(vecs[1], reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(vecs[1]), reinterpret_cast<Packet4ui>(vecs[1]), 8)));
|
|
774
|
-
|
|
775
|
-
sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v[0]), reinterpret_cast<Packet4ui>(v[1]), 8));
|
|
776
|
-
|
|
777
|
-
return sum;
|
|
778
|
-
}
|
|
779
|
-
|
|
780
|
-
template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
|
|
781
|
-
{
|
|
782
|
-
PacketBlock<Packet4f,4> transpose;
|
|
783
|
-
transpose.packet[0] = vecs[0];
|
|
784
|
-
transpose.packet[1] = vecs[1];
|
|
785
|
-
transpose.packet[2] = vecs[2];
|
|
786
|
-
transpose.packet[3] = vecs[3];
|
|
787
|
-
ptranspose(transpose);
|
|
788
|
-
|
|
789
|
-
Packet4f sum = padd(transpose.packet[0], transpose.packet[1]);
|
|
790
|
-
sum = padd(sum, transpose.packet[2]);
|
|
791
|
-
sum = padd(sum, transpose.packet[3]);
|
|
792
|
-
return sum;
|
|
793
|
-
}
|
|
794
|
-
|
|
795
|
-
// Other reduction functions:
|
|
796
|
-
// mul
|
|
797
|
-
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
|
|
798
|
-
{
|
|
799
|
-
EIGEN_ALIGN16 int aux[4];
|
|
800
|
-
pstore(aux, a);
|
|
801
|
-
return aux[0] * aux[1] * aux[2] * aux[3];
|
|
802
|
-
}
|
|
803
|
-
|
|
804
|
-
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
|
|
805
|
-
{
|
|
806
|
-
return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
|
|
807
|
-
}
|
|
808
|
-
|
|
809
|
-
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
|
|
810
|
-
{
|
|
1010
|
+
template <>
|
|
1011
|
+
EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
|
|
811
1012
|
// Return predux_mul<Packet2d> of the subvectors product
|
|
812
1013
|
return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
|
|
813
1014
|
}
|
|
814
1015
|
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
{
|
|
818
|
-
Packet4i b, res;
|
|
819
|
-
b = pmin<Packet4i>(a, vec_sld(a, a, 8));
|
|
820
|
-
res = pmin<Packet4i>(b, vec_sld(b, b, 4));
|
|
821
|
-
return pfirst(res);
|
|
822
|
-
}
|
|
823
|
-
|
|
824
|
-
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
|
|
825
|
-
{
|
|
826
|
-
return pfirst(pmin<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
|
|
827
|
-
}
|
|
828
|
-
|
|
829
|
-
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
|
|
830
|
-
{
|
|
1016
|
+
template <>
|
|
1017
|
+
EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
|
|
831
1018
|
Packet2d b, res;
|
|
832
|
-
b
|
|
833
|
-
res = pmin<Packet2d>(
|
|
1019
|
+
b = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
|
|
1020
|
+
res = pmin<Packet2d>(
|
|
1021
|
+
b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
|
|
834
1022
|
return static_cast<float>(pfirst(res));
|
|
835
1023
|
}
|
|
836
1024
|
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
{
|
|
840
|
-
Packet4i b, res;
|
|
841
|
-
b = pmax<Packet4i>(a, vec_sld(a, a, 8));
|
|
842
|
-
res = pmax<Packet4i>(b, vec_sld(b, b, 4));
|
|
843
|
-
return pfirst(res);
|
|
844
|
-
}
|
|
845
|
-
|
|
846
|
-
// max
|
|
847
|
-
template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
|
|
848
|
-
{
|
|
849
|
-
return pfirst(pmax<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
|
|
850
|
-
}
|
|
851
|
-
|
|
852
|
-
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
|
|
853
|
-
{
|
|
1025
|
+
template <>
|
|
1026
|
+
EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
|
|
854
1027
|
Packet2d b, res;
|
|
855
|
-
b
|
|
856
|
-
res = pmax<Packet2d>(
|
|
1028
|
+
b = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
|
|
1029
|
+
res = pmax<Packet2d>(
|
|
1030
|
+
b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
|
|
857
1031
|
return static_cast<float>(pfirst(res));
|
|
858
1032
|
}
|
|
859
1033
|
|
|
860
|
-
EIGEN_DEVICE_FUNC inline void
|
|
861
|
-
ptranspose(PacketBlock<Packet4i,4>& kernel) {
|
|
862
|
-
Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
|
|
863
|
-
Packet4i t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
|
|
864
|
-
Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
|
|
865
|
-
Packet4i t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
|
|
866
|
-
kernel.packet[0] = vec_mergeh(t0, t2);
|
|
867
|
-
kernel.packet[1] = vec_mergel(t0, t2);
|
|
868
|
-
kernel.packet[2] = vec_mergeh(t1, t3);
|
|
869
|
-
kernel.packet[3] = vec_mergel(t1, t3);
|
|
870
|
-
}
|
|
871
|
-
|
|
872
|
-
EIGEN_DEVICE_FUNC inline void
|
|
873
|
-
ptranspose(PacketBlock<Packet2d,2>& kernel) {
|
|
874
|
-
Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
|
|
875
|
-
Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
|
|
876
|
-
kernel.packet[0] = t0;
|
|
877
|
-
kernel.packet[1] = t1;
|
|
878
|
-
}
|
|
879
|
-
|
|
880
1034
|
/* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one
|
|
881
1035
|
*/
|
|
882
|
-
EIGEN_DEVICE_FUNC inline void
|
|
883
|
-
|
|
884
|
-
PacketBlock<Packet2d,2> t0,t1,t2,t3;
|
|
1036
|
+
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
|
|
1037
|
+
PacketBlock<Packet2d, 2> t0, t1, t2, t3;
|
|
885
1038
|
// copy top-left 2x2 Packet2d block
|
|
886
1039
|
t0.packet[0] = kernel.packet[0].v4f[0];
|
|
887
1040
|
t0.packet[1] = kernel.packet[1].v4f[0];
|
|
@@ -915,15 +1068,11 @@ ptranspose(PacketBlock<Packet4f,4>& kernel) {
|
|
|
915
1068
|
kernel.packet[3].v4f[1] = t3.packet[1];
|
|
916
1069
|
}
|
|
917
1070
|
|
|
918
|
-
template<>
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
}
|
|
923
|
-
|
|
924
|
-
template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
|
|
925
|
-
Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] };
|
|
926
|
-
Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] };
|
|
1071
|
+
template <>
|
|
1072
|
+
EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
|
|
1073
|
+
const Packet4f& elsePacket) {
|
|
1074
|
+
Packet2ul select_hi = {ifPacket.select[0], ifPacket.select[1]};
|
|
1075
|
+
Packet2ul select_lo = {ifPacket.select[2], ifPacket.select[3]};
|
|
927
1076
|
Packet2ul mask_hi = vec_cmpeq(select_hi, reinterpret_cast<Packet2ul>(p2l_ONE));
|
|
928
1077
|
Packet2ul mask_lo = vec_cmpeq(select_lo, reinterpret_cast<Packet2ul>(p2l_ONE));
|
|
929
1078
|
Packet4f result;
|
|
@@ -932,14 +1081,333 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons
|
|
|
932
1081
|
return result;
|
|
933
1082
|
}
|
|
934
1083
|
|
|
935
|
-
template<>
|
|
936
|
-
|
|
937
|
-
|
|
1084
|
+
template <>
|
|
1085
|
+
Packet4f EIGEN_STRONG_INLINE pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1086
|
+
Packet4f res;
|
|
1087
|
+
res.v4f[0] = pcmp_le(a.v4f[0], b.v4f[0]);
|
|
1088
|
+
res.v4f[1] = pcmp_le(a.v4f[1], b.v4f[1]);
|
|
1089
|
+
return res;
|
|
1090
|
+
}
|
|
1091
|
+
|
|
1092
|
+
template <>
|
|
1093
|
+
Packet4f EIGEN_STRONG_INLINE pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1094
|
+
Packet4f res;
|
|
1095
|
+
res.v4f[0] = pcmp_lt(a.v4f[0], b.v4f[0]);
|
|
1096
|
+
res.v4f[1] = pcmp_lt(a.v4f[1], b.v4f[1]);
|
|
1097
|
+
return res;
|
|
1098
|
+
}
|
|
1099
|
+
|
|
1100
|
+
template <>
|
|
1101
|
+
Packet4f EIGEN_STRONG_INLINE pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1102
|
+
Packet4f res;
|
|
1103
|
+
res.v4f[0] = pcmp_eq(a.v4f[0], b.v4f[0]);
|
|
1104
|
+
res.v4f[1] = pcmp_eq(a.v4f[1], b.v4f[1]);
|
|
1105
|
+
return res;
|
|
1106
|
+
}
|
|
1107
|
+
|
|
1108
|
+
#else
|
|
1109
|
+
template <>
|
|
1110
|
+
EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
|
|
1111
|
+
EIGEN_DEBUG_ALIGNED_LOAD
|
|
1112
|
+
return vec_xl(0, from);
|
|
1113
|
+
}
|
|
1114
|
+
|
|
1115
|
+
template <>
|
|
1116
|
+
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
|
|
1117
|
+
EIGEN_DEBUG_ALIGNED_STORE
|
|
1118
|
+
vec_xst(from, 0, to);
|
|
1119
|
+
}
|
|
1120
|
+
|
|
1121
|
+
template <>
|
|
1122
|
+
EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
|
|
1123
|
+
return vec_splats(from);
|
|
1124
|
+
}
|
|
1125
|
+
|
|
1126
|
+
template <>
|
|
1127
|
+
EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
|
|
1128
|
+
a3 = pload<Packet4f>(a);
|
|
1129
|
+
a0 = vec_splat(a3, 0);
|
|
1130
|
+
a1 = vec_splat(a3, 1);
|
|
1131
|
+
a2 = vec_splat(a3, 2);
|
|
1132
|
+
a3 = vec_splat(a3, 3);
|
|
1133
|
+
}
|
|
1134
|
+
|
|
1135
|
+
template <>
|
|
1136
|
+
EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
|
|
1137
|
+
EIGEN_ALIGN16 float af[4];
|
|
1138
|
+
af[0] = from[0 * stride];
|
|
1139
|
+
af[1] = from[1 * stride];
|
|
1140
|
+
af[2] = from[2 * stride];
|
|
1141
|
+
af[3] = from[3 * stride];
|
|
1142
|
+
return pload<Packet4f>(af);
|
|
1143
|
+
}
|
|
1144
|
+
|
|
1145
|
+
template <>
|
|
1146
|
+
EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
|
|
1147
|
+
EIGEN_ALIGN16 float af[4];
|
|
1148
|
+
pstore<float>((float*)af, from);
|
|
1149
|
+
to[0 * stride] = af[0];
|
|
1150
|
+
to[1 * stride] = af[1];
|
|
1151
|
+
to[2 * stride] = af[2];
|
|
1152
|
+
to[3 * stride] = af[3];
|
|
1153
|
+
}
|
|
1154
|
+
|
|
1155
|
+
template <>
|
|
1156
|
+
EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1157
|
+
return (a + b);
|
|
1158
|
+
}
|
|
1159
|
+
template <>
|
|
1160
|
+
EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1161
|
+
return (a - b);
|
|
1162
|
+
}
|
|
1163
|
+
template <>
|
|
1164
|
+
EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1165
|
+
return (a * b);
|
|
1166
|
+
}
|
|
1167
|
+
template <>
|
|
1168
|
+
EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1169
|
+
return (a / b);
|
|
1170
|
+
}
|
|
1171
|
+
template <>
|
|
1172
|
+
EIGEN_STRONG_INLINE Packet4f pnegate<Packet4f>(const Packet4f& a) {
|
|
1173
|
+
return (-a);
|
|
1174
|
+
}
|
|
1175
|
+
template <>
|
|
1176
|
+
EIGEN_STRONG_INLINE Packet4f pconj<Packet4f>(const Packet4f& a) {
|
|
1177
|
+
return a;
|
|
1178
|
+
}
|
|
1179
|
+
template <>
|
|
1180
|
+
EIGEN_STRONG_INLINE Packet4f pmadd<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
|
|
1181
|
+
return vec_madd(a, b, c);
|
|
1182
|
+
}
|
|
1183
|
+
template <>
|
|
1184
|
+
EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1185
|
+
return vec_min(a, b);
|
|
1186
|
+
}
|
|
1187
|
+
template <>
|
|
1188
|
+
EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1189
|
+
return vec_max(a, b);
|
|
1190
|
+
}
|
|
1191
|
+
template <>
|
|
1192
|
+
EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1193
|
+
return vec_and(a, b);
|
|
1194
|
+
}
|
|
1195
|
+
template <>
|
|
1196
|
+
EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1197
|
+
return vec_or(a, b);
|
|
1198
|
+
}
|
|
1199
|
+
template <>
|
|
1200
|
+
EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1201
|
+
return vec_xor(a, b);
|
|
1202
|
+
}
|
|
1203
|
+
template <>
|
|
1204
|
+
EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
|
1205
|
+
return vec_and(a, vec_nor(b, b));
|
|
1206
|
+
}
|
|
1207
|
+
template <>
|
|
1208
|
+
EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
|
|
1209
|
+
/* Uses non-default rounding for vec_round */
|
|
1210
|
+
return __builtin_s390_vfisb(a, 0, 1);
|
|
1211
|
+
}
|
|
1212
|
+
template <>
|
|
1213
|
+
EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
|
|
1214
|
+
return vec_ceil(a);
|
|
1215
|
+
}
|
|
1216
|
+
template <>
|
|
1217
|
+
EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
|
|
1218
|
+
return vec_floor(a);
|
|
1219
|
+
}
|
|
1220
|
+
template <>
|
|
1221
|
+
EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a) {
|
|
1222
|
+
return vec_abs(a);
|
|
1223
|
+
}
|
|
1224
|
+
template <>
|
|
1225
|
+
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
|
|
1226
|
+
EIGEN_ALIGN16 float x[4];
|
|
1227
|
+
pstore(x, a);
|
|
1228
|
+
return x[0];
|
|
1229
|
+
}
|
|
1230
|
+
|
|
1231
|
+
template <>
|
|
1232
|
+
EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
|
|
1233
|
+
Packet4f p = pload<Packet4f>(from);
|
|
1234
|
+
return vec_perm(p, p, p16uc_DUPLICATE32_HI);
|
|
1235
|
+
}
|
|
1236
|
+
|
|
1237
|
+
template <>
|
|
1238
|
+
EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
|
|
1239
|
+
return reinterpret_cast<Packet4f>(
|
|
1240
|
+
vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
|
|
1241
|
+
}
|
|
1242
|
+
|
|
1243
|
+
template <>
|
|
1244
|
+
EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
|
|
1245
|
+
Packet4f b, sum;
|
|
1246
|
+
b = vec_sld(a, a, 8);
|
|
1247
|
+
sum = padd<Packet4f>(a, b);
|
|
1248
|
+
b = vec_sld(sum, sum, 4);
|
|
1249
|
+
sum = padd<Packet4f>(sum, b);
|
|
1250
|
+
return pfirst(sum);
|
|
1251
|
+
}
|
|
1252
|
+
|
|
1253
|
+
// Other reduction functions:
|
|
1254
|
+
// mul
|
|
1255
|
+
template <>
|
|
1256
|
+
EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
|
|
1257
|
+
Packet4f prod;
|
|
1258
|
+
prod = pmul(a, vec_sld(a, a, 8));
|
|
1259
|
+
return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
|
|
1260
|
+
}
|
|
1261
|
+
|
|
1262
|
+
// min
|
|
1263
|
+
template <>
|
|
1264
|
+
EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
|
|
1265
|
+
Packet4f b, res;
|
|
1266
|
+
b = pmin<Packet4f>(a, vec_sld(a, a, 8));
|
|
1267
|
+
res = pmin<Packet4f>(b, vec_sld(b, b, 4));
|
|
1268
|
+
return pfirst(res);
|
|
1269
|
+
}
|
|
1270
|
+
|
|
1271
|
+
// max
|
|
1272
|
+
template <>
|
|
1273
|
+
EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
|
|
1274
|
+
Packet4f b, res;
|
|
1275
|
+
b = pmax<Packet4f>(a, vec_sld(a, a, 8));
|
|
1276
|
+
res = pmax<Packet4f>(b, vec_sld(b, b, 4));
|
|
1277
|
+
return pfirst(res);
|
|
1278
|
+
}
|
|
1279
|
+
|
|
1280
|
+
// Rearranges the four packets of `kernel` in place with two rounds of
// vec_mergeh / vec_mergel interleaves (the 4x4 float transpose kernel).
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
  // Round 1: interleave rows 0/2 and rows 1/3. All four temporaries are
  // computed before any row of `kernel` is overwritten.
  const Packet4f rows02_hi = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  const Packet4f rows02_lo = vec_mergel(kernel.packet[0], kernel.packet[2]);
  const Packet4f rows13_hi = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  const Packet4f rows13_lo = vec_mergel(kernel.packet[1], kernel.packet[3]);

  // Round 2: interleave the intermediate results and write the rows back.
  kernel.packet[0] = vec_mergeh(rows02_hi, rows13_hi);
  kernel.packet[1] = vec_mergel(rows02_hi, rows13_hi);
  kernel.packet[2] = vec_mergeh(rows02_lo, rows13_lo);
  kernel.packet[3] = vec_mergel(rows02_lo, rows13_lo);
}
|
|
1290
|
+
|
|
1291
|
+
template <>
|
|
1292
|
+
EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
|
|
1293
|
+
const Packet4f& elsePacket) {
|
|
1294
|
+
Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
|
|
1295
|
+
Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
|
|
938
1296
|
return vec_sel(elsePacket, thenPacket, mask);
|
|
939
1297
|
}
|
|
940
1298
|
|
|
941
|
-
|
|
1299
|
+
#endif
|
|
1300
|
+
|
|
1301
|
+
template <>
|
|
1302
|
+
EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
|
|
1303
|
+
return pldexp_generic(a, exponent);
|
|
1304
|
+
}
|
|
1305
|
+
|
|
1306
|
+
// ldexp for double packets: scales each element of `a` by two raised to the
// matching element of `exponent` (after clamping), using four multiplications
// by powers of two that are built directly from their bit patterns.
template <>
EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
  // Clamp exponent to [-2099, 2099]
  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
  const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));

  // Split 2^e into four factors and multiply:
  // (presumably because 2^e for |e| up to 2099 is not representable as a
  // single double, while each of the four smaller factors is)
  const Packet2l bias = {1023, 1023};
  // NOTE(review): this is a logical (not arithmetic) shift, so for negative e
  // the value matches floor(e/4) only in its low bits; the left shifts by 52
  // below appear to discard the differing high bits. Confirm.
  Packet2l b = plogical_shift_right<2>(e);  // floor(e/4)
  // Adding the bias (1023) and shifting left by 52 places the value in the
  // exponent field of a double, so the reinterpreted bits form a power of two.
  Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
  Packet2d out = pmul(pmul(pmul(a, c), c), c);  // a * 2^(3b)
  b = psub(psub(psub(e, b), b), b);             // e - 3b
  c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));  // 2^(e - 3b)
  out = pmul(out, c);                           // a * 2^e
  return out;
}
|
|
1322
|
+
|
|
1323
|
+
// Prefetch hook for float data: passes `addr` to the EIGEN_ZVECTOR_PREFETCH
// macro and returns nothing. The macro is defined outside this section.
template <>
EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
  EIGEN_ZVECTOR_PREFETCH(addr);
}
|
|
1327
|
+
template <>
|
|
1328
|
+
EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
|
|
1329
|
+
return pload<Packet4f>(from);
|
|
1330
|
+
}
|
|
1331
|
+
// Unaligned float packet store: writes `from` to `to` by forwarding to the
// same pstore<float> routine used for the aligned case.
template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
  pstore<float>(to, from);
}
|
|
1335
|
+
template <>
|
|
1336
|
+
EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
|
|
1337
|
+
return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN);
|
|
1338
|
+
}
|
|
1339
|
+
|
|
1340
|
+
#if !defined(vec_float) || !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 13)
|
|
1341
|
+
#pragma GCC warning \
|
|
1342
|
+
"float->int and int->float conversion is simulated. compile for z15 for improved performance"
|
|
1343
|
+
template <>
|
|
1344
|
+
struct cast_impl<Packet4i, Packet4f> {
|
|
1345
|
+
EIGEN_DEVICE_FUNC static inline Packet4f run(const Packet4i& a) {
|
|
1346
|
+
return Packet4f{float(a[0]), float(a[1]), float(a[2]), float(a[3]) };
|
|
1347
|
+
}
|
|
1348
|
+
};
|
|
1349
|
+
|
|
1350
|
+
template <>
|
|
1351
|
+
struct cast_impl<Packet4f, Packet4i> {
|
|
1352
|
+
EIGEN_DEVICE_FUNC static inline Packet4i run(const Packet4f& a) {
|
|
1353
|
+
return Packet4i{int(a[0]), int(a[1]), int(a[2]), int(a[3]) };
|
|
1354
|
+
}
|
|
1355
|
+
};
|
|
1356
|
+
|
|
1357
|
+
template <>
|
|
1358
|
+
struct cast_impl<Packet2l, Packet2d> {
|
|
1359
|
+
EIGEN_DEVICE_FUNC static inline Packet2d run(const Packet2l& a) {
|
|
1360
|
+
return Packet2d{double(a[0]), double(a[1]) };
|
|
1361
|
+
}
|
|
1362
|
+
};
|
|
1363
|
+
|
|
1364
|
+
template <>
|
|
1365
|
+
struct cast_impl<Packet2d, Packet2l> {
|
|
1366
|
+
EIGEN_DEVICE_FUNC static inline Packet2l run(const Packet2d& a) {
|
|
1367
|
+
return Packet2l{(long long)(a[0]), (long long)(a[1]) };
|
|
1368
|
+
}
|
|
1369
|
+
};
|
|
1370
|
+
#else
|
|
1371
|
+
template <>
|
|
1372
|
+
struct cast_impl<Packet4i, Packet4f> {
|
|
1373
|
+
EIGEN_DEVICE_FUNC static inline Packet4f run(const Packet4i& a) {
|
|
1374
|
+
return vec_float(a);
|
|
1375
|
+
}
|
|
1376
|
+
};
|
|
1377
|
+
|
|
1378
|
+
template <>
|
|
1379
|
+
struct cast_impl<Packet4f, Packet4i> {
|
|
1380
|
+
EIGEN_DEVICE_FUNC static inline Packet4i run(const Packet4f& a) {
|
|
1381
|
+
return vec_signed(a);
|
|
1382
|
+
}
|
|
1383
|
+
};
|
|
1384
|
+
|
|
1385
|
+
template <>
|
|
1386
|
+
struct cast_impl<Packet2l, Packet2d> {
|
|
1387
|
+
EIGEN_DEVICE_FUNC static inline Packet2d run(const Packet2l& a) {
|
|
1388
|
+
return vec_double(a);
|
|
1389
|
+
}
|
|
1390
|
+
};
|
|
1391
|
+
|
|
1392
|
+
template <>
|
|
1393
|
+
struct cast_impl<Packet2d, Packet2l> {
|
|
1394
|
+
EIGEN_DEVICE_FUNC static inline Packet2l run(const Packet2d& a) {
|
|
1395
|
+
return vec_signed(a);
|
|
1396
|
+
}
|
|
1397
|
+
};
|
|
1398
|
+
#endif
|
|
1399
|
+
|
|
1400
|
+
template <>
|
|
1401
|
+
EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
|
|
1402
|
+
return pset1<Packet4f>(Eigen::numext::bit_cast<float>(from));
|
|
1403
|
+
}
|
|
1404
|
+
template <>
|
|
1405
|
+
EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
|
|
1406
|
+
return pset1<Packet2d>(Eigen::numext::bit_cast<double>(from));
|
|
1407
|
+
}
|
|
1408
|
+
|
|
1409
|
+
} // end namespace internal
|
|
942
1410
|
|
|
943
|
-
}
|
|
1411
|
+
} // end namespace Eigen
|
|
944
1412
|
|
|
945
|
-
#endif
|
|
1413
|
+
#endif // EIGEN_PACKET_MATH_ZVECTOR_H
|