@smake/eigen 1.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (435) hide show
  1. package/README.md +1 -1
  2. package/eigen/Eigen/AccelerateSupport +52 -0
  3. package/eigen/Eigen/Cholesky +18 -21
  4. package/eigen/Eigen/CholmodSupport +28 -28
  5. package/eigen/Eigen/Core +235 -326
  6. package/eigen/Eigen/Eigenvalues +16 -14
  7. package/eigen/Eigen/Geometry +21 -24
  8. package/eigen/Eigen/Householder +9 -8
  9. package/eigen/Eigen/IterativeLinearSolvers +8 -4
  10. package/eigen/Eigen/Jacobi +14 -14
  11. package/eigen/Eigen/KLUSupport +43 -0
  12. package/eigen/Eigen/LU +16 -20
  13. package/eigen/Eigen/MetisSupport +12 -12
  14. package/eigen/Eigen/OrderingMethods +54 -54
  15. package/eigen/Eigen/PaStiXSupport +23 -20
  16. package/eigen/Eigen/PardisoSupport +17 -14
  17. package/eigen/Eigen/QR +18 -21
  18. package/eigen/Eigen/QtAlignedMalloc +5 -13
  19. package/eigen/Eigen/SPQRSupport +21 -14
  20. package/eigen/Eigen/SVD +23 -18
  21. package/eigen/Eigen/Sparse +1 -4
  22. package/eigen/Eigen/SparseCholesky +18 -23
  23. package/eigen/Eigen/SparseCore +18 -17
  24. package/eigen/Eigen/SparseLU +12 -8
  25. package/eigen/Eigen/SparseQR +16 -14
  26. package/eigen/Eigen/StdDeque +5 -2
  27. package/eigen/Eigen/StdList +5 -2
  28. package/eigen/Eigen/StdVector +5 -2
  29. package/eigen/Eigen/SuperLUSupport +30 -24
  30. package/eigen/Eigen/ThreadPool +80 -0
  31. package/eigen/Eigen/UmfPackSupport +19 -17
  32. package/eigen/Eigen/Version +14 -0
  33. package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
  34. package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
  35. package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
  36. package/eigen/Eigen/src/Cholesky/LDLT.h +377 -401
  37. package/eigen/Eigen/src/Cholesky/LLT.h +332 -360
  38. package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
  39. package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +620 -521
  40. package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
  41. package/eigen/Eigen/src/Core/ArithmeticSequence.h +239 -0
  42. package/eigen/Eigen/src/Core/Array.h +341 -294
  43. package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
  44. package/eigen/Eigen/src/Core/ArrayWrapper.h +127 -171
  45. package/eigen/Eigen/src/Core/Assign.h +30 -40
  46. package/eigen/Eigen/src/Core/AssignEvaluator.h +711 -589
  47. package/eigen/Eigen/src/Core/Assign_MKL.h +130 -125
  48. package/eigen/Eigen/src/Core/BandMatrix.h +268 -283
  49. package/eigen/Eigen/src/Core/Block.h +375 -398
  50. package/eigen/Eigen/src/Core/CommaInitializer.h +86 -97
  51. package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
  52. package/eigen/Eigen/src/Core/CoreEvaluators.h +1356 -1026
  53. package/eigen/Eigen/src/Core/CoreIterators.h +73 -59
  54. package/eigen/Eigen/src/Core/CwiseBinaryOp.h +114 -132
  55. package/eigen/Eigen/src/Core/CwiseNullaryOp.h +726 -617
  56. package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
  57. package/eigen/Eigen/src/Core/CwiseUnaryOp.h +56 -68
  58. package/eigen/Eigen/src/Core/CwiseUnaryView.h +132 -95
  59. package/eigen/Eigen/src/Core/DenseBase.h +632 -571
  60. package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -624
  61. package/eigen/Eigen/src/Core/DenseStorage.h +512 -509
  62. package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
  63. package/eigen/Eigen/src/Core/Diagonal.h +169 -210
  64. package/eigen/Eigen/src/Core/DiagonalMatrix.h +351 -274
  65. package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
  66. package/eigen/Eigen/src/Core/Dot.h +172 -222
  67. package/eigen/Eigen/src/Core/EigenBase.h +75 -85
  68. package/eigen/Eigen/src/Core/Fill.h +138 -0
  69. package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
  70. package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -109
  71. package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
  72. package/eigen/Eigen/src/Core/GeneralProduct.h +327 -263
  73. package/eigen/Eigen/src/Core/GenericPacketMath.h +1472 -360
  74. package/eigen/Eigen/src/Core/GlobalFunctions.h +194 -151
  75. package/eigen/Eigen/src/Core/IO.h +147 -139
  76. package/eigen/Eigen/src/Core/IndexedView.h +321 -0
  77. package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
  78. package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
  79. package/eigen/Eigen/src/Core/Inverse.h +56 -66
  80. package/eigen/Eigen/src/Core/Map.h +124 -142
  81. package/eigen/Eigen/src/Core/MapBase.h +256 -281
  82. package/eigen/Eigen/src/Core/MathFunctions.h +1620 -938
  83. package/eigen/Eigen/src/Core/MathFunctionsImpl.h +233 -71
  84. package/eigen/Eigen/src/Core/Matrix.h +491 -416
  85. package/eigen/Eigen/src/Core/MatrixBase.h +468 -453
  86. package/eigen/Eigen/src/Core/NestByValue.h +66 -85
  87. package/eigen/Eigen/src/Core/NoAlias.h +79 -85
  88. package/eigen/Eigen/src/Core/NumTraits.h +235 -148
  89. package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +253 -0
  90. package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
  91. package/eigen/Eigen/src/Core/PlainObjectBase.h +871 -894
  92. package/eigen/Eigen/src/Core/Product.h +260 -139
  93. package/eigen/Eigen/src/Core/ProductEvaluators.h +863 -714
  94. package/eigen/Eigen/src/Core/Random.h +161 -136
  95. package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
  96. package/eigen/Eigen/src/Core/RealView.h +250 -0
  97. package/eigen/Eigen/src/Core/Redux.h +366 -336
  98. package/eigen/Eigen/src/Core/Ref.h +308 -209
  99. package/eigen/Eigen/src/Core/Replicate.h +94 -106
  100. package/eigen/Eigen/src/Core/Reshaped.h +398 -0
  101. package/eigen/Eigen/src/Core/ReturnByValue.h +49 -55
  102. package/eigen/Eigen/src/Core/Reverse.h +136 -145
  103. package/eigen/Eigen/src/Core/Select.h +70 -140
  104. package/eigen/Eigen/src/Core/SelfAdjointView.h +262 -285
  105. package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
  106. package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
  107. package/eigen/Eigen/src/Core/Solve.h +97 -111
  108. package/eigen/Eigen/src/Core/SolveTriangular.h +131 -129
  109. package/eigen/Eigen/src/Core/SolverBase.h +138 -101
  110. package/eigen/Eigen/src/Core/StableNorm.h +156 -160
  111. package/eigen/Eigen/src/Core/StlIterators.h +619 -0
  112. package/eigen/Eigen/src/Core/Stride.h +91 -88
  113. package/eigen/Eigen/src/Core/Swap.h +70 -38
  114. package/eigen/Eigen/src/Core/Transpose.h +295 -273
  115. package/eigen/Eigen/src/Core/Transpositions.h +272 -317
  116. package/eigen/Eigen/src/Core/TriangularMatrix.h +670 -755
  117. package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
  118. package/eigen/Eigen/src/Core/VectorwiseOp.h +668 -630
  119. package/eigen/Eigen/src/Core/Visitor.h +480 -216
  120. package/eigen/Eigen/src/Core/arch/AVX/Complex.h +407 -293
  121. package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +79 -388
  122. package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2935 -491
  123. package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
  124. package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +279 -22
  125. package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +472 -0
  126. package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
  127. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +85 -333
  128. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
  129. package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +2490 -649
  130. package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
  131. package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
  132. package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
  133. package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
  134. package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +277 -0
  135. package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
  136. package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +521 -298
  137. package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +39 -280
  138. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +3686 -0
  139. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +205 -0
  140. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +901 -0
  141. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
  142. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
  143. package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +3391 -723
  144. package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
  145. package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +866 -0
  146. package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +113 -14
  147. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +2634 -0
  148. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +227 -0
  149. package/eigen/Eigen/src/Core/arch/Default/Half.h +1091 -0
  150. package/eigen/Eigen/src/Core/arch/Default/Settings.h +11 -13
  151. package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
  152. package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +104 -0
  153. package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1712 -0
  154. package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
  155. package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +77 -0
  156. package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  157. package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
  158. package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
  159. package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
  160. package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
  161. package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
  162. package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
  163. package/eigen/Eigen/src/Core/arch/MSA/Complex.h +620 -0
  164. package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +379 -0
  165. package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1237 -0
  166. package/eigen/Eigen/src/Core/arch/NEON/Complex.h +531 -289
  167. package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +243 -0
  168. package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +50 -73
  169. package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +5915 -579
  170. package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1642 -0
  171. package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
  172. package/eigen/Eigen/src/Core/arch/SSE/Complex.h +366 -334
  173. package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +40 -514
  174. package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +2164 -675
  175. package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
  176. package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +188 -35
  177. package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +48 -0
  178. package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +674 -0
  179. package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +52 -0
  180. package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +227 -0
  181. package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +303 -0
  182. package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +576 -0
  183. package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +83 -0
  184. package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +434 -261
  185. package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +160 -53
  186. package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +1073 -605
  187. package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +123 -117
  188. package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +594 -322
  189. package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +204 -118
  190. package/eigen/Eigen/src/Core/functors/StlFunctors.h +110 -97
  191. package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
  192. package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1158 -530
  193. package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2329 -1333
  194. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +328 -364
  195. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +191 -178
  196. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +85 -82
  197. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
  198. package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +396 -542
  199. package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
  200. package/eigen/Eigen/src/Core/products/Parallelizer.h +208 -92
  201. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +331 -375
  202. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
  203. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +139 -146
  204. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
  205. package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
  206. package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -46
  207. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
  208. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
  209. package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
  210. package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
  211. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -275
  212. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
  213. package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +70 -93
  214. package/eigen/Eigen/src/Core/util/Assert.h +158 -0
  215. package/eigen/Eigen/src/Core/util/BlasUtil.h +413 -290
  216. package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +543 -0
  217. package/eigen/Eigen/src/Core/util/Constants.h +314 -263
  218. package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -78
  219. package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
  220. package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +450 -224
  221. package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
  222. package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
  223. package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +487 -0
  224. package/eigen/Eigen/src/Core/util/IntegralConstant.h +279 -0
  225. package/eigen/Eigen/src/Core/util/MKL_support.h +39 -30
  226. package/eigen/Eigen/src/Core/util/Macros.h +939 -646
  227. package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
  228. package/eigen/Eigen/src/Core/util/Memory.h +1042 -650
  229. package/eigen/Eigen/src/Core/util/Meta.h +618 -426
  230. package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
  231. package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
  232. package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  233. package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
  234. package/eigen/Eigen/src/Core/util/StaticAssert.h +51 -164
  235. package/eigen/Eigen/src/Core/util/SymbolicIndex.h +445 -0
  236. package/eigen/Eigen/src/Core/util/XprHelper.h +793 -538
  237. package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
  238. package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
  239. package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
  240. package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
  241. package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
  242. package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
  243. package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
  244. package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
  245. package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +91 -107
  246. package/eigen/Eigen/src/Eigenvalues/RealQZ.h +539 -606
  247. package/eigen/Eigen/src/Eigenvalues/RealSchur.h +348 -382
  248. package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
  249. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +579 -600
  250. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
  251. package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +434 -461
  252. package/eigen/Eigen/src/Geometry/AlignedBox.h +307 -214
  253. package/eigen/Eigen/src/Geometry/AngleAxis.h +135 -137
  254. package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
  255. package/eigen/Eigen/src/Geometry/Homogeneous.h +289 -333
  256. package/eigen/Eigen/src/Geometry/Hyperplane.h +152 -161
  257. package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
  258. package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -145
  259. package/eigen/Eigen/src/Geometry/ParametrizedLine.h +141 -104
  260. package/eigen/Eigen/src/Geometry/Quaternion.h +595 -497
  261. package/eigen/Eigen/src/Geometry/Rotation2D.h +110 -108
  262. package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
  263. package/eigen/Eigen/src/Geometry/Scaling.h +115 -90
  264. package/eigen/Eigen/src/Geometry/Transform.h +896 -953
  265. package/eigen/Eigen/src/Geometry/Translation.h +100 -98
  266. package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
  267. package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +154 -0
  268. package/eigen/Eigen/src/Householder/BlockHouseholder.h +54 -42
  269. package/eigen/Eigen/src/Householder/Householder.h +104 -122
  270. package/eigen/Eigen/src/Householder/HouseholderSequence.h +416 -382
  271. package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
  272. package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +153 -166
  273. package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +127 -138
  274. package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +95 -124
  275. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +269 -267
  276. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +246 -259
  277. package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
  278. package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +218 -217
  279. package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +80 -103
  280. package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +59 -63
  281. package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
  282. package/eigen/Eigen/src/Jacobi/Jacobi.h +256 -291
  283. package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
  284. package/eigen/Eigen/src/KLUSupport/KLUSupport.h +339 -0
  285. package/eigen/Eigen/src/LU/Determinant.h +60 -63
  286. package/eigen/Eigen/src/LU/FullPivLU.h +561 -626
  287. package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
  288. package/eigen/Eigen/src/LU/InverseImpl.h +213 -275
  289. package/eigen/Eigen/src/LU/PartialPivLU.h +407 -435
  290. package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
  291. package/eigen/Eigen/src/LU/arch/InverseSize4.h +353 -0
  292. package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
  293. package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
  294. package/eigen/Eigen/src/OrderingMethods/Amd.h +250 -282
  295. package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +950 -1103
  296. package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
  297. package/eigen/Eigen/src/OrderingMethods/Ordering.h +111 -122
  298. package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
  299. package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
  300. package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
  301. package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -429
  302. package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +494 -473
  303. package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
  304. package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +223 -137
  305. package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +517 -460
  306. package/eigen/Eigen/src/QR/HouseholderQR.h +412 -278
  307. package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
  308. package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
  309. package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
  310. package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +263 -261
  311. package/eigen/Eigen/src/SVD/BDCSVD.h +872 -679
  312. package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
  313. package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
  314. package/eigen/Eigen/src/SVD/JacobiSVD.h +585 -543
  315. package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
  316. package/eigen/Eigen/src/SVD/SVDBase.h +281 -160
  317. package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +202 -237
  318. package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
  319. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +769 -590
  320. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +318 -129
  321. package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
  322. package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -236
  323. package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +140 -184
  324. package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
  325. package/eigen/Eigen/src/SparseCore/SparseAssign.h +174 -111
  326. package/eigen/Eigen/src/SparseCore/SparseBlock.h +408 -477
  327. package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
  328. package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +531 -280
  329. package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +559 -347
  330. package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
  331. package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +185 -191
  332. package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
  333. package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
  334. package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
  335. package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
  336. package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1614 -1142
  337. package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -357
  338. package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
  339. package/eigen/Eigen/src/SparseCore/SparseProduct.h +100 -91
  340. package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
  341. package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
  342. package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +371 -414
  343. package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
  344. package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
  345. package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
  346. package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
  347. package/eigen/Eigen/src/SparseCore/SparseUtil.h +146 -115
  348. package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
  349. package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
  350. package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
  351. package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
  352. package/eigen/Eigen/src/SparseLU/SparseLU.h +814 -618
  353. package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
  354. package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
  355. package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
  356. package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +273 -255
  357. package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
  358. package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
  359. package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +90 -101
  360. package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
  361. package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
  362. package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
  363. package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +125 -133
  364. package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
  365. package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
  366. package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
  367. package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
  368. package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
  369. package/eigen/Eigen/src/SparseQR/SparseQR.h +451 -490
  370. package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -105
  371. package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
  372. package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
  373. package/eigen/Eigen/src/StlSupport/details.h +48 -50
  374. package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
  375. package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -732
  376. package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
  377. package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
  378. package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
  379. package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
  380. package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
  381. package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
  382. package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
  383. package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
  384. package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
  385. package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
  386. package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
  387. package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
  388. package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
  389. package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +480 -380
  390. package/eigen/Eigen/src/misc/Image.h +41 -43
  391. package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
  392. package/eigen/Eigen/src/misc/Kernel.h +39 -41
  393. package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
  394. package/eigen/Eigen/src/misc/blas.h +83 -426
  395. package/eigen/Eigen/src/misc/lapacke.h +9976 -16182
  396. package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
  397. package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
  398. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
  399. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
  400. package/eigen/Eigen/src/plugins/BlockMethods.inc +1370 -0
  401. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
  402. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.inc +167 -0
  403. package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
  404. package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
  405. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
  406. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
  407. package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
  408. package/lib/LibEigen.d.ts +4 -0
  409. package/lib/LibEigen.js +14 -0
  410. package/lib/index.d.ts +1 -1
  411. package/lib/index.js +7 -3
  412. package/package.json +2 -10
  413. package/eigen/Eigen/CMakeLists.txt +0 -19
  414. package/eigen/Eigen/src/Core/BooleanRedux.h +0 -164
  415. package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -103
  416. package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
  417. package/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +0 -91
  418. package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
  419. package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
  420. package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
  421. package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
  422. package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
  423. package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
  424. package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
  425. package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
  426. package/eigen/Eigen/src/misc/lapack.h +0 -152
  427. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -332
  428. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -552
  429. package/eigen/Eigen/src/plugins/BlockMethods.h +0 -1058
  430. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
  431. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -163
  432. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
  433. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -85
  434. package/lib/eigen.d.ts +0 -2
  435. package/lib/eigen.js +0 -15
@@ -10,6 +10,9 @@
10
10
  #ifndef EIGEN_PACKET_MATH_ALTIVEC_H
11
11
  #define EIGEN_PACKET_MATH_ALTIVEC_H
12
12
 
13
+ // IWYU pragma: private
14
+ #include "../../InternalHeaderCheck.h"
15
+
13
16
  namespace Eigen {
14
17
 
15
18
  namespace internal {
@@ -22,135 +25,209 @@ namespace internal {
22
25
  #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
23
26
  #endif
24
27
 
25
- #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
26
- #define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
27
- #endif
28
-
29
28
  // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
30
29
  #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
31
- #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
30
+ #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
32
31
  #endif
33
32
 
34
- typedef __vector float Packet4f;
35
- typedef __vector int Packet4i;
36
- typedef __vector unsigned int Packet4ui;
37
- typedef __vector __bool int Packet4bi;
38
- typedef __vector short int Packet8i;
39
- typedef __vector unsigned char Packet16uc;
33
+ typedef __vector float Packet4f;
34
+ typedef __vector int Packet4i;
35
+ typedef __vector unsigned int Packet4ui;
36
+ typedef __vector __bool int Packet4bi;
37
+ typedef __vector short int Packet8s;
38
+ typedef __vector unsigned short int Packet8us;
39
+ typedef __vector __bool short Packet8bi;
40
+ typedef __vector signed char Packet16c;
41
+ typedef __vector unsigned char Packet16uc;
42
+ typedef eigen_packet_wrapper<__vector unsigned short int, 0> Packet8bf;
40
43
 
41
44
  // We don't want to write the same code all the time, but we need to reuse the constants
42
45
  // and it doesn't really work to declare them global, so we define macros instead
46
+ #define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME, X) Packet4f p4f_##NAME = {X, X, X, X}
47
+
48
+ #define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME, X) Packet4i p4i_##NAME = vec_splat_s32(X)
43
49
 
44
- #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
45
- Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
50
+ #define EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME, X) Packet4ui p4ui_##NAME = {X, X, X, X}
46
51
 
47
- #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
48
- Packet4i p4i_##NAME = vec_splat_s32(X)
52
+ #define EIGEN_DECLARE_CONST_FAST_Packet8us(NAME, X) Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}
49
53
 
50
- #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
51
- Packet4f p4f_##NAME = pset1<Packet4f>(X)
54
+ #define EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME, X) \
55
+ Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}
52
56
 
53
- #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
54
- Packet4i p4i_##NAME = pset1<Packet4i>(X)
57
+ #define EIGEN_DECLARE_CONST_Packet4f(NAME, X) Packet4f p4f_##NAME = pset1<Packet4f>(X)
55
58
 
56
- #define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
57
- Packet2d p2d_##NAME = pset1<Packet2d>(X)
59
+ #define EIGEN_DECLARE_CONST_Packet4i(NAME, X) Packet4i p4i_##NAME = pset1<Packet4i>(X)
58
60
 
59
- #define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
60
- Packet2l p2l_##NAME = pset1<Packet2l>(X)
61
+ #define EIGEN_DECLARE_CONST_Packet2d(NAME, X) Packet2d p2d_##NAME = pset1<Packet2d>(X)
61
62
 
62
- #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
63
+ #define EIGEN_DECLARE_CONST_Packet2l(NAME, X) Packet2l p2l_##NAME = pset1<Packet2l>(X)
64
+
65
+ #define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
63
66
  const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
64
67
 
65
68
  #define DST_CHAN 1
66
69
  #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
67
-
70
+ #define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type
68
71
 
69
72
  // These constants are endian-agnostic
70
- static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
71
- static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
72
- static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
73
- static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
74
- static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
75
- static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
73
+ static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
74
+ static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
75
+ static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
76
+ static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16, -16); //{ -16, -16, -16, -16}
77
+ static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1, -1); //{ -1, -1, -1, -1}
78
+ static EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u);
79
+ static EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
80
+ static EIGEN_DECLARE_CONST_FAST_Packet8us(ONE, 1); //{ 1, 1, 1, 1, 1, 1, 1, 1}
81
+ static Packet4f p4f_MZERO =
82
+ (Packet4f)vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
76
83
  #ifndef __VSX__
77
- static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
84
+ static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
78
85
  #endif
79
86
 
80
- static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
81
- static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
87
+ static Packet4f p4f_COUNTDOWN = {0.0, 1.0, 2.0, 3.0};
88
+ static Packet4i p4i_COUNTDOWN = {0, 1, 2, 3};
89
+ static Packet8s p8s_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7};
90
+ static Packet8us p8us_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7};
82
91
 
83
- static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
84
- static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
92
+ static Packet16c p16c_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
93
+ static Packet16uc p16uc_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
85
94
 
86
- // Mask alignment
87
- #ifdef __PPC64__
88
- #define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
89
- #else
90
- #define _EIGEN_MASK_ALIGNMENT 0xfffffff0
95
+ static Packet16uc p16uc_REVERSE32 = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
96
+ static Packet16uc p16uc_REVERSE16 = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
97
+ static Packet16uc p16uc_REVERSE8 = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
98
+
99
+ #ifdef _BIG_ENDIAN
100
+ static Packet16uc p16uc_DUPLICATE32_HI = {0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7};
91
101
  #endif
102
+ static const Packet16uc p16uc_DUPLICATE16_EVEN = {0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13};
103
+ static const Packet16uc p16uc_DUPLICATE16_ODD = {2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15};
104
+
105
+ static Packet16uc p16uc_QUADRUPLICATE16_HI = {0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3};
106
+ static Packet16uc p16uc_QUADRUPLICATE16 = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3};
92
107
 
93
- #define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
108
+ static Packet16uc p16uc_MERGEE16 = {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
109
+ static Packet16uc p16uc_MERGEO16 = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
110
+ #ifdef _BIG_ENDIAN
111
+ static Packet16uc p16uc_MERGEH16 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
112
+ #else
113
+ static Packet16uc p16uc_MERGEL16 = {2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
114
+ #endif
94
115
 
95
116
  // Handle endianness properly while loading constants
96
117
  // Define global static constants:
97
118
  #ifdef _BIG_ENDIAN
98
119
  static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
99
- #ifdef __VSX__
100
- static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
101
- #endif
102
- static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
103
- static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
104
- static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
120
+ static Packet16uc p16uc_PSET32_WODD =
121
+ vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
122
+ 8); //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
123
+ static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
124
+ 8); //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
125
+ static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc)vec_abs(p4i_MINUS16), 3),
126
+ 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
105
127
  #else
106
128
  static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
107
- static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
108
- static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
109
- static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
110
- static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
111
- #endif // _BIG_ENDIAN
112
-
113
- static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
114
- static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
115
- static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16; //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
116
- static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16; //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
117
-
118
- static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
129
+ static Packet16uc p16uc_PSET32_WODD =
130
+ vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
131
+ 8); //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
132
+ static Packet16uc p16uc_PSET32_WEVEN =
133
+ vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
134
+ 8); //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
135
+ static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc)vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO,
136
+ 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
137
+ #endif // _BIG_ENDIAN
138
+
139
+ static Packet16uc p16uc_PSET64_HI = (Packet16uc)vec_mergeh(
140
+ (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
141
+ static Packet16uc p16uc_PSET64_LO = (Packet16uc)vec_mergel(
142
+ (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
143
+ static Packet16uc p16uc_TRANSPOSE64_HI =
144
+ p16uc_PSET64_HI + p16uc_HALF64_0_16; //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
145
+ static Packet16uc p16uc_TRANSPOSE64_LO =
146
+ p16uc_PSET64_LO + p16uc_HALF64_0_16; //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
147
+
148
+ static Packet16uc p16uc_COMPLEX32_REV =
149
+ vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
119
150
 
120
- #ifdef _BIG_ENDIAN
121
- static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
151
+ #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
152
+ #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
122
153
  #else
123
- static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
124
- #endif // _BIG_ENDIAN
154
+ #define EIGEN_PPC_PREFETCH(ADDR) asm(" dcbt [%[addr]]\n" ::[addr] "r"(ADDR) : "cc");
155
+ #endif
125
156
 
126
- #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
127
- #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
157
+ #if EIGEN_COMP_LLVM
158
+ #define LOAD_STORE_UNROLL_16 _Pragma("unroll 16")
128
159
  #else
129
- #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
160
+ #define LOAD_STORE_UNROLL_16 _Pragma("GCC unroll(16)")
130
161
  #endif
131
162
 
132
- template<> struct packet_traits<float> : default_packet_traits
133
- {
163
+ template <>
164
+ struct packet_traits<float> : default_packet_traits {
134
165
  typedef Packet4f type;
135
166
  typedef Packet4f half;
136
167
  enum {
137
168
  Vectorizable = 1,
138
169
  AlignedOnScalar = 1,
139
- size=4,
140
- HasHalfPacket = 1,
141
-
142
- HasAdd = 1,
143
- HasSub = 1,
144
- HasMul = 1,
145
- HasDiv = 1,
146
- HasMin = 1,
147
- HasMax = 1,
148
- HasAbs = 1,
149
- HasSin = 0,
150
- HasCos = 0,
151
- HasLog = 0,
152
- HasExp = 1,
153
- #ifdef __VSX__
170
+ size = 4,
171
+
172
+ HasAdd = 1,
173
+ HasSub = 1,
174
+ HasMul = 1,
175
+ HasDiv = 1,
176
+ HasMin = 1,
177
+ HasMax = 1,
178
+ HasAbs = 1,
179
+ HasSin = EIGEN_FAST_MATH,
180
+ HasCos = EIGEN_FAST_MATH,
181
+ HasACos = 1,
182
+ HasASin = 1,
183
+ HasATan = 1,
184
+ HasATanh = 1,
185
+ HasLog = 1,
186
+ HasExp = 1,
187
+ #ifdef EIGEN_VECTORIZE_VSX
188
+ HasCmp = 1,
189
+ HasPow = 1,
190
+ HasSqrt = 1,
191
+ HasCbrt = 1,
192
+ #if !EIGEN_COMP_CLANG
193
+ HasRsqrt = 1,
194
+ #else
195
+ HasRsqrt = 0,
196
+ #endif
197
+ HasTanh = EIGEN_FAST_MATH,
198
+ HasErf = EIGEN_FAST_MATH,
199
+ HasErfc = EIGEN_FAST_MATH,
200
+ #else
201
+ HasSqrt = 0,
202
+ HasRsqrt = 0,
203
+ HasTanh = 0,
204
+ HasErf = 0,
205
+ #endif
206
+ HasNegate = 1,
207
+ HasBlend = 1
208
+ };
209
+ };
210
+ template <>
211
+ struct packet_traits<bfloat16> : default_packet_traits {
212
+ typedef Packet8bf type;
213
+ typedef Packet8bf half;
214
+ enum {
215
+ Vectorizable = 1,
216
+ AlignedOnScalar = 1,
217
+ size = 8,
218
+
219
+ HasAdd = 1,
220
+ HasSub = 1,
221
+ HasMul = 1,
222
+ HasDiv = 1,
223
+ HasMin = 1,
224
+ HasMax = 1,
225
+ HasAbs = 1,
226
+ HasSin = EIGEN_FAST_MATH,
227
+ HasCos = EIGEN_FAST_MATH,
228
+ HasLog = 1,
229
+ HasExp = 1,
230
+ #ifdef EIGEN_VECTORIZE_VSX
154
231
  HasSqrt = 1,
155
232
  #if !EIGEN_COMP_CLANG
156
233
  HasRsqrt = 1,
@@ -161,578 +238,2565 @@ template<> struct packet_traits<float> : default_packet_traits
161
238
  HasSqrt = 0,
162
239
  HasRsqrt = 0,
163
240
  #endif
164
- HasRound = 1,
165
- HasFloor = 1,
166
- HasCeil = 1,
241
+ HasTanh = 0,
242
+ HasErf = 0,
167
243
  HasNegate = 1,
168
244
  HasBlend = 1
169
245
  };
170
246
  };
171
- template<> struct packet_traits<int> : default_packet_traits
172
- {
247
+
248
+ template <>
249
+ struct packet_traits<int> : default_packet_traits {
173
250
  typedef Packet4i type;
174
251
  typedef Packet4i half;
175
252
  enum {
176
253
  Vectorizable = 1,
177
254
  AlignedOnScalar = 1,
178
255
  size = 4,
179
- HasHalfPacket = 0,
180
256
 
181
- HasAdd = 1,
182
- HasSub = 1,
183
- HasMul = 1,
184
- HasDiv = 0,
185
- HasBlend = 1
257
+ HasAdd = 1,
258
+ HasSub = 1,
259
+ HasShift = 1,
260
+ HasMul = 1,
261
+ #if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
262
+ HasDiv = 1,
263
+ #else
264
+ HasDiv = 0,
265
+ #endif
266
+ HasBlend = 1,
267
+ HasCmp = 1
268
+ };
269
+ };
270
+
271
+ template <>
272
+ struct packet_traits<short int> : default_packet_traits {
273
+ typedef Packet8s type;
274
+ typedef Packet8s half;
275
+ enum {
276
+ Vectorizable = 1,
277
+ AlignedOnScalar = 1,
278
+ size = 8,
279
+
280
+ HasAdd = 1,
281
+ HasSub = 1,
282
+ HasMul = 1,
283
+ HasDiv = 0,
284
+ HasBlend = 1,
285
+ HasCmp = 1
186
286
  };
187
287
  };
188
288
 
289
+ template <>
290
+ struct packet_traits<unsigned short int> : default_packet_traits {
291
+ typedef Packet8us type;
292
+ typedef Packet8us half;
293
+ enum {
294
+ Vectorizable = 1,
295
+ AlignedOnScalar = 1,
296
+ size = 8,
297
+
298
+ HasAdd = 1,
299
+ HasSub = 1,
300
+ HasMul = 1,
301
+ HasDiv = 0,
302
+ HasBlend = 1,
303
+ HasCmp = 1
304
+ };
305
+ };
189
306
 
190
- template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
191
- template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
307
+ template <>
308
+ struct packet_traits<signed char> : default_packet_traits {
309
+ typedef Packet16c type;
310
+ typedef Packet16c half;
311
+ enum {
312
+ Vectorizable = 1,
313
+ AlignedOnScalar = 1,
314
+ size = 16,
315
+
316
+ HasAdd = 1,
317
+ HasSub = 1,
318
+ HasMul = 1,
319
+ HasDiv = 0,
320
+ HasBlend = 1,
321
+ HasCmp = 1
322
+ };
323
+ };
192
324
 
193
- inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
194
- {
195
- union {
196
- Packet16uc v;
197
- unsigned char n[16];
198
- } vt;
199
- vt.v = v;
200
- for (int i=0; i< 16; i++)
201
- s << (int)vt.n[i] << ", ";
202
- return s;
325
+ template <>
326
+ struct packet_traits<unsigned char> : default_packet_traits {
327
+ typedef Packet16uc type;
328
+ typedef Packet16uc half;
329
+ enum {
330
+ Vectorizable = 1,
331
+ AlignedOnScalar = 1,
332
+ size = 16,
333
+
334
+ HasAdd = 1,
335
+ HasSub = 1,
336
+ HasMul = 1,
337
+ HasDiv = 0,
338
+ HasBlend = 1,
339
+ HasCmp = 1
340
+ };
341
+ };
342
+
343
+ template <>
344
+ struct unpacket_traits<Packet4f> {
345
+ typedef float type;
346
+ typedef Packet4f half;
347
+ typedef Packet4i integer_packet;
348
+ enum {
349
+ size = 4,
350
+ alignment = Aligned16,
351
+ vectorizable = true,
352
+ masked_load_available = false,
353
+ masked_store_available = false
354
+ };
355
+ };
356
+ template <>
357
+ struct unpacket_traits<Packet4i> {
358
+ typedef int type;
359
+ typedef Packet4i half;
360
+ enum {
361
+ size = 4,
362
+ alignment = Aligned16,
363
+ vectorizable = true,
364
+ masked_load_available = false,
365
+ masked_store_available = false
366
+ };
367
+ };
368
+ template <>
369
+ struct unpacket_traits<Packet8s> {
370
+ typedef short int type;
371
+ typedef Packet8s half;
372
+ enum {
373
+ size = 8,
374
+ alignment = Aligned16,
375
+ vectorizable = true,
376
+ masked_load_available = false,
377
+ masked_store_available = false
378
+ };
379
+ };
380
+ template <>
381
+ struct unpacket_traits<Packet8us> {
382
+ typedef unsigned short int type;
383
+ typedef Packet8us half;
384
+ enum {
385
+ size = 8,
386
+ alignment = Aligned16,
387
+ vectorizable = true,
388
+ masked_load_available = false,
389
+ masked_store_available = false
390
+ };
391
+ };
392
+
393
+ template <>
394
+ struct unpacket_traits<Packet16c> {
395
+ typedef signed char type;
396
+ typedef Packet16c half;
397
+ enum {
398
+ size = 16,
399
+ alignment = Aligned16,
400
+ vectorizable = true,
401
+ masked_load_available = false,
402
+ masked_store_available = false
403
+ };
404
+ };
405
+ template <>
406
+ struct unpacket_traits<Packet16uc> {
407
+ typedef unsigned char type;
408
+ typedef Packet16uc half;
409
+ enum {
410
+ size = 16,
411
+ alignment = Aligned16,
412
+ vectorizable = true,
413
+ masked_load_available = false,
414
+ masked_store_available = false
415
+ };
416
+ };
417
+
418
+ template <>
419
+ struct unpacket_traits<Packet8bf> {
420
+ typedef bfloat16 type;
421
+ typedef Packet8bf half;
422
+ enum {
423
+ size = 8,
424
+ alignment = Aligned16,
425
+ vectorizable = true,
426
+ masked_load_available = false,
427
+ masked_store_available = false
428
+ };
429
+ };
430
+
431
+ template <typename Packet>
432
+ EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet) * from) {
433
+ // some versions of GCC throw "unused-but-set-parameter".
434
+ // ignoring these warnings for now.
435
+ EIGEN_UNUSED_VARIABLE(from);
436
+ EIGEN_DEBUG_ALIGNED_LOAD
437
+ #ifdef EIGEN_VECTORIZE_VSX
438
+ return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
439
+ #else
440
+ return vec_ld(0, from);
441
+ #endif
203
442
  }
204
443
 
205
- inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
206
- {
207
- union {
208
- Packet4f v;
209
- float n[4];
210
- } vt;
211
- vt.v = v;
212
- s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
213
- return s;
444
+ // Need to define them first or we get specialization after instantiation errors
445
+ template <>
446
+ EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
447
+ return pload_common<Packet4f>(from);
214
448
  }
215
449
 
216
- inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
217
- {
218
- union {
219
- Packet4i v;
220
- int n[4];
221
- } vt;
222
- vt.v = v;
223
- s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
224
- return s;
450
+ template <>
451
+ EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
452
+ return pload_common<Packet4i>(from);
225
453
  }
226
454
 
227
- inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
228
- {
229
- union {
230
- Packet4ui v;
231
- unsigned int n[4];
232
- } vt;
233
- vt.v = v;
234
- s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
235
- return s;
455
+ template <>
456
+ EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from) {
457
+ return pload_common<Packet8s>(from);
236
458
  }
237
459
 
238
- // Need to define them first or we get specialization after instantiation errors
239
- template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
240
- {
460
+ template <>
461
+ EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from) {
462
+ return pload_common<Packet8us>(from);
463
+ }
464
+
465
+ template <>
466
+ EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const signed char* from) {
467
+ return pload_common<Packet16c>(from);
468
+ }
469
+
470
+ template <>
471
+ EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const unsigned char* from) {
472
+ return pload_common<Packet16uc>(from);
473
+ }
474
+
475
+ template <>
476
+ EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) {
477
+ return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
478
+ }
479
+
480
+ template <typename Packet>
481
+ EIGEN_ALWAYS_INLINE Packet pload_ignore(const __UNPACK_TYPE__(Packet) * from) {
482
+ // some versions of GCC throw "unused-but-set-parameter".
483
+ // ignoring these warnings for now.
484
+ EIGEN_UNUSED_VARIABLE(from);
241
485
  EIGEN_DEBUG_ALIGNED_LOAD
242
- #ifdef __VSX__
243
- return vec_vsx_ld(0, from);
486
+ // Ignore partial input memory initialized
487
+ #if !EIGEN_COMP_LLVM
488
+ #pragma GCC diagnostic push
489
+ #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
490
+ #endif
491
+ #ifdef EIGEN_VECTORIZE_VSX
492
+ return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
244
493
  #else
245
494
  return vec_ld(0, from);
246
495
  #endif
496
+ #if !EIGEN_COMP_LLVM
497
+ #pragma GCC diagnostic pop
498
+ #endif
499
+ }
500
+
501
+ template <>
502
+ EIGEN_ALWAYS_INLINE Packet8bf pload_ignore<Packet8bf>(const bfloat16* from) {
503
+ return pload_ignore<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
247
504
  }
248
505
 
249
- template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
250
- {
506
+ template <typename Packet>
507
+ EIGEN_ALWAYS_INLINE Packet pload_partial_common(const __UNPACK_TYPE__(Packet) * from, const Index n,
508
+ const Index offset) {
509
+ // some versions of GCC throw "unused-but-set-parameter".
510
+ // ignoring these warnings for now.
511
+ const Index packet_size = unpacket_traits<Packet>::size;
512
+ eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
513
+ const Index size = sizeof(__UNPACK_TYPE__(Packet));
514
+ #ifdef _ARCH_PWR9
515
+ EIGEN_UNUSED_VARIABLE(packet_size);
251
516
  EIGEN_DEBUG_ALIGNED_LOAD
252
- #ifdef __VSX__
253
- return vec_vsx_ld(0, from);
517
+ EIGEN_UNUSED_VARIABLE(from);
518
+ Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
519
+ if (offset) {
520
+ Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
521
+ #ifdef _BIG_ENDIAN
522
+ load = Packet(vec_sro(Packet16uc(load), shift));
254
523
  #else
255
- return vec_ld(0, from);
524
+ load = Packet(vec_slo(Packet16uc(load), shift));
525
+ #endif
526
+ }
527
+ return load;
528
+ #else
529
+ if (n) {
530
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
531
+ unsigned char* load2 = reinterpret_cast<unsigned char*>(load + offset);
532
+ unsigned char* from2 = reinterpret_cast<unsigned char*>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
533
+ Index n2 = n * size;
534
+ if (16 <= n2) {
535
+ pstoreu(load2, ploadu<Packet16uc>(from2));
536
+ } else {
537
+ memcpy((void*)load2, (void*)from2, n2);
538
+ }
539
+ return pload_ignore<Packet>(load);
540
+ } else {
541
+ return Packet(pset1<Packet16uc>(0));
542
+ }
256
543
  #endif
257
544
  }
258
545
 
259
- template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
260
- {
546
+ template <>
547
+ EIGEN_ALWAYS_INLINE Packet4f pload_partial<Packet4f>(const float* from, const Index n, const Index offset) {
548
+ return pload_partial_common<Packet4f>(from, n, offset);
549
+ }
550
+
551
+ template <>
552
+ EIGEN_ALWAYS_INLINE Packet4i pload_partial<Packet4i>(const int* from, const Index n, const Index offset) {
553
+ return pload_partial_common<Packet4i>(from, n, offset);
554
+ }
555
+
556
+ template <>
557
+ EIGEN_ALWAYS_INLINE Packet8s pload_partial<Packet8s>(const short int* from, const Index n, const Index offset) {
558
+ return pload_partial_common<Packet8s>(from, n, offset);
559
+ }
560
+
561
+ template <>
562
+ EIGEN_ALWAYS_INLINE Packet8us pload_partial<Packet8us>(const unsigned short int* from, const Index n,
563
+ const Index offset) {
564
+ return pload_partial_common<Packet8us>(from, n, offset);
565
+ }
566
+
567
+ template <>
568
+ EIGEN_ALWAYS_INLINE Packet8bf pload_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset) {
569
+ return pload_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
570
+ }
571
+
572
+ template <>
573
+ EIGEN_ALWAYS_INLINE Packet16c pload_partial<Packet16c>(const signed char* from, const Index n, const Index offset) {
574
+ return pload_partial_common<Packet16c>(from, n, offset);
575
+ }
576
+
577
+ template <>
578
+ EIGEN_ALWAYS_INLINE Packet16uc pload_partial<Packet16uc>(const unsigned char* from, const Index n, const Index offset) {
579
+ return pload_partial_common<Packet16uc>(from, n, offset);
580
+ }
581
+
582
+ template <typename Packet>
583
+ EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet) * to, const Packet& from) {
584
+ // some versions of GCC throw "unused-but-set-parameter" (float *to).
585
+ // ignoring these warnings for now.
586
+ EIGEN_UNUSED_VARIABLE(to);
261
587
  EIGEN_DEBUG_ALIGNED_STORE
262
- #ifdef __VSX__
263
- vec_vsx_st(from, 0, to);
588
+ #ifdef EIGEN_VECTORIZE_VSX
589
+ vec_xst(from, 0, to);
264
590
  #else
265
591
  vec_st(from, 0, to);
266
592
  #endif
267
593
  }
268
594
 
269
- template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from)
270
- {
595
+ template <>
596
+ EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
597
+ pstore_common<Packet4f>(to, from);
598
+ }
599
+
600
+ template <>
601
+ EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
602
+ pstore_common<Packet4i>(to, from);
603
+ }
604
+
605
+ template <>
606
+ EIGEN_STRONG_INLINE void pstore<short int>(short int* to, const Packet8s& from) {
607
+ pstore_common<Packet8s>(to, from);
608
+ }
609
+
610
+ template <>
611
+ EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int* to, const Packet8us& from) {
612
+ pstore_common<Packet8us>(to, from);
613
+ }
614
+
615
+ template <>
616
+ EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) {
617
+ pstore_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val);
618
+ }
619
+
620
+ template <>
621
+ EIGEN_STRONG_INLINE void pstore<signed char>(signed char* to, const Packet16c& from) {
622
+ pstore_common<Packet16c>(to, from);
623
+ }
624
+
625
+ template <>
626
+ EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char* to, const Packet16uc& from) {
627
+ pstore_common<Packet16uc>(to, from);
628
+ }
629
+
630
+ template <typename Packet>
631
+ EIGEN_ALWAYS_INLINE void pstore_partial_common(__UNPACK_TYPE__(Packet) * to, const Packet& from, const Index n,
632
+ const Index offset) {
633
+ // some versions of GCC throw "unused-but-set-parameter" (float *to).
634
+ // ignoring these warnings for now.
635
+ const Index packet_size = unpacket_traits<Packet>::size;
636
+ eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
637
+ const Index size = sizeof(__UNPACK_TYPE__(Packet));
638
+ #ifdef _ARCH_PWR9
639
+ EIGEN_UNUSED_VARIABLE(packet_size);
640
+ EIGEN_UNUSED_VARIABLE(to);
271
641
  EIGEN_DEBUG_ALIGNED_STORE
272
- #ifdef __VSX__
273
- vec_vsx_st(from, 0, to);
642
+ Packet store = from;
643
+ if (offset) {
644
+ Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
645
+ #ifdef _BIG_ENDIAN
646
+ store = Packet(vec_slo(Packet16uc(store), shift));
274
647
  #else
275
- vec_st(from, 0, to);
648
+ store = Packet(vec_sro(Packet16uc(store), shift));
649
+ #endif
650
+ }
651
+ vec_xst_len(store, to, n * size);
652
+ #else
653
+ if (n) {
654
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
655
+ pstore(store, from);
656
+ unsigned char* store2 = reinterpret_cast<unsigned char*>(store + offset);
657
+ unsigned char* to2 = reinterpret_cast<unsigned char*>(to);
658
+ Index n2 = n * size;
659
+ if (16 <= n2) {
660
+ pstore(to2, ploadu<Packet16uc>(store2));
661
+ } else {
662
+ memcpy((void*)to2, (void*)store2, n2);
663
+ }
664
+ }
276
665
  #endif
277
666
  }
278
667
 
279
- template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
280
- Packet4f v = {from, from, from, from};
668
+ template <>
669
+ EIGEN_ALWAYS_INLINE void pstore_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset) {
670
+ pstore_partial_common<Packet4f>(to, from, n, offset);
671
+ }
672
+
673
+ template <>
674
+ EIGEN_ALWAYS_INLINE void pstore_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset) {
675
+ pstore_partial_common<Packet4i>(to, from, n, offset);
676
+ }
677
+
678
+ template <>
679
+ EIGEN_ALWAYS_INLINE void pstore_partial<short int>(short int* to, const Packet8s& from, const Index n,
680
+ const Index offset) {
681
+ pstore_partial_common<Packet8s>(to, from, n, offset);
682
+ }
683
+
684
+ template <>
685
+ EIGEN_ALWAYS_INLINE void pstore_partial<unsigned short int>(unsigned short int* to, const Packet8us& from,
686
+ const Index n, const Index offset) {
687
+ pstore_partial_common<Packet8us>(to, from, n, offset);
688
+ }
689
+
690
+ template <>
691
+ EIGEN_ALWAYS_INLINE void pstore_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n,
692
+ const Index offset) {
693
+ pstore_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val, n, offset);
694
+ }
695
+
696
+ template <>
697
+ EIGEN_ALWAYS_INLINE void pstore_partial<signed char>(signed char* to, const Packet16c& from, const Index n,
698
+ const Index offset) {
699
+ pstore_partial_common<Packet16c>(to, from, n, offset);
700
+ }
701
+
702
+ template <>
703
+ EIGEN_ALWAYS_INLINE void pstore_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n,
704
+ const Index offset) {
705
+ pstore_partial_common<Packet16uc>(to, from, n, offset);
706
+ }
707
+
708
+ template <typename Packet>
709
+ EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet) & from) {
710
+ Packet v = {from, from, from, from};
281
711
  return v;
282
712
  }
283
713
 
284
- template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
285
- Packet4i v = {from, from, from, from};
714
+ template <typename Packet>
715
+ EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet) & from) {
716
+ Packet v = {from, from, from, from, from, from, from, from};
286
717
  return v;
287
718
  }
288
- template<> EIGEN_STRONG_INLINE void
289
- pbroadcast4<Packet4f>(const float *a,
290
- Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
291
- {
292
- a3 = pload<Packet4f>(a);
293
- a0 = vec_splat(a3, 0);
294
- a1 = vec_splat(a3, 1);
295
- a2 = vec_splat(a3, 2);
296
- a3 = vec_splat(a3, 3);
719
+
720
+ template <typename Packet>
721
+ EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet) & from) {
722
+ Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from};
723
+ return v;
724
+ }
725
+
726
+ template <>
727
+ EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
728
+ return pset1_size4<Packet4f>(from);
729
+ }
730
+
731
+ template <>
732
+ EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
733
+ return pset1_size4<Packet4i>(from);
734
+ }
735
+
736
+ template <>
737
+ EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int& from) {
738
+ return pset1_size8<Packet8s>(from);
739
+ }
740
+
741
+ template <>
742
+ EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int& from) {
743
+ return pset1_size8<Packet8us>(from);
744
+ }
745
+
746
+ template <>
747
+ EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const signed char& from) {
748
+ return pset1_size16<Packet16c>(from);
749
+ }
750
+
751
+ template <>
752
+ EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const unsigned char& from) {
753
+ return pset1_size16<Packet16uc>(from);
754
+ }
755
+
756
+ template <>
757
+ EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
758
+ return reinterpret_cast<Packet4f>(pset1<Packet4i>(from));
759
+ }
760
+
761
+ template <>
762
+ EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
763
+ return pset1_size8<Packet8us>(reinterpret_cast<const unsigned short int&>(from));
297
764
  }
298
- template<> EIGEN_STRONG_INLINE void
299
- pbroadcast4<Packet4i>(const int *a,
300
- Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
301
- {
302
- a3 = pload<Packet4i>(a);
765
+
766
+ template <typename Packet>
767
+ EIGEN_STRONG_INLINE void pbroadcast4_common(const __UNPACK_TYPE__(Packet) * a, Packet& a0, Packet& a1, Packet& a2,
768
+ Packet& a3) {
769
+ a3 = pload<Packet>(a);
303
770
  a0 = vec_splat(a3, 0);
304
771
  a1 = vec_splat(a3, 1);
305
772
  a2 = vec_splat(a3, 2);
306
773
  a3 = vec_splat(a3, 3);
307
774
  }
308
775
 
309
- template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
310
- {
311
- float EIGEN_ALIGN16 af[4];
312
- af[0] = from[0*stride];
313
- af[1] = from[1*stride];
314
- af[2] = from[2*stride];
315
- af[3] = from[3*stride];
316
- return pload<Packet4f>(af);
317
- }
318
- template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
319
- {
320
- int EIGEN_ALIGN16 ai[4];
321
- ai[0] = from[0*stride];
322
- ai[1] = from[1*stride];
323
- ai[2] = from[2*stride];
324
- ai[3] = from[3*stride];
325
- return pload<Packet4i>(ai);
326
- }
327
- template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
328
- {
329
- float EIGEN_ALIGN16 af[4];
330
- pstore<float>(af, from);
331
- to[0*stride] = af[0];
332
- to[1*stride] = af[1];
333
- to[2*stride] = af[2];
334
- to[3*stride] = af[3];
335
- }
336
- template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
337
- {
338
- int EIGEN_ALIGN16 ai[4];
339
- pstore<int>((int *)ai, from);
340
- to[0*stride] = ai[0];
341
- to[1*stride] = ai[1];
342
- to[2*stride] = ai[2];
343
- to[3*stride] = ai[3];
344
- }
345
-
346
- template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
347
- template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return pset1<Packet4i>(a) + p4i_COUNTDOWN; }
348
-
349
- template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return a + b; }
350
- template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return a + b; }
351
-
352
- template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return a - b; }
353
- template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return a - b; }
354
-
355
- template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; }
356
- template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; }
357
-
358
- template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
359
- template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
360
-
361
- template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); }
362
- template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return a * b; }
363
-
364
- template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
365
- {
366
- #ifndef __VSX__ // VSX actually provides a div instruction
367
- Packet4f t, y_0, y_1;
776
+ template <>
777
+ EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
778
+ pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
779
+ }
780
+ template <>
781
+ EIGEN_STRONG_INLINE void pbroadcast4<Packet4i>(const int* a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) {
782
+ pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
783
+ }
368
784
 
369
- // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
370
- y_0 = vec_re(b);
785
+ template <typename Packet>
786
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_common(const __UNPACK_TYPE__(Packet) * from, Index stride,
787
+ const Index n = unpacket_traits<Packet>::size) {
788
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
789
+ eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
790
+ if (stride == 1) {
791
+ if (n == unpacket_traits<Packet>::size) {
792
+ return ploadu<Packet>(from);
793
+ } else {
794
+ return ploadu_partial<Packet>(from, n);
795
+ }
796
+ } else {
797
+ LOAD_STORE_UNROLL_16
798
+ for (Index i = 0; i < n; i++) {
799
+ a[i] = from[i * stride];
800
+ }
801
+ // Leave rest of the array uninitialized
802
+ return pload_ignore<Packet>(a);
803
+ }
804
+ }
371
805
 
372
- // Do one Newton-Raphson iteration to get the needed accuracy
373
- t = vec_nmsub(y_0, b, p4f_ONE);
374
- y_1 = vec_madd(y_0, t, y_0);
806
+ template <>
807
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
808
+ return pgather_common<Packet4f>(from, stride);
809
+ }
375
810
 
376
- return vec_madd(a, y_1, p4f_MZERO);
377
- #else
378
- return vec_div(a, b);
379
- #endif
811
+ template <>
812
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
813
+ return pgather_common<Packet4i>(from, stride);
380
814
  }
381
815
 
382
- template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
383
- { eigen_assert(false && "packet integer division are not supported by AltiVec");
384
- return pset1<Packet4i>(0);
816
+ template <>
817
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather<short int, Packet8s>(const short int* from, Index stride) {
818
+ return pgather_common<Packet8s>(from, stride);
385
819
  }
386
820
 
387
- // for some weird raisons, it has to be overloaded for packet of integers
388
- template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
389
- template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }
821
+ template <>
822
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from,
823
+ Index stride) {
824
+ return pgather_common<Packet8us>(from, stride);
825
+ }
390
826
 
391
- template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
392
- {
393
- #ifdef __VSX__
394
- Packet4f ret;
395
- __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
396
- return ret;
397
- #else
398
- return vec_min(a, b);
399
- #endif
827
+ template <>
828
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride) {
829
+ return pgather_common<Packet8bf>(from, stride);
400
830
  }
401
- template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
402
831
 
403
- template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
404
- {
405
- #ifdef __VSX__
406
- Packet4f ret;
407
- __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
408
- return ret;
409
- #else
410
- return vec_max(a, b);
411
- #endif
832
+ template <>
833
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride) {
834
+ return pgather_common<Packet16c>(from, stride);
412
835
  }
413
- template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
414
836
 
415
- template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
416
- template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
837
+ template <>
838
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from,
839
+ Index stride) {
840
+ return pgather_common<Packet16uc>(from, stride);
841
+ }
417
842
 
418
- template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
419
- template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
843
+ template <>
844
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather_partial<float, Packet4f>(const float* from, Index stride,
845
+ const Index n) {
846
+ return pgather_common<Packet4f>(from, stride, n);
847
+ }
420
848
 
421
- template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
422
- template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
849
+ template <>
850
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather_partial<int, Packet4i>(const int* from, Index stride,
851
+ const Index n) {
852
+ return pgather_common<Packet4i>(from, stride, n);
853
+ }
423
854
 
424
- template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
425
- template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
855
+ template <>
856
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather_partial<short int, Packet8s>(const short int* from, Index stride,
857
+ const Index n) {
858
+ return pgather_common<Packet8s>(from, stride, n);
859
+ }
426
860
 
427
- template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return vec_round(a); }
428
- template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return vec_ceil(a); }
429
- template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
861
+ template <>
862
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us
863
+ pgather_partial<unsigned short int, Packet8us>(const unsigned short int* from, Index stride, const Index n) {
864
+ return pgather_common<Packet8us>(from, stride, n);
865
+ }
430
866
 
431
- #ifdef _BIG_ENDIAN
432
- template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
433
- {
434
- EIGEN_DEBUG_ALIGNED_LOAD
435
- Packet16uc MSQ, LSQ;
436
- Packet16uc mask;
437
- MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
438
- LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
439
- mask = vec_lvsl(0, from); // create the permute mask
440
- return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data
867
+ template <>
868
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather_partial<bfloat16, Packet8bf>(const bfloat16* from, Index stride,
869
+ const Index n) {
870
+ return pgather_common<Packet8bf>(from, stride, n);
871
+ }
441
872
 
873
+ template <>
874
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather_partial<signed char, Packet16c>(const signed char* from,
875
+ Index stride, const Index n) {
876
+ return pgather_common<Packet16c>(from, stride, n);
442
877
  }
443
- template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
444
- {
445
- EIGEN_DEBUG_ALIGNED_LOAD
446
- // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
447
- Packet16uc MSQ, LSQ;
448
- Packet16uc mask;
449
- MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
450
- LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
451
- mask = vec_lvsl(0, from); // create the permute mask
452
- return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data
878
+
879
+ template <>
880
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather_partial<unsigned char, Packet16uc>(const unsigned char* from,
881
+ Index stride,
882
+ const Index n) {
883
+ return pgather_common<Packet16uc>(from, stride, n);
453
884
  }
454
- #else
455
- // We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
456
- template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
457
- {
458
- EIGEN_DEBUG_UNALIGNED_LOAD
459
- return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from));
885
+
886
+ template <typename Packet>
887
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_common(__UNPACK_TYPE__(Packet) * to, const Packet& from,
888
+ Index stride,
889
+ const Index n = unpacket_traits<Packet>::size) {
890
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
891
+ eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
892
+ if (stride == 1) {
893
+ if (n == unpacket_traits<Packet>::size) {
894
+ return pstoreu(to, from);
895
+ } else {
896
+ return pstoreu_partial(to, from, n);
897
+ }
898
+ } else {
899
+ pstore<__UNPACK_TYPE__(Packet)>(a, from);
900
+ LOAD_STORE_UNROLL_16
901
+ for (Index i = 0; i < n; i++) {
902
+ to[i * stride] = a[i];
903
+ }
904
+ }
460
905
  }
461
- template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
462
- {
463
- EIGEN_DEBUG_UNALIGNED_LOAD
464
- return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));
906
+
907
+ template <>
908
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
909
+ pscatter_common<Packet4f>(to, from, stride);
465
910
  }
466
- #endif
467
911
 
468
- template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
469
- {
470
- Packet4f p;
471
- if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4f>(from);
472
- else p = ploadu<Packet4f>(from);
473
- return vec_perm(p, p, p16uc_DUPLICATE32_HI);
474
- }
475
- template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
476
- {
477
- Packet4i p;
478
- if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4i>(from);
479
- else p = ploadu<Packet4i>(from);
480
- return vec_perm(p, p, p16uc_DUPLICATE32_HI);
912
+ template <>
913
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
914
+ pscatter_common<Packet4i>(to, from, stride);
915
+ }
916
+
917
+ template <>
918
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<short int, Packet8s>(short int* to, const Packet8s& from,
919
+ Index stride) {
920
+ pscatter_common<Packet8s>(to, from, stride);
921
+ }
922
+
923
+ template <>
924
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned short int, Packet8us>(unsigned short int* to,
925
+ const Packet8us& from,
926
+ Index stride) {
927
+ pscatter_common<Packet8us>(to, from, stride);
928
+ }
929
+
930
+ template <>
931
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from,
932
+ Index stride) {
933
+ pscatter_common<Packet8bf>(to, from, stride);
934
+ }
935
+
936
+ template <>
937
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from,
938
+ Index stride) {
939
+ pscatter_common<Packet16c>(to, from, stride);
940
+ }
941
+
942
+ template <>
943
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned char, Packet16uc>(unsigned char* to,
944
+ const Packet16uc& from, Index stride) {
945
+ pscatter_common<Packet16uc>(to, from, stride);
946
+ }
947
+
948
+ template <>
949
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<float, Packet4f>(float* to, const Packet4f& from,
950
+ Index stride, const Index n) {
951
+ pscatter_common<Packet4f>(to, from, stride, n);
952
+ }
953
+
954
+ template <>
955
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<int, Packet4i>(int* to, const Packet4i& from, Index stride,
956
+ const Index n) {
957
+ pscatter_common<Packet4i>(to, from, stride, n);
958
+ }
959
+
960
+ template <>
961
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<short int, Packet8s>(short int* to, const Packet8s& from,
962
+ Index stride, const Index n) {
963
+ pscatter_common<Packet8s>(to, from, stride, n);
964
+ }
965
+
966
+ template <>
967
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned short int, Packet8us>(unsigned short int* to,
968
+ const Packet8us& from,
969
+ Index stride,
970
+ const Index n) {
971
+ pscatter_common<Packet8us>(to, from, stride, n);
972
+ }
973
+
974
+ template <>
975
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from,
976
+ Index stride, const Index n) {
977
+ pscatter_common<Packet8bf>(to, from, stride, n);
978
+ }
979
+
980
+ template <>
981
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<signed char, Packet16c>(signed char* to,
982
+ const Packet16c& from, Index stride,
983
+ const Index n) {
984
+ pscatter_common<Packet16c>(to, from, stride, n);
985
+ }
986
+
987
+ template <>
988
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned char, Packet16uc>(unsigned char* to,
989
+ const Packet16uc& from,
990
+ Index stride, const Index n) {
991
+ pscatter_common<Packet16uc>(to, from, stride, n);
992
+ }
993
+
994
+ template <>
995
+ EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
996
+ return pset1<Packet4f>(a) + p4f_COUNTDOWN;
997
+ }
998
+ template <>
999
+ EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
1000
+ return pset1<Packet4i>(a) + p4i_COUNTDOWN;
1001
+ }
1002
+ template <>
1003
+ EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const short int& a) {
1004
+ return pset1<Packet8s>(a) + p8s_COUNTDOWN;
1005
+ }
1006
+ template <>
1007
+ EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const unsigned short int& a) {
1008
+ return pset1<Packet8us>(a) + p8us_COUNTDOWN;
1009
+ }
1010
+ template <>
1011
+ EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const signed char& a) {
1012
+ return pset1<Packet16c>(a) + p16c_COUNTDOWN;
1013
+ }
1014
+ template <>
1015
+ EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const unsigned char& a) {
1016
+ return pset1<Packet16uc>(a) + p16uc_COUNTDOWN;
1017
+ }
1018
+
1019
+ template <>
1020
+ EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
1021
+ return a + b;
1022
+ }
1023
+ template <>
1024
+ EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
1025
+ return a + b;
1026
+ }
1027
+ template <>
1028
+ EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1029
+ return a + b;
1030
+ }
1031
+ template <>
1032
+ EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) {
1033
+ return a + b;
1034
+ }
1035
+ template <>
1036
+ EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) {
1037
+ return a + b;
1038
+ }
1039
+ template <>
1040
+ EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) {
1041
+ return a + b;
1042
+ }
1043
+ template <>
1044
+ EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1045
+ return a + b;
1046
+ }
1047
+
1048
+ template <>
1049
+ EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
1050
+ return a - b;
1051
+ }
1052
+ template <>
1053
+ EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
1054
+ return a - b;
1055
+ }
1056
+ template <>
1057
+ EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) {
1058
+ return a - b;
1059
+ }
1060
+ template <>
1061
+ EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) {
1062
+ return a - b;
1063
+ }
1064
+ template <>
1065
+ EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) {
1066
+ return a - b;
1067
+ }
1068
+ template <>
1069
+ EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1070
+ return a - b;
1071
+ }
1072
+
1073
+ template <>
1074
+ EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
1075
+ #ifdef __POWER8_VECTOR__
1076
+ return vec_neg(a);
1077
+ #else
1078
+ return vec_xor(a, p4f_MZERO);
1079
+ #endif
1080
+ }
1081
+ template <>
1082
+ EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) {
1083
+ #ifdef __POWER8_VECTOR__
1084
+ return vec_neg(a);
1085
+ #else
1086
+ return reinterpret_cast<Packet16c>(p4i_ZERO) - a;
1087
+ #endif
1088
+ }
1089
+ template <>
1090
+ EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
1091
+ #ifdef __POWER8_VECTOR__
1092
+ return vec_neg(a);
1093
+ #else
1094
+ return reinterpret_cast<Packet8s>(p4i_ZERO) - a;
1095
+ #endif
1096
+ }
1097
+ template <>
1098
+ EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
1099
+ #ifdef __POWER8_VECTOR__
1100
+ return vec_neg(a);
1101
+ #else
1102
+ return p4i_ZERO - a;
1103
+ #endif
1104
+ }
1105
+
1106
+ template <>
1107
+ EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
1108
+ return a;
1109
+ }
1110
+ template <>
1111
+ EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
1112
+ return a;
1113
+ }
1114
+
1115
+ template <>
1116
+ EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
1117
+ return vec_madd(a, b, p4f_MZERO);
1118
+ }
1119
+ template <>
1120
+ EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
1121
+ return a * b;
1122
+ }
1123
+ template <>
1124
+ EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) {
1125
+ return vec_mul(a, b);
1126
+ }
1127
+ template <>
1128
+ EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) {
1129
+ return vec_mul(a, b);
1130
+ }
1131
+ template <>
1132
+ EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) {
1133
+ return vec_mul(a, b);
1134
+ }
1135
+ template <>
1136
+ EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1137
+ return vec_mul(a, b);
1138
+ }
1139
+
1140
+ template <>
1141
+ EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
1142
+ #ifndef __VSX__ // VSX actually provides a div instruction
1143
+ Packet4f t, y_0, y_1;
1144
+
1145
+ // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
1146
+ y_0 = vec_re(b);
1147
+
1148
+ // Do one Newton-Raphson iteration to get the needed accuracy
1149
+ t = vec_nmsub(y_0, b, p4f_ONE);
1150
+ y_1 = vec_madd(y_0, t, y_0);
1151
+
1152
+ return vec_madd(a, y_1, p4f_MZERO);
1153
+ #else
1154
+ return vec_div(a, b);
1155
+ #endif
1156
+ }
1157
+
1158
+ template <>
1159
+ EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
1160
+ #if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
1161
+ return vec_div(a, b);
1162
+ #else
1163
+ EIGEN_UNUSED_VARIABLE(a);
1164
+ EIGEN_UNUSED_VARIABLE(b);
1165
+ eigen_assert(false && "packet integer division are not supported by AltiVec");
1166
+ return pset1<Packet4i>(0);
1167
+ #endif
1168
+ }
1169
+
1170
+ // for some weird raisons, it has to be overloaded for packet of integers
1171
+ template <>
1172
+ EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1173
+ return vec_madd(a, b, c);
1174
+ }
1175
+ template <>
1176
+ EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
1177
+ return a * b + c;
1178
+ }
1179
+ template <>
1180
+ EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
1181
+ return vec_madd(a, b, c);
1182
+ }
1183
+ template <>
1184
+ EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) {
1185
+ return vec_madd(a, b, c);
1186
+ }
1187
+
1188
+ #ifdef EIGEN_VECTORIZE_VSX
1189
+ template <>
1190
+ EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1191
+ return vec_msub(a, b, c);
1192
+ }
1193
+ template <>
1194
+ EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1195
+ return vec_nmsub(a, b, c);
1196
+ }
1197
+ template <>
1198
+ EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1199
+ return vec_nmadd(a, b, c);
1200
+ }
1201
+ #endif
1202
+
1203
+ template <>
1204
+ EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
1205
+ #ifdef EIGEN_VECTORIZE_VSX
1206
+ // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
1207
+ Packet4f ret;
1208
+ __asm__("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
1209
+ return ret;
1210
+ #else
1211
+ return vec_min(a, b);
1212
+ #endif
1213
+ }
1214
+ template <>
1215
+ EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
1216
+ return vec_min(a, b);
1217
+ }
1218
+ template <>
1219
+ EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) {
1220
+ return vec_min(a, b);
1221
+ }
1222
+ template <>
1223
+ EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) {
1224
+ return vec_min(a, b);
1225
+ }
1226
+ template <>
1227
+ EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) {
1228
+ return vec_min(a, b);
1229
+ }
1230
+ template <>
1231
+ EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1232
+ return vec_min(a, b);
1233
+ }
1234
+
1235
+ template <>
1236
+ EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
1237
+ #ifdef EIGEN_VECTORIZE_VSX
1238
+ // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
1239
+ Packet4f ret;
1240
+ __asm__("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
1241
+ return ret;
1242
+ #else
1243
+ return vec_max(a, b);
1244
+ #endif
1245
+ }
1246
+ template <>
1247
+ EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
1248
+ return vec_max(a, b);
1249
+ }
1250
+ template <>
1251
+ EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) {
1252
+ return vec_max(a, b);
1253
+ }
1254
+ template <>
1255
+ EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) {
1256
+ return vec_max(a, b);
1257
+ }
1258
+ template <>
1259
+ EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) {
1260
+ return vec_max(a, b);
1261
+ }
1262
+ template <>
1263
+ EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1264
+ return vec_max(a, b);
1265
+ }
1266
+
1267
+ template <>
1268
+ EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
1269
+ return reinterpret_cast<Packet4f>(vec_cmple(a, b));
1270
+ }
1271
+ // To fix bug with vec_cmplt on older versions
1272
+ #ifdef EIGEN_VECTORIZE_VSX
1273
+ template <>
1274
+ EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) {
1275
+ return reinterpret_cast<Packet4f>(vec_cmplt(a, b));
1276
+ }
1277
+ #endif
1278
+ template <>
1279
+ EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) {
1280
+ return reinterpret_cast<Packet4f>(vec_cmpeq(a, b));
1281
+ }
1282
+ template <>
1283
+ EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
1284
+ Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a, b));
1285
+ return vec_nor(c, c);
1286
+ }
1287
+
1288
+ #ifdef EIGEN_VECTORIZE_VSX
1289
+ template <>
1290
+ EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
1291
+ return reinterpret_cast<Packet4i>(vec_cmple(a, b));
1292
+ }
1293
+ #endif
1294
+ template <>
1295
+ EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) {
1296
+ return reinterpret_cast<Packet4i>(vec_cmplt(a, b));
1297
+ }
1298
+ template <>
1299
+ EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) {
1300
+ return reinterpret_cast<Packet4i>(vec_cmpeq(a, b));
1301
+ }
1302
+ #ifdef EIGEN_VECTORIZE_VSX
1303
+ template <>
1304
+ EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) {
1305
+ return reinterpret_cast<Packet8s>(vec_cmple(a, b));
1306
+ }
1307
+ #endif
1308
+ template <>
1309
+ EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) {
1310
+ return reinterpret_cast<Packet8s>(vec_cmplt(a, b));
1311
+ }
1312
+ template <>
1313
+ EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) {
1314
+ return reinterpret_cast<Packet8s>(vec_cmpeq(a, b));
1315
+ }
1316
+ #ifdef EIGEN_VECTORIZE_VSX
1317
+ template <>
1318
+ EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) {
1319
+ return reinterpret_cast<Packet8us>(vec_cmple(a, b));
1320
+ }
1321
+ #endif
1322
+ template <>
1323
+ EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) {
1324
+ return reinterpret_cast<Packet8us>(vec_cmplt(a, b));
1325
+ }
1326
+ template <>
1327
+ EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) {
1328
+ return reinterpret_cast<Packet8us>(vec_cmpeq(a, b));
1329
+ }
1330
+ #ifdef EIGEN_VECTORIZE_VSX
1331
+ template <>
1332
+ EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) {
1333
+ return reinterpret_cast<Packet16c>(vec_cmple(a, b));
1334
+ }
1335
+ #endif
1336
+ template <>
1337
+ EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) {
1338
+ return reinterpret_cast<Packet16c>(vec_cmplt(a, b));
1339
+ }
1340
+ template <>
1341
+ EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) {
1342
+ return reinterpret_cast<Packet16c>(vec_cmpeq(a, b));
1343
+ }
1344
+ #ifdef EIGEN_VECTORIZE_VSX
1345
+ template <>
1346
+ EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) {
1347
+ return reinterpret_cast<Packet16uc>(vec_cmple(a, b));
1348
+ }
1349
+ #endif
1350
+ template <>
1351
+ EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) {
1352
+ return reinterpret_cast<Packet16uc>(vec_cmplt(a, b));
1353
+ }
1354
+ template <>
1355
+ EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) {
1356
+ return reinterpret_cast<Packet16uc>(vec_cmpeq(a, b));
1357
+ }
1358
+
1359
+ template <>
1360
+ EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
1361
+ return vec_and(a, b);
1362
+ }
1363
+ template <>
1364
+ EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
1365
+ return vec_and(a, b);
1366
+ }
1367
+ template <>
1368
+ EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1369
+ return vec_and(a, b);
1370
+ }
1371
+ template <>
1372
+ EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) {
1373
+ return vec_and(a, b);
1374
+ }
1375
+ template <>
1376
+ EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1377
+ return pand<Packet8us>(a, b);
1378
+ }
1379
+
1380
+ template <>
1381
+ EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
1382
+ return vec_or(a, b);
1383
+ }
1384
+ template <>
1385
+ EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
1386
+ return vec_or(a, b);
1387
+ }
1388
+ template <>
1389
+ EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) {
1390
+ return vec_or(a, b);
1391
+ }
1392
+ template <>
1393
+ EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) {
1394
+ return vec_or(a, b);
1395
+ }
1396
+ template <>
1397
+ EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1398
+ return por<Packet8us>(a, b);
1399
+ }
1400
+
1401
+ template <>
1402
+ EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
1403
+ return vec_xor(a, b);
1404
+ }
1405
+ template <>
1406
+ EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
1407
+ return vec_xor(a, b);
1408
+ }
1409
+ template <>
1410
+ EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) {
1411
+ return vec_xor(a, b);
1412
+ }
1413
+ template <>
1414
+ EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1415
+ return pxor<Packet8us>(a, b);
1416
+ }
1417
+
1418
+ template <>
1419
+ EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
1420
+ return vec_andc(a, b);
1421
+ }
1422
+ template <>
1423
+ EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
1424
+ return vec_andc(a, b);
1425
+ }
1426
+
1427
+ template <>
1428
+ EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
1429
+ return vec_sel(b, a, reinterpret_cast<Packet4ui>(mask));
1430
+ }
1431
+
1432
+ template <>
1433
+ EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
1434
+ Packet4f t = vec_add(
1435
+ reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
1436
+ Packet4f res;
1437
+
1438
+ #ifdef EIGEN_VECTORIZE_VSX
1439
+ __asm__("xvrspiz %x0, %x1\n\t" : "=&wa"(res) : "wa"(t));
1440
+ #else
1441
+ __asm__("vrfiz %0, %1\n\t" : "=v"(res) : "v"(t));
1442
+ #endif
1443
+
1444
+ return res;
1445
+ }
1446
+ template <>
1447
+ EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
1448
+ return vec_ceil(a);
1449
+ }
1450
+ template <>
1451
+ EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
1452
+ return vec_floor(a);
1453
+ }
1454
+ template <>
1455
+ EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
1456
+ return vec_trunc(a);
1457
+ }
1458
+ #ifdef EIGEN_VECTORIZE_VSX
1459
+ template <>
1460
+ EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
1461
+ Packet4f res;
1462
+
1463
+ __asm__("xvrspic %x0, %x1\n\t" : "=&wa"(res) : "wa"(a));
1464
+
1465
+ return res;
1466
+ }
1467
+ #endif
1468
+
1469
+ template <typename Packet>
1470
+ EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet) * from) {
1471
+ EIGEN_DEBUG_UNALIGNED_LOAD
1472
+ #if defined(EIGEN_VECTORIZE_VSX)
1473
+ return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
1474
+ #else
1475
+ Packet16uc MSQ = vec_ld(0, (unsigned char*)from); // most significant quadword
1476
+ Packet16uc LSQ = vec_ld(15, (unsigned char*)from); // least significant quadword
1477
+ Packet16uc mask = vec_lvsl(0, from); // create the permute mask
1478
+ // TODO: Add static_cast here
1479
+ return (Packet)vec_perm(MSQ, LSQ, mask); // align the data
1480
+ #endif
1481
+ }
1482
+
1483
+ template <>
1484
+ EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
1485
+ return ploadu_common<Packet4f>(from);
1486
+ }
1487
+ template <>
1488
+ EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
1489
+ return ploadu_common<Packet4i>(from);
1490
+ }
1491
+ template <>
1492
+ EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from) {
1493
+ return ploadu_common<Packet8s>(from);
1494
+ }
1495
+ template <>
1496
+ EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from) {
1497
+ return ploadu_common<Packet8us>(from);
1498
+ }
1499
+ template <>
1500
+ EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) {
1501
+ return ploadu_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
1502
+ }
1503
+ template <>
1504
+ EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const signed char* from) {
1505
+ return ploadu_common<Packet16c>(from);
1506
+ }
1507
+ template <>
1508
+ EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from) {
1509
+ return ploadu_common<Packet16uc>(from);
1510
+ }
1511
+
1512
+ template <typename Packet>
1513
+ EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet) * from, const Index n,
1514
+ const Index offset) {
1515
+ const Index packet_size = unpacket_traits<Packet>::size;
1516
+ eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
1517
+ const Index size = sizeof(__UNPACK_TYPE__(Packet));
1518
+ #ifdef _ARCH_PWR9
1519
+ EIGEN_UNUSED_VARIABLE(packet_size);
1520
+ EIGEN_DEBUG_ALIGNED_LOAD
1521
+ EIGEN_DEBUG_UNALIGNED_LOAD
1522
+ Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
1523
+ if (offset) {
1524
+ Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
1525
+ #ifdef _BIG_ENDIAN
1526
+ load = Packet(vec_sro(Packet16uc(load), shift));
1527
+ #else
1528
+ load = Packet(vec_slo(Packet16uc(load), shift));
1529
+ #endif
1530
+ }
1531
+ return load;
1532
+ #else
1533
+ if (n) {
1534
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
1535
+ unsigned char* load2 = reinterpret_cast<unsigned char*>(load + offset);
1536
+ unsigned char* from2 = reinterpret_cast<unsigned char*>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
1537
+ Index n2 = n * size;
1538
+ if (16 <= n2) {
1539
+ pstoreu(load2, ploadu<Packet16uc>(from2));
1540
+ } else {
1541
+ memcpy((void*)load2, (void*)from2, n2);
1542
+ }
1543
+ return pload_ignore<Packet>(load);
1544
+ } else {
1545
+ return Packet(pset1<Packet16uc>(0));
1546
+ }
1547
+ #endif
1548
+ }
1549
+
1550
+ template <>
1551
+ EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n, const Index offset) {
1552
+ return ploadu_partial_common<Packet4f>(from, n, offset);
1553
+ }
1554
+ template <>
1555
+ EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n, const Index offset) {
1556
+ return ploadu_partial_common<Packet4i>(from, n, offset);
1557
+ }
1558
+ template <>
1559
+ EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n, const Index offset) {
1560
+ return ploadu_partial_common<Packet8s>(from, n, offset);
1561
+ }
1562
+ template <>
1563
+ EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n,
1564
+ const Index offset) {
1565
+ return ploadu_partial_common<Packet8us>(from, n, offset);
1566
+ }
1567
+ template <>
1568
+ EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset) {
1569
+ return ploadu_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
1570
+ }
1571
+ template <>
1572
+ EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n, const Index offset) {
1573
+ return ploadu_partial_common<Packet16c>(from, n, offset);
1574
+ }
1575
+ template <>
1576
+ EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n,
1577
+ const Index offset) {
1578
+ return ploadu_partial_common<Packet16uc>(from, n, offset);
1579
+ }
1580
+
1581
+ template <typename Packet>
1582
+ EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet) * from) {
1583
+ Packet p;
1584
+ if ((std::ptrdiff_t(from) % 16) == 0)
1585
+ p = pload<Packet>(from);
1586
+ else
1587
+ p = ploadu<Packet>(from);
1588
+ return vec_mergeh(p, p);
1589
+ }
1590
+ template <>
1591
+ EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
1592
+ return ploaddup_common<Packet4f>(from);
1593
+ }
1594
+ template <>
1595
+ EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
1596
+ return ploaddup_common<Packet4i>(from);
1597
+ }
1598
+
1599
+ template <>
1600
+ EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int* from) {
1601
+ Packet8s p;
1602
+ if ((std::ptrdiff_t(from) % 16) == 0)
1603
+ p = pload<Packet8s>(from);
1604
+ else
1605
+ p = ploadu<Packet8s>(from);
1606
+ return vec_mergeh(p, p);
1607
+ }
1608
+
1609
+ template <>
1610
+ EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int* from) {
1611
+ Packet8us p;
1612
+ if ((std::ptrdiff_t(from) % 16) == 0)
1613
+ p = pload<Packet8us>(from);
1614
+ else
1615
+ p = ploadu<Packet8us>(from);
1616
+ return vec_mergeh(p, p);
1617
+ }
1618
+
1619
+ template <>
1620
+ EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int* from) {
1621
+ Packet8s p;
1622
+ if ((std::ptrdiff_t(from) % 16) == 0)
1623
+ p = pload<Packet8s>(from);
1624
+ else
1625
+ p = ploadu<Packet8s>(from);
1626
+ return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
1627
+ }
1628
+
1629
+ template <>
1630
+ EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int* from) {
1631
+ Packet8us p;
1632
+ if ((std::ptrdiff_t(from) % 16) == 0)
1633
+ p = pload<Packet8us>(from);
1634
+ else
1635
+ p = ploadu<Packet8us>(from);
1636
+ return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
1637
+ }
1638
+
1639
+ template <>
1640
+ EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from) {
1641
+ return ploadquad<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
1642
+ }
1643
+
1644
+ template <>
1645
+ EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char* from) {
1646
+ Packet16c p;
1647
+ if ((std::ptrdiff_t(from) % 16) == 0)
1648
+ p = pload<Packet16c>(from);
1649
+ else
1650
+ p = ploadu<Packet16c>(from);
1651
+ return vec_mergeh(p, p);
1652
+ }
1653
+
1654
+ template <>
1655
+ EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char* from) {
1656
+ Packet16uc p;
1657
+ if ((std::ptrdiff_t(from) % 16) == 0)
1658
+ p = pload<Packet16uc>(from);
1659
+ else
1660
+ p = ploadu<Packet16uc>(from);
1661
+ return vec_mergeh(p, p);
1662
+ }
1663
+
1664
+ template <>
1665
+ EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const signed char* from) {
1666
+ Packet16c p;
1667
+ if ((std::ptrdiff_t(from) % 16) == 0)
1668
+ p = pload<Packet16c>(from);
1669
+ else
1670
+ p = ploadu<Packet16c>(from);
1671
+ return vec_perm(p, p, p16uc_QUADRUPLICATE16);
1672
+ }
1673
+
1674
+ template <>
1675
+ EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const unsigned char* from) {
1676
+ Packet16uc p;
1677
+ if ((std::ptrdiff_t(from) % 16) == 0)
1678
+ p = pload<Packet16uc>(from);
1679
+ else
1680
+ p = ploadu<Packet16uc>(from);
1681
+ return vec_perm(p, p, p16uc_QUADRUPLICATE16);
1682
+ }
1683
+
1684
+ template <typename Packet>
1685
+ EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet) * to, const Packet& from) {
1686
+ EIGEN_DEBUG_UNALIGNED_STORE
1687
+ #if defined(EIGEN_VECTORIZE_VSX)
1688
+ vec_xst(from, 0, to);
1689
+ #else
1690
+ // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
1691
+ // Warning: not thread safe!
1692
+ Packet16uc MSQ, LSQ, edges;
1693
+ Packet16uc edgeAlign, align;
1694
+
1695
+ MSQ = vec_ld(0, (unsigned char*)to); // most significant quadword
1696
+ LSQ = vec_ld(15, (unsigned char*)to); // least significant quadword
1697
+ edgeAlign = vec_lvsl(0, to); // permute map to extract edges
1698
+ edges = vec_perm(LSQ, MSQ, edgeAlign); // extract the edges
1699
+ align = vec_lvsr(0, to); // permute map to misalign data
1700
+ MSQ = vec_perm(edges, (Packet16uc)from, align); // misalign the data (MSQ)
1701
+ LSQ = vec_perm((Packet16uc)from, edges, align); // misalign the data (LSQ)
1702
+ vec_st(LSQ, 15, (unsigned char*)to); // Store the LSQ part first
1703
+ vec_st(MSQ, 0, (unsigned char*)to); // Store the MSQ part second
1704
+ #endif
1705
+ }
1706
+ template <>
1707
+ EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
1708
+ pstoreu_common<Packet4f>(to, from);
1709
+ }
1710
+ template <>
1711
+ EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
1712
+ pstoreu_common<Packet4i>(to, from);
1713
+ }
1714
+ template <>
1715
+ EIGEN_STRONG_INLINE void pstoreu<short int>(short int* to, const Packet8s& from) {
1716
+ pstoreu_common<Packet8s>(to, from);
1717
+ }
1718
+ template <>
1719
+ EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int* to, const Packet8us& from) {
1720
+ pstoreu_common<Packet8us>(to, from);
1721
+ }
1722
+ template <>
1723
+ EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) {
1724
+ pstoreu_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val);
1725
+ }
1726
+ template <>
1727
+ EIGEN_STRONG_INLINE void pstoreu<signed char>(signed char* to, const Packet16c& from) {
1728
+ pstoreu_common<Packet16c>(to, from);
1729
+ }
1730
+ template <>
1731
+ EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* to, const Packet16uc& from) {
1732
+ pstoreu_common<Packet16uc>(to, from);
1733
+ }
1734
+
1735
+ template <typename Packet>
1736
+ EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet) * to, const Packet& from, const Index n,
1737
+ const Index offset) {
1738
+ const Index packet_size = unpacket_traits<Packet>::size;
1739
+ eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
1740
+ const Index size = sizeof(__UNPACK_TYPE__(Packet));
1741
+ #ifdef _ARCH_PWR9
1742
+ EIGEN_UNUSED_VARIABLE(packet_size);
1743
+ EIGEN_DEBUG_UNALIGNED_STORE
1744
+ Packet store = from;
1745
+ if (offset) {
1746
+ Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
1747
+ #ifdef _BIG_ENDIAN
1748
+ store = Packet(vec_slo(Packet16uc(store), shift));
1749
+ #else
1750
+ store = Packet(vec_sro(Packet16uc(store), shift));
1751
+ #endif
1752
+ }
1753
+ vec_xst_len(store, to, n * size);
1754
+ #else
1755
+ if (n) {
1756
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
1757
+ pstore(store, from);
1758
+ unsigned char* store2 = reinterpret_cast<unsigned char*>(store + offset);
1759
+ unsigned char* to2 = reinterpret_cast<unsigned char*>(to);
1760
+ Index n2 = n * size;
1761
+ if (16 <= n2) {
1762
+ pstoreu(to2, ploadu<Packet16uc>(store2));
1763
+ } else {
1764
+ memcpy((void*)to2, (void*)store2, n2);
1765
+ }
1766
+ }
1767
+ #endif
1768
+ }
1769
+
1770
+ template <>
1771
+ EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset) {
1772
+ pstoreu_partial_common<Packet4f>(to, from, n, offset);
1773
+ }
1774
+ template <>
1775
+ EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset) {
1776
+ pstoreu_partial_common<Packet4i>(to, from, n, offset);
1777
+ }
1778
+ template <>
1779
+ EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int* to, const Packet8s& from, const Index n,
1780
+ const Index offset) {
1781
+ pstoreu_partial_common<Packet8s>(to, from, n, offset);
1782
+ }
1783
+ template <>
1784
+ EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int* to, const Packet8us& from,
1785
+ const Index n, const Index offset) {
1786
+ pstoreu_partial_common<Packet8us>(to, from, n, offset);
1787
+ }
1788
+ template <>
1789
+ EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n,
1790
+ const Index offset) {
1791
+ pstoreu_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n, offset);
1792
+ }
1793
+ template <>
1794
+ EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char* to, const Packet16c& from, const Index n,
1795
+ const Index offset) {
1796
+ pstoreu_partial_common<Packet16c>(to, from, n, offset);
1797
+ }
1798
+ template <>
1799
+ EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n,
1800
+ const Index offset) {
1801
+ pstoreu_partial_common<Packet16uc>(to, from, n, offset);
1802
+ }
1803
+
1804
+ template <>
1805
+ EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
1806
+ EIGEN_PPC_PREFETCH(addr);
1807
+ }
1808
+ template <>
1809
+ EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
1810
+ EIGEN_PPC_PREFETCH(addr);
1811
+ }
1812
+
1813
+ template <>
1814
+ EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
1815
+ EIGEN_ALIGN16 float x;
1816
+ vec_ste(a, 0, &x);
1817
+ return x;
1818
+ }
1819
+ template <>
1820
+ EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
1821
+ EIGEN_ALIGN16 int x;
1822
+ vec_ste(a, 0, &x);
1823
+ return x;
1824
+ }
1825
+
1826
+ template <typename Packet>
1827
+ EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) {
1828
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x;
1829
+ vec_ste(a, 0, &x);
1830
+ return x;
1831
+ }
1832
+
1833
+ template <>
1834
+ EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) {
1835
+ return pfirst_common<Packet8s>(a);
1836
+ }
1837
+
1838
+ template <>
1839
+ EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) {
1840
+ return pfirst_common<Packet8us>(a);
1841
+ }
1842
+
1843
+ template <>
1844
+ EIGEN_STRONG_INLINE signed char pfirst<Packet16c>(const Packet16c& a) {
1845
+ return pfirst_common<Packet16c>(a);
1846
+ }
1847
+
1848
+ template <>
1849
+ EIGEN_STRONG_INLINE unsigned char pfirst<Packet16uc>(const Packet16uc& a) {
1850
+ return pfirst_common<Packet16uc>(a);
1851
+ }
1852
+
1853
+ template <>
1854
+ EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
1855
+ return reinterpret_cast<Packet4f>(
1856
+ vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
1857
+ }
1858
+ template <>
1859
+ EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
1860
+ return reinterpret_cast<Packet4i>(
1861
+ vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
1862
+ }
1863
+ template <>
1864
+ EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) {
1865
+ return reinterpret_cast<Packet8s>(
1866
+ vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
1867
+ }
1868
+ template <>
1869
+ EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
1870
+ return reinterpret_cast<Packet8us>(
1871
+ vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
1872
+ }
1873
+ template <>
1874
+ EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
1875
+ return vec_perm(a, a, p16uc_REVERSE8);
1876
+ }
1877
+ template <>
1878
+ EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
1879
+ return vec_perm(a, a, p16uc_REVERSE8);
1880
+ }
1881
+ template <>
1882
+ EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) {
1883
+ return preverse<Packet8us>(a);
1884
+ }
1885
+
1886
+ template <>
1887
+ EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
1888
+ return vec_abs(a);
1889
+ }
1890
+ template <>
1891
+ EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
1892
+ return vec_abs(a);
1893
+ }
1894
+ template <>
1895
+ EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) {
1896
+ return vec_abs(a);
1897
+ }
1898
+ template <>
1899
+ EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) {
1900
+ return a;
1901
+ }
1902
+ template <>
1903
+ EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) {
1904
+ return vec_abs(a);
1905
+ }
1906
+ template <>
1907
+ EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) {
1908
+ return a;
1909
+ }
1910
+ template <>
1911
+ EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
1912
+ EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask, 0x7FFF);
1913
+ return pand<Packet8us>(p8us_abs_mask, a);
1914
+ }
1915
+
1916
+ template <>
1917
+ EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) {
1918
+ return vec_sra(a.m_val, vec_splat_u16(15));
1919
+ }
1920
+ template <>
1921
+ EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
1922
+ return (Packet4f)vec_sra((Packet4i)a, vec_splats((unsigned int)(31)));
1923
+ }
1924
+
1925
+ template <int N>
1926
+ EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
1927
+ return vec_sra(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
1928
+ }
1929
+ template <int N>
1930
+ EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
1931
+ return vec_sr(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
1932
+ }
1933
+ template <int N>
1934
+ EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
1935
+ return vec_sl(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
1936
+ }
1937
+ template <int N>
1938
+ EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a) {
1939
+ const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1940
+ Packet4ui r = vec_sl(reinterpret_cast<Packet4ui>(a), p4ui_mask);
1941
+ return reinterpret_cast<Packet4f>(r);
1942
+ }
1943
+
1944
+ template <int N>
1945
+ EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a) {
1946
+ const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1947
+ Packet4ui r = vec_sr(reinterpret_cast<Packet4ui>(a), p4ui_mask);
1948
+ return reinterpret_cast<Packet4f>(r);
1949
+ }
1950
+
1951
+ template <int N>
1952
+ EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
1953
+ const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1954
+ return vec_sr(a, p4ui_mask);
1955
+ }
1956
+
1957
+ template <int N>
1958
+ EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
1959
+ const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1960
+ return vec_sl(a, p4ui_mask);
1961
+ }
1962
+
1963
+ template <int N>
1964
+ EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a) {
1965
+ const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
1966
+ return vec_sl(a, p8us_mask);
1967
+ }
1968
+ template <int N>
1969
+ EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a) {
1970
+ const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
1971
+ return vec_sr(a, p8us_mask);
1972
+ }
1973
+
1974
+ EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf) {
1975
+ return plogical_shift_left<16>(reinterpret_cast<Packet4f>(bf.m_val));
1976
+ }
1977
+
1978
+ EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf) {
1979
+ const EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
1980
+ return pand<Packet4f>(reinterpret_cast<Packet4f>(bf.m_val), reinterpret_cast<Packet4f>(p4ui_high_mask));
1981
+ }
1982
+
1983
+ EIGEN_ALWAYS_INLINE Packet8us pmerge(Packet4ui even, Packet4ui odd) {
1984
+ #ifdef _BIG_ENDIAN
1985
+ return vec_perm(reinterpret_cast<Packet8us>(odd), reinterpret_cast<Packet8us>(even), p16uc_MERGEO16);
1986
+ #else
1987
+ return vec_perm(reinterpret_cast<Packet8us>(even), reinterpret_cast<Packet8us>(odd), p16uc_MERGEE16);
1988
+ #endif
1989
+ }
1990
+
1991
+ // Simple interleaving of bool masks, prevents true values from being
1992
+ // converted to NaNs.
1993
+ EIGEN_STRONG_INLINE Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd) {
1994
+ return pmerge(reinterpret_cast<Packet4ui>(even), reinterpret_cast<Packet4ui>(odd));
1995
+ }
1996
+
1997
+ // #define SUPPORT_BF16_SUBNORMALS
1998
+
1999
+ #ifndef __VEC_CLASS_FP_NAN
2000
+ #define __VEC_CLASS_FP_NAN (1 << 6)
2001
+ #endif
2002
+
2003
+ #if defined(SUPPORT_BF16_SUBNORMALS) && !defined(__VEC_CLASS_FP_SUBNORMAL)
2004
+ #define __VEC_CLASS_FP_SUBNORMAL_P (1 << 1)
2005
+ #define __VEC_CLASS_FP_SUBNORMAL_N (1 << 0)
2006
+
2007
+ #define __VEC_CLASS_FP_SUBNORMAL (__VEC_CLASS_FP_SUBNORMAL_P | __VEC_CLASS_FP_SUBNORMAL_N)
2008
+ #endif
2009
+
2010
+ EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f) {
2011
+ #ifdef _ARCH_PWR10
2012
+ return reinterpret_cast<Packet8us>(__builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(p4f)));
2013
+ #else
2014
+ Packet4ui input = reinterpret_cast<Packet4ui>(p4f);
2015
+ Packet4ui lsb = plogical_shift_right<16>(input);
2016
+ lsb = pand<Packet4ui>(lsb, reinterpret_cast<Packet4ui>(p4i_ONE));
2017
+
2018
+ EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS, 0x7FFFu);
2019
+ Packet4ui rounding_bias = padd<Packet4ui>(lsb, p4ui_BIAS);
2020
+ input = padd<Packet4ui>(input, rounding_bias);
2021
+
2022
+ const EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000);
2023
+ #if defined(_ARCH_PWR9) && defined(EIGEN_VECTORIZE_VSX)
2024
+ Packet4bi nan_selector = vec_test_data_class(p4f, __VEC_CLASS_FP_NAN);
2025
+ input = vec_sel(input, p4ui_nan, nan_selector);
2026
+
2027
+ #ifdef SUPPORT_BF16_SUBNORMALS
2028
+ Packet4bi subnormal_selector = vec_test_data_class(p4f, __VEC_CLASS_FP_SUBNORMAL);
2029
+ input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
2030
+ #endif
2031
+ #else
2032
+ #ifdef SUPPORT_BF16_SUBNORMALS
2033
+ // Test NaN and Subnormal
2034
+ const EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000);
2035
+ Packet4ui exp = pand<Packet4ui>(p4ui_exp_mask, reinterpret_cast<Packet4ui>(p4f));
2036
+
2037
+ const EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF);
2038
+ Packet4ui mantissa = pand<Packet4ui>(p4ui_mantissa_mask, reinterpret_cast<Packet4ui>(p4f));
2039
+
2040
+ Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_exp_mask);
2041
+ Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet4ui>(p4i_ZERO));
2042
+
2043
+ Packet4ui nan_selector =
2044
+ pandnot<Packet4ui>(reinterpret_cast<Packet4ui>(is_max_exp), reinterpret_cast<Packet4ui>(is_mant_zero));
2045
+
2046
+ Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet4ui>(p4i_ZERO));
2047
+
2048
+ Packet4ui subnormal_selector =
2049
+ pandnot<Packet4ui>(reinterpret_cast<Packet4ui>(is_zero_exp), reinterpret_cast<Packet4ui>(is_mant_zero));
2050
+
2051
+ input = vec_sel(input, p4ui_nan, nan_selector);
2052
+ input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
2053
+ #else
2054
+ // Test only NaN
2055
+ Packet4bi nan_selector = vec_cmpeq(p4f, p4f);
2056
+
2057
+ input = vec_sel(p4ui_nan, input, nan_selector);
2058
+ #endif
2059
+ #endif
2060
+
2061
+ input = plogical_shift_right<16>(input);
2062
+ return reinterpret_cast<Packet8us>(input);
2063
+ #endif
2064
+ }
2065
+
2066
+ #ifdef _BIG_ENDIAN
2067
+ /**
2068
+ * Pack the high portion of two float Packets into one bfloat16 Packet
2069
+ *
2070
+ * @tparam lohi to expect either a low & high OR odd & even order
2071
+ */
2072
+ template <bool lohi>
2073
+ EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f lo, Packet4f hi) {
2074
+ if (lohi) {
2075
+ return vec_perm(reinterpret_cast<Packet8us>(lo), reinterpret_cast<Packet8us>(hi), p16uc_MERGEH16);
2076
+ } else {
2077
+ return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEE16);
2078
+ }
2079
+ }
2080
+
2081
+ /**
2082
+ * Pack the low portion of two float Packets into one bfloat16 Packet
2083
+ *
2084
+ * @param lohi to expect either a low & high OR odd & even order
2085
+ */
2086
+ template <bool lohi>
2087
+ EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f lo, Packet4f hi) {
2088
+ if (lohi) {
2089
+ return vec_pack(reinterpret_cast<Packet4ui>(lo), reinterpret_cast<Packet4ui>(hi));
2090
+ } else {
2091
+ return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEO16);
2092
+ }
2093
+ }
2094
+ #else
2095
+ template <bool lohi>
2096
+ EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f hi, Packet4f lo) {
2097
+ if (lohi) {
2098
+ return vec_pack(reinterpret_cast<Packet4ui>(hi), reinterpret_cast<Packet4ui>(lo));
2099
+ } else {
2100
+ return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEE16);
2101
+ }
2102
+ }
2103
+
2104
+ template <bool lohi>
2105
+ EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f hi, Packet4f lo) {
2106
+ if (lohi) {
2107
+ return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEL16);
2108
+ } else {
2109
+ return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEO16);
2110
+ }
2111
+ }
2112
+ #endif
2113
+
2114
+ /**
2115
+ * Convert and pack two float Packets into one bfloat16 Packet
2116
+ *
2117
+ * @tparam lohi to expect either a low & high OR odd & even order
2118
+ */
2119
+ template <bool lohi = true>
2120
+ EIGEN_ALWAYS_INLINE Packet8bf F32ToBf16Two(Packet4f lo, Packet4f hi) {
2121
+ Packet8us p4f = Bf16PackHigh<lohi>(lo, hi);
2122
+ Packet8us p4f2 = Bf16PackLow<lohi>(lo, hi);
2123
+
2124
+ Packet8us lsb = pand<Packet8us>(p4f, p8us_ONE);
2125
+ EIGEN_DECLARE_CONST_FAST_Packet8us(BIAS, 0x7FFFu);
2126
+ lsb = padd<Packet8us>(lsb, p8us_BIAS);
2127
+ lsb = padd<Packet8us>(lsb, p4f2);
2128
+
2129
+ Packet8bi rounding_bias = vec_cmplt(lsb, p4f2);
2130
+ Packet8us input = psub<Packet8us>(p4f, reinterpret_cast<Packet8us>(rounding_bias));
2131
+
2132
+ #if defined(_ARCH_PWR9) && defined(EIGEN_VECTORIZE_VSX)
2133
+ Packet4bi nan_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_NAN);
2134
+ Packet4bi nan_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_NAN);
2135
+ Packet8us nan_selector =
2136
+ Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));
2137
+
2138
+ input = vec_sel(input, p8us_BIAS, nan_selector);
2139
+
2140
+ #ifdef SUPPORT_BF16_SUBNORMALS
2141
+ Packet4bi subnormal_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_SUBNORMAL);
2142
+ Packet4bi subnormal_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_SUBNORMAL);
2143
+ Packet8us subnormal_selector = Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(subnormal_selector_lo),
2144
+ reinterpret_cast<Packet4f>(subnormal_selector_hi));
2145
+
2146
+ input = vec_sel(input, reinterpret_cast<Packet8us>(p4f), subnormal_selector);
2147
+ #endif
2148
+ #else
2149
+ #ifdef SUPPORT_BF16_SUBNORMALS
2150
+ // Test NaN and Subnormal
2151
+ const EIGEN_DECLARE_CONST_FAST_Packet8us(exp_mask, 0x7F80);
2152
+ Packet8us exp = pand<Packet8us>(p8us_exp_mask, p4f);
2153
+
2154
+ const EIGEN_DECLARE_CONST_FAST_Packet8us(mantissa_mask, 0x7Fu);
2155
+ Packet8us mantissa = pand<Packet8us>(p8us_mantissa_mask, p4f);
2156
+
2157
+ Packet8bi is_max_exp = vec_cmpeq(exp, p8us_exp_mask);
2158
+ Packet8bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet8us>(p4i_ZERO));
2159
+
2160
+ Packet8us nan_selector =
2161
+ pandnot<Packet8us>(reinterpret_cast<Packet8us>(is_max_exp), reinterpret_cast<Packet8us>(is_mant_zero));
2162
+
2163
+ Packet8bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet8us>(p4i_ZERO));
2164
+
2165
+ Packet8us subnormal_selector =
2166
+ pandnot<Packet8us>(reinterpret_cast<Packet8us>(is_zero_exp), reinterpret_cast<Packet8us>(is_mant_zero));
2167
+
2168
+ // Using BIAS as NaN (since any or all of the last 7 bits can be set)
2169
+ input = vec_sel(input, p8us_BIAS, nan_selector);
2170
+ input = vec_sel(input, reinterpret_cast<Packet8us>(p4f), subnormal_selector);
2171
+ #else
2172
+ // Test only NaN
2173
+ Packet4bi nan_selector_lo = vec_cmpeq(lo, lo);
2174
+ Packet4bi nan_selector_hi = vec_cmpeq(hi, hi);
2175
+ Packet8us nan_selector =
2176
+ Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));
2177
+
2178
+ input = vec_sel(p8us_BIAS, input, nan_selector);
2179
+ #endif
2180
+ #endif
2181
+
2182
+ return input;
2183
+ }
2184
+
2185
+ /**
2186
+ * Convert and pack two float Packets into one bfloat16 Packet - low & high order
2187
+ */
2188
+ EIGEN_STRONG_INLINE Packet8bf F32ToBf16Both(Packet4f lo, Packet4f hi) {
2189
+ #ifdef _ARCH_PWR10
2190
+ Packet8bf fp16_0 = F32ToBf16(lo);
2191
+ Packet8bf fp16_1 = F32ToBf16(hi);
2192
+ return vec_pack(reinterpret_cast<Packet4ui>(fp16_0.m_val), reinterpret_cast<Packet4ui>(fp16_1.m_val));
2193
+ #else
2194
+ return F32ToBf16Two(lo, hi);
2195
+ #endif
2196
+ }
2197
+
2198
+ /**
2199
+ * Convert and pack two float Packets into one bfloat16 Packet - odd & even order
2200
+ */
2201
+ EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd) {
2202
+ #ifdef _ARCH_PWR10
2203
+ return pmerge(reinterpret_cast<Packet4ui>(F32ToBf16(even).m_val), reinterpret_cast<Packet4ui>(F32ToBf16(odd).m_val));
2204
+ #else
2205
+ return F32ToBf16Two<false>(even, odd);
2206
+ #endif
2207
+ }
2208
+ #define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A) \
2209
+ Packet4f a_even = Bf16ToF32Even(A); \
2210
+ Packet4f a_odd = Bf16ToF32Odd(A); \
2211
+ Packet4f op_even = OP(a_even); \
2212
+ Packet4f op_odd = OP(a_odd); \
2213
+ return F32ToBf16(op_even, op_odd);
2214
+
2215
+ #define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \
2216
+ Packet4f a_even = Bf16ToF32Even(A); \
2217
+ Packet4f a_odd = Bf16ToF32Odd(A); \
2218
+ Packet4f b_even = Bf16ToF32Even(B); \
2219
+ Packet4f b_odd = Bf16ToF32Odd(B); \
2220
+ Packet4f op_even = OP(a_even, b_even); \
2221
+ Packet4f op_odd = OP(a_odd, b_odd); \
2222
+ return F32ToBf16(op_even, op_odd);
2223
+
2224
+ #define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \
2225
+ Packet4f a_even = Bf16ToF32Even(A); \
2226
+ Packet4f a_odd = Bf16ToF32Odd(A); \
2227
+ Packet4f b_even = Bf16ToF32Even(B); \
2228
+ Packet4f b_odd = Bf16ToF32Odd(B); \
2229
+ Packet4f op_even = OP(a_even, b_even); \
2230
+ Packet4f op_odd = OP(a_odd, b_odd); \
2231
+ return F32ToBf16Bool(op_even, op_odd);
2232
+
2233
+ template <>
2234
+ EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2235
+ BF16_TO_F32_BINARY_OP_WRAPPER(padd<Packet4f>, a, b);
2236
+ }
2237
+
2238
+ template <>
2239
+ EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2240
+ BF16_TO_F32_BINARY_OP_WRAPPER(pmul<Packet4f>, a, b);
2241
+ }
2242
+
2243
+ template <>
2244
+ EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2245
+ BF16_TO_F32_BINARY_OP_WRAPPER(pdiv<Packet4f>, a, b);
2246
+ }
2247
+
2248
+ template <>
2249
+ EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(const Packet8bf& a) {
2250
+ EIGEN_DECLARE_CONST_FAST_Packet8us(neg_mask, 0x8000);
2251
+ return pxor<Packet8us>(p8us_neg_mask, a);
2252
+ }
2253
+
2254
+ template <>
2255
+ EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2256
+ BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b);
2257
+ }
2258
+
2259
+ template <>
2260
+ EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf>(const Packet8bf& a) {
2261
+ BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
2262
+ }
2263
+
2264
+ template <>
2265
+ EIGEN_STRONG_INLINE Packet8bf pexp2<Packet8bf>(const Packet8bf& a) {
2266
+ BF16_TO_F32_UNARY_OP_WRAPPER(generic_exp2, a);
2267
+ }
2268
+
2269
+ template <>
2270
+ EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
2271
+ return pldexp_generic(a, exponent);
2272
+ }
2273
+ template <>
2274
+ EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf>(const Packet8bf& a, const Packet8bf& exponent) {
2275
+ BF16_TO_F32_BINARY_OP_WRAPPER(pldexp<Packet4f>, a, exponent);
2276
+ }
2277
+
2278
+ template <>
2279
+ EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
2280
+ return pfrexp_generic(a, exponent);
2281
+ }
2282
+ template <>
2283
+ EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf>(const Packet8bf& a, Packet8bf& e) {
2284
+ Packet4f a_even = Bf16ToF32Even(a);
2285
+ Packet4f a_odd = Bf16ToF32Odd(a);
2286
+ Packet4f e_even;
2287
+ Packet4f e_odd;
2288
+ Packet4f op_even = pfrexp<Packet4f>(a_even, e_even);
2289
+ Packet4f op_odd = pfrexp<Packet4f>(a_odd, e_odd);
2290
+ e = F32ToBf16(e_even, e_odd);
2291
+ return F32ToBf16(op_even, op_odd);
2292
+ }
2293
+
2294
+ template <>
2295
+ EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf>(const Packet8bf& a) {
2296
+ BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a);
2297
+ }
2298
+ template <>
2299
+ EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf>(const Packet8bf& a) {
2300
+ BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a);
2301
+ }
2302
+ template <>
2303
+ EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf>(const Packet8bf& a) {
2304
+ BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a);
2305
+ }
2306
+ template <>
2307
+ EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf>(const Packet8bf& a) {
2308
+ BF16_TO_F32_UNARY_OP_WRAPPER(pfloor<Packet4f>, a);
2309
+ }
2310
+ template <>
2311
+ EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf>(const Packet8bf& a) {
2312
+ BF16_TO_F32_UNARY_OP_WRAPPER(pceil<Packet4f>, a);
2313
+ }
2314
+ template <>
2315
+ EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf>(const Packet8bf& a) {
2316
+ BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
2317
+ }
2318
+ template <>
2319
+ EIGEN_STRONG_INLINE Packet8bf ptrunc<Packet8bf>(const Packet8bf& a) {
2320
+ BF16_TO_F32_UNARY_OP_WRAPPER(ptrunc<Packet4f>, a);
2321
+ }
2322
+ #ifdef EIGEN_VECTORIZE_VSX
2323
+ template <>
2324
+ EIGEN_STRONG_INLINE Packet8bf print<Packet8bf>(const Packet8bf& a) {
2325
+ BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
2326
+ }
2327
+ #endif
2328
+ template <>
2329
+ EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2330
+ Packet4f a_even = Bf16ToF32Even(a);
2331
+ Packet4f a_odd = Bf16ToF32Odd(a);
2332
+ Packet4f b_even = Bf16ToF32Even(b);
2333
+ Packet4f b_odd = Bf16ToF32Odd(b);
2334
+ Packet4f c_even = Bf16ToF32Even(c);
2335
+ Packet4f c_odd = Bf16ToF32Odd(c);
2336
+ Packet4f pmadd_even = pmadd<Packet4f>(a_even, b_even, c_even);
2337
+ Packet4f pmadd_odd = pmadd<Packet4f>(a_odd, b_odd, c_odd);
2338
+ return F32ToBf16(pmadd_even, pmadd_odd);
2339
+ }
2340
+
2341
+ template <>
2342
+ EIGEN_STRONG_INLINE Packet8bf pmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2343
+ Packet4f a_even = Bf16ToF32Even(a);
2344
+ Packet4f a_odd = Bf16ToF32Odd(a);
2345
+ Packet4f b_even = Bf16ToF32Even(b);
2346
+ Packet4f b_odd = Bf16ToF32Odd(b);
2347
+ Packet4f c_even = Bf16ToF32Even(c);
2348
+ Packet4f c_odd = Bf16ToF32Odd(c);
2349
+ Packet4f pmadd_even = pmsub<Packet4f>(a_even, b_even, c_even);
2350
+ Packet4f pmadd_odd = pmsub<Packet4f>(a_odd, b_odd, c_odd);
2351
+ return F32ToBf16(pmadd_even, pmadd_odd);
2352
+ }
2353
+ template <>
2354
+ EIGEN_STRONG_INLINE Packet8bf pnmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2355
+ Packet4f a_even = Bf16ToF32Even(a);
2356
+ Packet4f a_odd = Bf16ToF32Odd(a);
2357
+ Packet4f b_even = Bf16ToF32Even(b);
2358
+ Packet4f b_odd = Bf16ToF32Odd(b);
2359
+ Packet4f c_even = Bf16ToF32Even(c);
2360
+ Packet4f c_odd = Bf16ToF32Odd(c);
2361
+ Packet4f pmadd_even = pnmadd<Packet4f>(a_even, b_even, c_even);
2362
+ Packet4f pmadd_odd = pnmadd<Packet4f>(a_odd, b_odd, c_odd);
2363
+ return F32ToBf16(pmadd_even, pmadd_odd);
2364
+ }
2365
+
2366
+ template <>
2367
+ EIGEN_STRONG_INLINE Packet8bf pnmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2368
+ Packet4f a_even = Bf16ToF32Even(a);
2369
+ Packet4f a_odd = Bf16ToF32Odd(a);
2370
+ Packet4f b_even = Bf16ToF32Even(b);
2371
+ Packet4f b_odd = Bf16ToF32Odd(b);
2372
+ Packet4f c_even = Bf16ToF32Even(c);
2373
+ Packet4f c_odd = Bf16ToF32Odd(c);
2374
+ Packet4f pmadd_even = pnmsub<Packet4f>(a_even, b_even, c_even);
2375
+ Packet4f pmadd_odd = pnmsub<Packet4f>(a_odd, b_odd, c_odd);
2376
+ return F32ToBf16(pmadd_even, pmadd_odd);
2377
+ }
2378
+
2379
+ template <>
2380
+ EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2381
+ BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b);
2382
+ }
2383
+
2384
+ template <>
2385
+ EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2386
+ BF16_TO_F32_BINARY_OP_WRAPPER(pmax<Packet4f>, a, b);
2387
+ }
2388
+
2389
+ template <>
2390
+ EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
2391
+ BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt<Packet4f>, a, b);
2392
+ }
2393
+ template <>
2394
+ EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
2395
+ BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan<Packet4f>, a, b);
2396
+ }
2397
+ template <>
2398
+ EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
2399
+ BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le<Packet4f>, a, b);
2400
+ }
2401
+ template <>
2402
+ EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
2403
+ BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_eq<Packet4f>, a, b);
2404
+ }
2405
+
2406
+ template <>
2407
+ EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) {
2408
+ return Eigen::bfloat16_impl::raw_uint16_to_bfloat16((pfirst<Packet8us>(a)));
2409
+ }
2410
+
2411
+ template <>
2412
+ EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from) {
2413
+ return ploaddup<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
2414
+ }
2415
+
2416
+ template <>
2417
+ EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
2418
+ bfloat16 countdown[8] = {bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3),
2419
+ bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7)};
2420
+ return padd<Packet8bf>(pset1<Packet8bf>(a), pload<Packet8bf>(countdown));
2421
+ }
2422
+
2423
+ template <>
2424
+ EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
2425
+ Packet4f b, sum;
2426
+ b = vec_sld(a, a, 8);
2427
+ sum = a + b;
2428
+ b = vec_sld(sum, sum, 4);
2429
+ sum += b;
2430
+ return pfirst(sum);
2431
+ }
2432
+
2433
+ template <>
2434
+ EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
2435
+ Packet4i b, sum;
2436
+ b = vec_sld(a, a, 8);
2437
+ sum = a + b;
2438
+ b = vec_sld(sum, sum, 4);
2439
+ sum += b;
2440
+ return pfirst(sum);
2441
+ }
2442
+
2443
+ template <>
2444
+ EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) {
2445
+ float redux_even = predux<Packet4f>(Bf16ToF32Even(a));
2446
+ float redux_odd = predux<Packet4f>(Bf16ToF32Odd(a));
2447
+ float f32_result = redux_even + redux_odd;
2448
+ return bfloat16(f32_result);
2449
+ }
2450
+ template <typename Packet>
2451
+ EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a) {
2452
+ union {
2453
+ Packet v;
2454
+ __UNPACK_TYPE__(Packet) n[8];
2455
+ } vt;
2456
+ vt.v = a;
2457
+
2458
+ EIGEN_ALIGN16 int first_loader[4] = {vt.n[0], vt.n[1], vt.n[2], vt.n[3]};
2459
+ EIGEN_ALIGN16 int second_loader[4] = {vt.n[4], vt.n[5], vt.n[6], vt.n[7]};
2460
+ Packet4i first_half = pload<Packet4i>(first_loader);
2461
+ Packet4i second_half = pload<Packet4i>(second_loader);
2462
+
2463
+ return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_half) + predux(second_half));
2464
+ }
2465
+
2466
+ template <>
2467
+ EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a) {
2468
+ return predux_size8<Packet8s>(a);
2469
+ }
2470
+
2471
+ template <>
2472
+ EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a) {
2473
+ return predux_size8<Packet8us>(a);
2474
+ }
2475
+
2476
+ template <typename Packet>
2477
+ EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a) {
2478
+ union {
2479
+ Packet v;
2480
+ __UNPACK_TYPE__(Packet) n[16];
2481
+ } vt;
2482
+ vt.v = a;
2483
+
2484
+ EIGEN_ALIGN16 int first_loader[4] = {vt.n[0], vt.n[1], vt.n[2], vt.n[3]};
2485
+ EIGEN_ALIGN16 int second_loader[4] = {vt.n[4], vt.n[5], vt.n[6], vt.n[7]};
2486
+ EIGEN_ALIGN16 int third_loader[4] = {vt.n[8], vt.n[9], vt.n[10], vt.n[11]};
2487
+ EIGEN_ALIGN16 int fourth_loader[4] = {vt.n[12], vt.n[13], vt.n[14], vt.n[15]};
2488
+
2489
+ Packet4i first_quarter = pload<Packet4i>(first_loader);
2490
+ Packet4i second_quarter = pload<Packet4i>(second_loader);
2491
+ Packet4i third_quarter = pload<Packet4i>(third_loader);
2492
+ Packet4i fourth_quarter = pload<Packet4i>(fourth_loader);
2493
+
2494
+ return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter) + predux(third_quarter) +
2495
+ predux(fourth_quarter));
2496
+ }
2497
+
2498
+ template <>
2499
+ EIGEN_STRONG_INLINE signed char predux<Packet16c>(const Packet16c& a) {
2500
+ return predux_size16<Packet16c>(a);
2501
+ }
2502
+
2503
+ template <>
2504
+ EIGEN_STRONG_INLINE unsigned char predux<Packet16uc>(const Packet16uc& a) {
2505
+ return predux_size16<Packet16uc>(a);
2506
+ }
2507
+
2508
+ // Other reduction functions:
2509
+ // mul
2510
+ template <>
2511
+ EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
2512
+ Packet4f prod;
2513
+ prod = pmul(a, vec_sld(a, a, 8));
2514
+ return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
2515
+ }
2516
+
2517
+ template <>
2518
+ EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
2519
+ EIGEN_ALIGN16 int aux[4];
2520
+ pstore(aux, a);
2521
+ return aux[0] * aux[1] * aux[2] * aux[3];
2522
+ }
2523
+
2524
+ template <>
2525
+ EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a) {
2526
+ Packet8s pair, quad, octo;
2527
+
2528
+ pair = vec_mul(a, vec_sld(a, a, 8));
2529
+ quad = vec_mul(pair, vec_sld(pair, pair, 4));
2530
+ octo = vec_mul(quad, vec_sld(quad, quad, 2));
2531
+
2532
+ return pfirst(octo);
2533
+ }
2534
+
2535
+ template <>
2536
+ EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a) {
2537
+ Packet8us pair, quad, octo;
2538
+
2539
+ pair = vec_mul(a, vec_sld(a, a, 8));
2540
+ quad = vec_mul(pair, vec_sld(pair, pair, 4));
2541
+ octo = vec_mul(quad, vec_sld(quad, quad, 2));
2542
+
2543
+ return pfirst(octo);
2544
+ }
2545
+
2546
+ template <>
2547
+ EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) {
2548
+ float redux_even = predux_mul<Packet4f>(Bf16ToF32Even(a));
2549
+ float redux_odd = predux_mul<Packet4f>(Bf16ToF32Odd(a));
2550
+ float f32_result = redux_even * redux_odd;
2551
+ return bfloat16(f32_result);
2552
+ }
2553
+
2554
+ template <>
2555
+ EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c& a) {
2556
+ Packet16c pair, quad, octo, result;
2557
+
2558
+ pair = vec_mul(a, vec_sld(a, a, 8));
2559
+ quad = vec_mul(pair, vec_sld(pair, pair, 4));
2560
+ octo = vec_mul(quad, vec_sld(quad, quad, 2));
2561
+ result = vec_mul(octo, vec_sld(octo, octo, 1));
2562
+
2563
+ return pfirst(result);
2564
+ }
2565
+
2566
+ template <>
2567
+ EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet16uc& a) {
2568
+ Packet16uc pair, quad, octo, result;
2569
+
2570
+ pair = vec_mul(a, vec_sld(a, a, 8));
2571
+ quad = vec_mul(pair, vec_sld(pair, pair, 4));
2572
+ octo = vec_mul(quad, vec_sld(quad, quad, 2));
2573
+ result = vec_mul(octo, vec_sld(octo, octo, 1));
2574
+
2575
+ return pfirst(result);
2576
+ }
2577
+
2578
+ // min
2579
+ template <typename Packet>
2580
+ EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_min4(const Packet& a) {
2581
+ Packet b, res;
2582
+ b = vec_min(a, vec_sld(a, a, 8));
2583
+ res = vec_min(b, vec_sld(b, b, 4));
2584
+ return pfirst(res);
2585
+ }
2586
+
2587
+ template <>
2588
+ EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
2589
+ return predux_min4<Packet4f>(a);
2590
+ }
2591
+
2592
+ template <>
2593
+ EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
2594
+ return predux_min4<Packet4i>(a);
2595
+ }
2596
+
2597
+ template <>
2598
+ EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) {
2599
+ float redux_even = predux_min<Packet4f>(Bf16ToF32Even(a));
2600
+ float redux_odd = predux_min<Packet4f>(Bf16ToF32Odd(a));
2601
+ float f32_result = (std::min)(redux_even, redux_odd);
2602
+ return bfloat16(f32_result);
2603
+ }
2604
+
2605
+ template <>
2606
+ EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a) {
2607
+ Packet8s pair, quad, octo;
2608
+
2609
+ // pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
2610
+ pair = vec_min(a, vec_sld(a, a, 8));
2611
+
2612
+ // quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
2613
+ quad = vec_min(pair, vec_sld(pair, pair, 4));
2614
+
2615
+ // octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
2616
+ octo = vec_min(quad, vec_sld(quad, quad, 2));
2617
+ return pfirst(octo);
2618
+ }
2619
+
2620
+ template <>
2621
+ EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a) {
2622
+ Packet8us pair, quad, octo;
2623
+
2624
+ // pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
2625
+ pair = vec_min(a, vec_sld(a, a, 8));
2626
+
2627
+ // quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
2628
+ quad = vec_min(pair, vec_sld(pair, pair, 4));
2629
+
2630
+ // octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
2631
+ octo = vec_min(quad, vec_sld(quad, quad, 2));
2632
+ return pfirst(octo);
481
2633
  }
482
2634
 
483
- #ifdef _BIG_ENDIAN
484
- template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
485
- {
486
- EIGEN_DEBUG_UNALIGNED_STORE
487
- // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
488
- // Warning: not thread safe!
489
- Packet16uc MSQ, LSQ, edges;
490
- Packet16uc edgeAlign, align;
2635
+ template <>
2636
+ EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c& a) {
2637
+ Packet16c pair, quad, octo, result;
491
2638
 
492
- MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword
493
- LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword
494
- edgeAlign = vec_lvsl(0, to); // permute map to extract edges
495
- edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges
496
- align = vec_lvsr( 0, to ); // permute map to misalign data
497
- MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ)
498
- LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ)
499
- vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
500
- vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
501
- }
502
- template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
503
- {
504
- EIGEN_DEBUG_UNALIGNED_STORE
505
- // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
506
- // Warning: not thread safe!
507
- Packet16uc MSQ, LSQ, edges;
508
- Packet16uc edgeAlign, align;
2639
+ pair = vec_min(a, vec_sld(a, a, 8));
2640
+ quad = vec_min(pair, vec_sld(pair, pair, 4));
2641
+ octo = vec_min(quad, vec_sld(quad, quad, 2));
2642
+ result = vec_min(octo, vec_sld(octo, octo, 1));
509
2643
 
510
- MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword
511
- LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword
512
- edgeAlign = vec_lvsl(0, to); // permute map to extract edges
513
- edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges
514
- align = vec_lvsr( 0, to ); // permute map to misalign data
515
- MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ)
516
- LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ)
517
- vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
518
- vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
2644
+ return pfirst(result);
519
2645
  }
520
- #else
521
- // We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
522
- template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
523
- {
524
- EIGEN_DEBUG_ALIGNED_STORE
525
- vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to));
2646
+
2647
+ template <>
2648
+ EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet16uc& a) {
2649
+ Packet16uc pair, quad, octo, result;
2650
+
2651
+ pair = vec_min(a, vec_sld(a, a, 8));
2652
+ quad = vec_min(pair, vec_sld(pair, pair, 4));
2653
+ octo = vec_min(quad, vec_sld(quad, quad, 2));
2654
+ result = vec_min(octo, vec_sld(octo, octo, 1));
2655
+
2656
+ return pfirst(result);
526
2657
  }
527
- template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
528
- {
529
- EIGEN_DEBUG_ALIGNED_STORE
530
- vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
2658
+ // max
2659
+ template <typename Packet>
2660
+ EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a) {
2661
+ Packet b, res;
2662
+ b = vec_max(a, vec_sld(a, a, 8));
2663
+ res = vec_max(b, vec_sld(b, b, 4));
2664
+ return pfirst(res);
531
2665
  }
532
- #endif
533
2666
 
534
- template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); }
535
- template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); }
2667
+ template <>
2668
+ EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
2669
+ return predux_max4<Packet4f>(a);
2670
+ }
536
2671
 
537
- template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
538
- template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
2672
+ template <>
2673
+ EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
2674
+ return predux_max4<Packet4i>(a);
2675
+ }
539
2676
 
540
- template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
541
- {
542
- return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
2677
+ template <>
2678
+ EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) {
2679
+ float redux_even = predux_max<Packet4f>(Bf16ToF32Even(a));
2680
+ float redux_odd = predux_max<Packet4f>(Bf16ToF32Odd(a));
2681
+ float f32_result = (std::max)(redux_even, redux_odd);
2682
+ return bfloat16(f32_result);
543
2683
  }
544
- template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
545
- {
546
- return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32)); }
547
2684
 
548
- template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
549
- template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
2685
+ template <>
2686
+ EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a) {
2687
+ Packet8s pair, quad, octo;
550
2688
 
551
- template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
552
- {
553
- Packet4f b, sum;
554
- b = vec_sld(a, a, 8);
555
- sum = a + b;
556
- b = vec_sld(sum, sum, 4);
557
- sum += b;
558
- return pfirst(sum);
559
- }
2689
+ // pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
2690
+ pair = vec_max(a, vec_sld(a, a, 8));
560
2691
 
561
- template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
562
- {
563
- Packet4f v[4], sum[4];
564
-
565
- // It's easier and faster to transpose then add as columns
566
- // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
567
- // Do the transpose, first set of moves
568
- v[0] = vec_mergeh(vecs[0], vecs[2]);
569
- v[1] = vec_mergel(vecs[0], vecs[2]);
570
- v[2] = vec_mergeh(vecs[1], vecs[3]);
571
- v[3] = vec_mergel(vecs[1], vecs[3]);
572
- // Get the resulting vectors
573
- sum[0] = vec_mergeh(v[0], v[2]);
574
- sum[1] = vec_mergel(v[0], v[2]);
575
- sum[2] = vec_mergeh(v[1], v[3]);
576
- sum[3] = vec_mergel(v[1], v[3]);
577
-
578
- // Now do the summation:
579
- // Lines 0+1
580
- sum[0] = sum[0] + sum[1];
581
- // Lines 2+3
582
- sum[1] = sum[2] + sum[3];
583
- // Add the results
584
- sum[0] = sum[0] + sum[1];
585
-
586
- return sum[0];
587
- }
588
-
589
- template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
590
- {
591
- Packet4i sum;
592
- sum = vec_sums(a, p4i_ZERO);
593
- #ifdef _BIG_ENDIAN
594
- sum = vec_sld(sum, p4i_ZERO, 12);
595
- #else
596
- sum = vec_sld(p4i_ZERO, sum, 4);
597
- #endif
598
- return pfirst(sum);
2692
+ // quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
2693
+ quad = vec_max(pair, vec_sld(pair, pair, 4));
2694
+
2695
+ // octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
2696
+ octo = vec_max(quad, vec_sld(quad, quad, 2));
2697
+ return pfirst(octo);
599
2698
  }
600
2699
 
601
- template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
602
- {
603
- Packet4i v[4], sum[4];
2700
+ template <>
2701
+ EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a) {
2702
+ Packet8us pair, quad, octo;
604
2703
 
605
- // It's easier and faster to transpose then add as columns
606
- // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
607
- // Do the transpose, first set of moves
608
- v[0] = vec_mergeh(vecs[0], vecs[2]);
609
- v[1] = vec_mergel(vecs[0], vecs[2]);
610
- v[2] = vec_mergeh(vecs[1], vecs[3]);
611
- v[3] = vec_mergel(vecs[1], vecs[3]);
612
- // Get the resulting vectors
613
- sum[0] = vec_mergeh(v[0], v[2]);
614
- sum[1] = vec_mergel(v[0], v[2]);
615
- sum[2] = vec_mergeh(v[1], v[3]);
616
- sum[3] = vec_mergel(v[1], v[3]);
2704
+ // pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
2705
+ pair = vec_max(a, vec_sld(a, a, 8));
617
2706
 
618
- // Now do the summation:
619
- // Lines 0+1
620
- sum[0] = sum[0] + sum[1];
621
- // Lines 2+3
622
- sum[1] = sum[2] + sum[3];
623
- // Add the results
624
- sum[0] = sum[0] + sum[1];
2707
+ // quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
2708
+ quad = vec_max(pair, vec_sld(pair, pair, 4));
625
2709
 
626
- return sum[0];
2710
+ // octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
2711
+ octo = vec_max(quad, vec_sld(quad, quad, 2));
2712
+ return pfirst(octo);
627
2713
  }
628
2714
 
629
- // Other reduction functions:
630
- // mul
631
- template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
632
- {
633
- Packet4f prod;
634
- prod = pmul(a, vec_sld(a, a, 8));
635
- return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
2715
+ template <>
2716
+ EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c& a) {
2717
+ Packet16c pair, quad, octo, result;
2718
+
2719
+ pair = vec_max(a, vec_sld(a, a, 8));
2720
+ quad = vec_max(pair, vec_sld(pair, pair, 4));
2721
+ octo = vec_max(quad, vec_sld(quad, quad, 2));
2722
+ result = vec_max(octo, vec_sld(octo, octo, 1));
2723
+
2724
+ return pfirst(result);
636
2725
  }
637
2726
 
638
- template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
639
- {
640
- EIGEN_ALIGN16 int aux[4];
641
- pstore(aux, a);
642
- return aux[0] * aux[1] * aux[2] * aux[3];
2727
+ template <>
2728
+ EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet16uc& a) {
2729
+ Packet16uc pair, quad, octo, result;
2730
+
2731
+ pair = vec_max(a, vec_sld(a, a, 8));
2732
+ quad = vec_max(pair, vec_sld(pair, pair, 4));
2733
+ octo = vec_max(quad, vec_sld(quad, quad, 2));
2734
+ result = vec_max(octo, vec_sld(octo, octo, 1));
2735
+
2736
+ return pfirst(result);
643
2737
  }
644
2738
 
645
- // min
646
- template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
647
- {
648
- Packet4f b, res;
649
- b = vec_min(a, vec_sld(a, a, 8));
650
- res = vec_min(b, vec_sld(b, b, 4));
651
- return pfirst(res);
2739
+ template <>
2740
+ EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
2741
+ return vec_any_ne(x, pzero(x));
652
2742
  }
653
2743
 
654
- template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
655
- {
656
- Packet4i b, res;
657
- b = vec_min(a, vec_sld(a, a, 8));
658
- res = vec_min(b, vec_sld(b, b, 4));
659
- return pfirst(res);
2744
+ template <typename T>
2745
+ EIGEN_DEVICE_FUNC inline void ptranpose_common(PacketBlock<T, 4>& kernel) {
2746
+ T t0, t1, t2, t3;
2747
+ t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2748
+ t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2749
+ t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2750
+ t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2751
+ kernel.packet[0] = vec_mergeh(t0, t2);
2752
+ kernel.packet[1] = vec_mergel(t0, t2);
2753
+ kernel.packet[2] = vec_mergeh(t1, t3);
2754
+ kernel.packet[3] = vec_mergel(t1, t3);
660
2755
  }
661
2756
 
662
- // max
663
- template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
664
- {
665
- Packet4f b, res;
666
- b = vec_max(a, vec_sld(a, a, 8));
667
- res = vec_max(b, vec_sld(b, b, 4));
668
- return pfirst(res);
2757
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) { ptranpose_common<Packet4f>(kernel); }
2758
+
2759
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) { ptranpose_common<Packet4i>(kernel); }
2760
+
2761
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
2762
+ Packet8s t0, t1, t2, t3;
2763
+ t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2764
+ t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2765
+ t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2766
+ t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2767
+ kernel.packet[0] = vec_mergeh(t0, t2);
2768
+ kernel.packet[1] = vec_mergel(t0, t2);
2769
+ kernel.packet[2] = vec_mergeh(t1, t3);
2770
+ kernel.packet[3] = vec_mergel(t1, t3);
669
2771
  }
670
2772
 
671
- template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
672
- {
673
- Packet4i b, res;
674
- b = vec_max(a, vec_sld(a, a, 8));
675
- res = vec_max(b, vec_sld(b, b, 4));
676
- return pfirst(res);
2773
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
2774
+ Packet8us t0, t1, t2, t3;
2775
+ t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2776
+ t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2777
+ t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2778
+ t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2779
+ kernel.packet[0] = vec_mergeh(t0, t2);
2780
+ kernel.packet[1] = vec_mergel(t0, t2);
2781
+ kernel.packet[2] = vec_mergeh(t1, t3);
2782
+ kernel.packet[3] = vec_mergel(t1, t3);
677
2783
  }
678
2784
 
679
- template<int Offset>
680
- struct palign_impl<Offset,Packet4f>
681
- {
682
- static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
683
- {
684
- #ifdef _BIG_ENDIAN
685
- switch (Offset % 4) {
686
- case 1:
687
- first = vec_sld(first, second, 4); break;
688
- case 2:
689
- first = vec_sld(first, second, 8); break;
690
- case 3:
691
- first = vec_sld(first, second, 12); break;
692
- }
693
- #else
694
- switch (Offset % 4) {
695
- case 1:
696
- first = vec_sld(second, first, 12); break;
697
- case 2:
698
- first = vec_sld(second, first, 8); break;
699
- case 3:
700
- first = vec_sld(second, first, 4); break;
701
- }
702
- #endif
703
- }
704
- };
2785
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8bf, 4>& kernel) {
2786
+ Packet8us t0, t1, t2, t3;
705
2787
 
706
- template<int Offset>
707
- struct palign_impl<Offset,Packet4i>
708
- {
709
- static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
710
- {
711
- #ifdef _BIG_ENDIAN
712
- switch (Offset % 4) {
713
- case 1:
714
- first = vec_sld(first, second, 4); break;
715
- case 2:
716
- first = vec_sld(first, second, 8); break;
717
- case 3:
718
- first = vec_sld(first, second, 12); break;
719
- }
720
- #else
721
- switch (Offset % 4) {
722
- case 1:
723
- first = vec_sld(second, first, 12); break;
724
- case 2:
725
- first = vec_sld(second, first, 8); break;
726
- case 3:
727
- first = vec_sld(second, first, 4); break;
728
- }
729
- #endif
730
- }
731
- };
2788
+ t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val);
2789
+ t1 = vec_mergel(kernel.packet[0].m_val, kernel.packet[2].m_val);
2790
+ t2 = vec_mergeh(kernel.packet[1].m_val, kernel.packet[3].m_val);
2791
+ t3 = vec_mergel(kernel.packet[1].m_val, kernel.packet[3].m_val);
2792
+ kernel.packet[0] = vec_mergeh(t0, t2);
2793
+ kernel.packet[1] = vec_mergel(t0, t2);
2794
+ kernel.packet[2] = vec_mergeh(t1, t3);
2795
+ kernel.packet[3] = vec_mergel(t1, t3);
2796
+ }
732
2797
 
733
- EIGEN_DEVICE_FUNC inline void
734
- ptranspose(PacketBlock<Packet4f,4>& kernel) {
735
- Packet4f t0, t1, t2, t3;
2798
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
2799
+ Packet16c t0, t1, t2, t3;
736
2800
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
737
2801
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
738
2802
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
@@ -743,9 +2807,8 @@ ptranspose(PacketBlock<Packet4f,4>& kernel) {
743
2807
  kernel.packet[3] = vec_mergel(t1, t3);
744
2808
  }
745
2809
 
746
- EIGEN_DEVICE_FUNC inline void
747
- ptranspose(PacketBlock<Packet4i,4>& kernel) {
748
- Packet4i t0, t1, t2, t3;
2810
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
2811
+ Packet16uc t0, t1, t2, t3;
749
2812
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
750
2813
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
751
2814
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
@@ -756,91 +2819,407 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
756
2819
  kernel.packet[3] = vec_mergel(t1, t3);
757
2820
  }
758
2821
 
759
- template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
760
- Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
761
- Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
2822
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
2823
+ Packet8s v[8], sum[8];
2824
+
2825
+ v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
2826
+ v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
2827
+ v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
2828
+ v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
2829
+ v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
2830
+ v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
2831
+ v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
2832
+ v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
2833
+ sum[0] = vec_mergeh(v[0], v[4]);
2834
+ sum[1] = vec_mergel(v[0], v[4]);
2835
+ sum[2] = vec_mergeh(v[1], v[5]);
2836
+ sum[3] = vec_mergel(v[1], v[5]);
2837
+ sum[4] = vec_mergeh(v[2], v[6]);
2838
+ sum[5] = vec_mergel(v[2], v[6]);
2839
+ sum[6] = vec_mergeh(v[3], v[7]);
2840
+ sum[7] = vec_mergel(v[3], v[7]);
2841
+
2842
+ kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
2843
+ kernel.packet[1] = vec_mergel(sum[0], sum[4]);
2844
+ kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
2845
+ kernel.packet[3] = vec_mergel(sum[1], sum[5]);
2846
+ kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
2847
+ kernel.packet[5] = vec_mergel(sum[2], sum[6]);
2848
+ kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
2849
+ kernel.packet[7] = vec_mergel(sum[3], sum[7]);
2850
+ }
2851
+
2852
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
2853
+ Packet8us v[8], sum[8];
2854
+
2855
+ v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
2856
+ v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
2857
+ v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
2858
+ v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
2859
+ v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
2860
+ v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
2861
+ v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
2862
+ v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
2863
+ sum[0] = vec_mergeh(v[0], v[4]);
2864
+ sum[1] = vec_mergel(v[0], v[4]);
2865
+ sum[2] = vec_mergeh(v[1], v[5]);
2866
+ sum[3] = vec_mergel(v[1], v[5]);
2867
+ sum[4] = vec_mergeh(v[2], v[6]);
2868
+ sum[5] = vec_mergel(v[2], v[6]);
2869
+ sum[6] = vec_mergeh(v[3], v[7]);
2870
+ sum[7] = vec_mergel(v[3], v[7]);
2871
+
2872
+ kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
2873
+ kernel.packet[1] = vec_mergel(sum[0], sum[4]);
2874
+ kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
2875
+ kernel.packet[3] = vec_mergel(sum[1], sum[5]);
2876
+ kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
2877
+ kernel.packet[5] = vec_mergel(sum[2], sum[6]);
2878
+ kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
2879
+ kernel.packet[7] = vec_mergel(sum[3], sum[7]);
2880
+ }
2881
+
2882
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8bf, 8>& kernel) {
2883
+ Packet8bf v[8], sum[8];
2884
+
2885
+ v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val);
2886
+ v[1] = vec_mergel(kernel.packet[0].m_val, kernel.packet[4].m_val);
2887
+ v[2] = vec_mergeh(kernel.packet[1].m_val, kernel.packet[5].m_val);
2888
+ v[3] = vec_mergel(kernel.packet[1].m_val, kernel.packet[5].m_val);
2889
+ v[4] = vec_mergeh(kernel.packet[2].m_val, kernel.packet[6].m_val);
2890
+ v[5] = vec_mergel(kernel.packet[2].m_val, kernel.packet[6].m_val);
2891
+ v[6] = vec_mergeh(kernel.packet[3].m_val, kernel.packet[7].m_val);
2892
+ v[7] = vec_mergel(kernel.packet[3].m_val, kernel.packet[7].m_val);
2893
+ sum[0] = vec_mergeh(v[0].m_val, v[4].m_val);
2894
+ sum[1] = vec_mergel(v[0].m_val, v[4].m_val);
2895
+ sum[2] = vec_mergeh(v[1].m_val, v[5].m_val);
2896
+ sum[3] = vec_mergel(v[1].m_val, v[5].m_val);
2897
+ sum[4] = vec_mergeh(v[2].m_val, v[6].m_val);
2898
+ sum[5] = vec_mergel(v[2].m_val, v[6].m_val);
2899
+ sum[6] = vec_mergeh(v[3].m_val, v[7].m_val);
2900
+ sum[7] = vec_mergel(v[3].m_val, v[7].m_val);
2901
+
2902
+ kernel.packet[0] = vec_mergeh(sum[0].m_val, sum[4].m_val);
2903
+ kernel.packet[1] = vec_mergel(sum[0].m_val, sum[4].m_val);
2904
+ kernel.packet[2] = vec_mergeh(sum[1].m_val, sum[5].m_val);
2905
+ kernel.packet[3] = vec_mergel(sum[1].m_val, sum[5].m_val);
2906
+ kernel.packet[4] = vec_mergeh(sum[2].m_val, sum[6].m_val);
2907
+ kernel.packet[5] = vec_mergel(sum[2].m_val, sum[6].m_val);
2908
+ kernel.packet[6] = vec_mergeh(sum[3].m_val, sum[7].m_val);
2909
+ kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val);
2910
+ }
2911
+
2912
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
2913
+ Packet16c step1[16], step2[16], step3[16];
2914
+
2915
+ step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
2916
+ step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
2917
+ step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
2918
+ step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
2919
+ step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
2920
+ step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
2921
+ step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
2922
+ step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
2923
+ step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
2924
+ step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
2925
+ step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
2926
+ step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
2927
+ step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
2928
+ step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
2929
+ step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
2930
+ step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
2931
+
2932
+ step2[0] = vec_mergeh(step1[0], step1[8]);
2933
+ step2[1] = vec_mergel(step1[0], step1[8]);
2934
+ step2[2] = vec_mergeh(step1[1], step1[9]);
2935
+ step2[3] = vec_mergel(step1[1], step1[9]);
2936
+ step2[4] = vec_mergeh(step1[2], step1[10]);
2937
+ step2[5] = vec_mergel(step1[2], step1[10]);
2938
+ step2[6] = vec_mergeh(step1[3], step1[11]);
2939
+ step2[7] = vec_mergel(step1[3], step1[11]);
2940
+ step2[8] = vec_mergeh(step1[4], step1[12]);
2941
+ step2[9] = vec_mergel(step1[4], step1[12]);
2942
+ step2[10] = vec_mergeh(step1[5], step1[13]);
2943
+ step2[11] = vec_mergel(step1[5], step1[13]);
2944
+ step2[12] = vec_mergeh(step1[6], step1[14]);
2945
+ step2[13] = vec_mergel(step1[6], step1[14]);
2946
+ step2[14] = vec_mergeh(step1[7], step1[15]);
2947
+ step2[15] = vec_mergel(step1[7], step1[15]);
2948
+
2949
+ step3[0] = vec_mergeh(step2[0], step2[8]);
2950
+ step3[1] = vec_mergel(step2[0], step2[8]);
2951
+ step3[2] = vec_mergeh(step2[1], step2[9]);
2952
+ step3[3] = vec_mergel(step2[1], step2[9]);
2953
+ step3[4] = vec_mergeh(step2[2], step2[10]);
2954
+ step3[5] = vec_mergel(step2[2], step2[10]);
2955
+ step3[6] = vec_mergeh(step2[3], step2[11]);
2956
+ step3[7] = vec_mergel(step2[3], step2[11]);
2957
+ step3[8] = vec_mergeh(step2[4], step2[12]);
2958
+ step3[9] = vec_mergel(step2[4], step2[12]);
2959
+ step3[10] = vec_mergeh(step2[5], step2[13]);
2960
+ step3[11] = vec_mergel(step2[5], step2[13]);
2961
+ step3[12] = vec_mergeh(step2[6], step2[14]);
2962
+ step3[13] = vec_mergel(step2[6], step2[14]);
2963
+ step3[14] = vec_mergeh(step2[7], step2[15]);
2964
+ step3[15] = vec_mergel(step2[7], step2[15]);
2965
+
2966
+ kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
2967
+ kernel.packet[1] = vec_mergel(step3[0], step3[8]);
2968
+ kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
2969
+ kernel.packet[3] = vec_mergel(step3[1], step3[9]);
2970
+ kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
2971
+ kernel.packet[5] = vec_mergel(step3[2], step3[10]);
2972
+ kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
2973
+ kernel.packet[7] = vec_mergel(step3[3], step3[11]);
2974
+ kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
2975
+ kernel.packet[9] = vec_mergel(step3[4], step3[12]);
2976
+ kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
2977
+ kernel.packet[11] = vec_mergel(step3[5], step3[13]);
2978
+ kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
2979
+ kernel.packet[13] = vec_mergel(step3[6], step3[14]);
2980
+ kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
2981
+ kernel.packet[15] = vec_mergel(step3[7], step3[15]);
2982
+ }
2983
+
2984
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
2985
+ Packet16uc step1[16], step2[16], step3[16];
2986
+
2987
+ step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
2988
+ step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
2989
+ step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
2990
+ step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
2991
+ step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
2992
+ step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
2993
+ step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
2994
+ step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
2995
+ step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
2996
+ step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
2997
+ step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
2998
+ step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
2999
+ step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
3000
+ step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
3001
+ step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
3002
+ step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
3003
+
3004
+ step2[0] = vec_mergeh(step1[0], step1[8]);
3005
+ step2[1] = vec_mergel(step1[0], step1[8]);
3006
+ step2[2] = vec_mergeh(step1[1], step1[9]);
3007
+ step2[3] = vec_mergel(step1[1], step1[9]);
3008
+ step2[4] = vec_mergeh(step1[2], step1[10]);
3009
+ step2[5] = vec_mergel(step1[2], step1[10]);
3010
+ step2[6] = vec_mergeh(step1[3], step1[11]);
3011
+ step2[7] = vec_mergel(step1[3], step1[11]);
3012
+ step2[8] = vec_mergeh(step1[4], step1[12]);
3013
+ step2[9] = vec_mergel(step1[4], step1[12]);
3014
+ step2[10] = vec_mergeh(step1[5], step1[13]);
3015
+ step2[11] = vec_mergel(step1[5], step1[13]);
3016
+ step2[12] = vec_mergeh(step1[6], step1[14]);
3017
+ step2[13] = vec_mergel(step1[6], step1[14]);
3018
+ step2[14] = vec_mergeh(step1[7], step1[15]);
3019
+ step2[15] = vec_mergel(step1[7], step1[15]);
3020
+
3021
+ step3[0] = vec_mergeh(step2[0], step2[8]);
3022
+ step3[1] = vec_mergel(step2[0], step2[8]);
3023
+ step3[2] = vec_mergeh(step2[1], step2[9]);
3024
+ step3[3] = vec_mergel(step2[1], step2[9]);
3025
+ step3[4] = vec_mergeh(step2[2], step2[10]);
3026
+ step3[5] = vec_mergel(step2[2], step2[10]);
3027
+ step3[6] = vec_mergeh(step2[3], step2[11]);
3028
+ step3[7] = vec_mergel(step2[3], step2[11]);
3029
+ step3[8] = vec_mergeh(step2[4], step2[12]);
3030
+ step3[9] = vec_mergel(step2[4], step2[12]);
3031
+ step3[10] = vec_mergeh(step2[5], step2[13]);
3032
+ step3[11] = vec_mergel(step2[5], step2[13]);
3033
+ step3[12] = vec_mergeh(step2[6], step2[14]);
3034
+ step3[13] = vec_mergel(step2[6], step2[14]);
3035
+ step3[14] = vec_mergeh(step2[7], step2[15]);
3036
+ step3[15] = vec_mergel(step2[7], step2[15]);
3037
+
3038
+ kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
3039
+ kernel.packet[1] = vec_mergel(step3[0], step3[8]);
3040
+ kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
3041
+ kernel.packet[3] = vec_mergel(step3[1], step3[9]);
3042
+ kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
3043
+ kernel.packet[5] = vec_mergel(step3[2], step3[10]);
3044
+ kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
3045
+ kernel.packet[7] = vec_mergel(step3[3], step3[11]);
3046
+ kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
3047
+ kernel.packet[9] = vec_mergel(step3[4], step3[12]);
3048
+ kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
3049
+ kernel.packet[11] = vec_mergel(step3[5], step3[13]);
3050
+ kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
3051
+ kernel.packet[13] = vec_mergel(step3[6], step3[14]);
3052
+ kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
3053
+ kernel.packet[15] = vec_mergel(step3[7], step3[15]);
3054
+ }
3055
+
3056
+ template <typename Packet>
3057
+ EIGEN_STRONG_INLINE Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
3058
+ Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
3059
+ Packet4ui mask = reinterpret_cast<Packet4ui>(pnegate(reinterpret_cast<Packet4i>(select)));
3060
+ return vec_sel(elsePacket, thenPacket, mask);
3061
+ }
3062
+
3063
+ template <>
3064
+ EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
3065
+ const Packet4i& elsePacket) {
3066
+ return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket);
3067
+ }
3068
+
3069
+ template <>
3070
+ EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
3071
+ const Packet4f& elsePacket) {
3072
+ return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket);
3073
+ }
3074
+
3075
+ template <>
3076
+ EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket,
3077
+ const Packet8s& elsePacket) {
3078
+ Packet8us select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
3079
+ ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7]};
3080
+ Packet8us mask = reinterpret_cast<Packet8us>(pnegate(reinterpret_cast<Packet8s>(select)));
3081
+ Packet8s result = vec_sel(elsePacket, thenPacket, mask);
3082
+ return result;
3083
+ }
3084
+
3085
+ template <>
3086
+ EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket,
3087
+ const Packet8us& elsePacket) {
3088
+ Packet8us select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
3089
+ ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7]};
3090
+ Packet8us mask = reinterpret_cast<Packet8us>(pnegate(reinterpret_cast<Packet8s>(select)));
762
3091
  return vec_sel(elsePacket, thenPacket, mask);
763
3092
  }
764
3093
 
765
- template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
766
- Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
767
- Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
3094
+ template <>
3095
+ EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket,
3096
+ const Packet8bf& elsePacket) {
3097
+ return pblend<Packet8us>(ifPacket, thenPacket, elsePacket);
3098
+ }
3099
+
3100
+ template <>
3101
+ EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket,
3102
+ const Packet16c& elsePacket) {
3103
+ Packet16uc select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
3104
+ ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
3105
+ ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
3106
+ ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15]};
3107
+
3108
+ Packet16uc mask = reinterpret_cast<Packet16uc>(pnegate(reinterpret_cast<Packet16c>(select)));
768
3109
  return vec_sel(elsePacket, thenPacket, mask);
769
3110
  }
770
3111
 
3112
+ template <>
3113
+ EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket,
3114
+ const Packet16uc& elsePacket) {
3115
+ Packet16uc select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
3116
+ ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
3117
+ ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
3118
+ ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15]};
3119
+
3120
+ Packet16uc mask = reinterpret_cast<Packet16uc>(pnegate(reinterpret_cast<Packet16c>(select)));
3121
+ return vec_sel(elsePacket, thenPacket, mask);
3122
+ }
771
3123
 
772
3124
  //---------- double ----------
773
- #ifdef __VSX__
774
- typedef __vector double Packet2d;
775
- typedef __vector unsigned long long Packet2ul;
776
- typedef __vector long long Packet2l;
3125
+ #ifdef EIGEN_VECTORIZE_VSX
3126
+ typedef __vector double Packet2d;
3127
+ typedef __vector unsigned long long Packet2ul;
3128
+ typedef __vector long long Packet2l;
777
3129
  #if EIGEN_COMP_CLANG
778
- typedef Packet2ul Packet2bl;
3130
+ typedef Packet2ul Packet2bl;
779
3131
  #else
780
- typedef __vector __bool long Packet2bl;
3132
+ typedef __vector __bool long Packet2bl;
781
3133
  #endif
782
3134
 
783
- static Packet2l p2l_ONE = { 1, 1 };
784
- static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
785
- static Packet2d p2d_ONE = { 1.0, 1.0 };
786
- static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
787
- static Packet2d p2d_MZERO = { -0.0, -0.0 };
3135
+ static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
3136
+ static Packet2ul p2ul_SIGN = {0x8000000000000000ull, 0x8000000000000000ull};
3137
+ static Packet2ul p2ul_PREV0DOT5 = {0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull};
3138
+ static Packet2d p2d_ONE = {1.0, 1.0};
3139
+ static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
3140
+ static Packet2d p2d_MZERO = {numext::bit_cast<double>(0x8000000000000000ull),
3141
+ numext::bit_cast<double>(0x8000000000000000ull)};
788
3142
 
789
3143
  #ifdef _BIG_ENDIAN
790
- static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
3144
+ static Packet2d p2d_COUNTDOWN =
3145
+ reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
791
3146
  #else
792
- static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
3147
+ static Packet2d p2d_COUNTDOWN =
3148
+ reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
793
3149
  #endif
794
3150
 
795
- template<int index> Packet2d vec_splat_dbl(Packet2d& a);
796
-
797
- template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a)
798
- {
799
- return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_HI));
800
- }
801
-
802
- template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a)
803
- {
804
- return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_LO));
3151
+ template <int index>
3152
+ Packet2d vec_splat_dbl(Packet2d& a) {
3153
+ return vec_splat(a, index);
805
3154
  }
806
3155
 
807
- template<> struct packet_traits<double> : default_packet_traits
808
- {
3156
+ template <>
3157
+ struct packet_traits<double> : default_packet_traits {
809
3158
  typedef Packet2d type;
810
3159
  typedef Packet2d half;
811
3160
  enum {
812
3161
  Vectorizable = 1,
813
3162
  AlignedOnScalar = 1,
814
- size=2,
815
- HasHalfPacket = 1,
816
-
817
- HasAdd = 1,
818
- HasSub = 1,
819
- HasMul = 1,
820
- HasDiv = 1,
821
- HasMin = 1,
822
- HasMax = 1,
823
- HasAbs = 1,
824
- HasSin = 0,
825
- HasCos = 0,
826
- HasLog = 0,
827
- HasExp = 1,
3163
+ size = 2,
3164
+
3165
+ HasAdd = 1,
3166
+ HasSub = 1,
3167
+ HasMul = 1,
3168
+ HasDiv = 1,
3169
+ HasMin = 1,
3170
+ HasMax = 1,
3171
+ HasAbs = 1,
3172
+ HasSin = EIGEN_FAST_MATH,
3173
+ HasCos = EIGEN_FAST_MATH,
3174
+ HasTanh = EIGEN_FAST_MATH,
3175
+ HasErf = EIGEN_FAST_MATH,
3176
+ HasErfc = EIGEN_FAST_MATH,
3177
+ HasATanh = 1,
3178
+ HasATan = 0,
3179
+ HasLog = 0,
3180
+ HasCmp = 1,
3181
+ HasExp = 1,
828
3182
  HasSqrt = 1,
3183
+ HasCbrt = 1,
3184
+ #if !EIGEN_COMP_CLANG
829
3185
  HasRsqrt = 1,
830
- HasRound = 1,
831
- HasFloor = 1,
832
- HasCeil = 1,
3186
+ #else
3187
+ HasRsqrt = 0,
3188
+ #endif
833
3189
  HasNegate = 1,
834
3190
  HasBlend = 1
835
3191
  };
836
3192
  };
837
3193
 
838
- template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
3194
+ template <>
3195
+ struct unpacket_traits<Packet2d> {
3196
+ typedef double type;
3197
+ typedef Packet2l integer_packet;
3198
+ enum {
3199
+ size = 2,
3200
+ alignment = Aligned16,
3201
+ vectorizable = true,
3202
+ masked_load_available = false,
3203
+ masked_store_available = false
3204
+ };
3205
+ typedef Packet2d half;
3206
+ };
3207
+ template <>
3208
+ struct unpacket_traits<Packet2l> {
3209
+ typedef int64_t type;
3210
+ typedef Packet2l half;
3211
+ enum {
3212
+ size = 2,
3213
+ alignment = Aligned16,
3214
+ vectorizable = false,
3215
+ masked_load_available = false,
3216
+ masked_store_available = false
3217
+ };
3218
+ };
839
3219
 
840
- inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
841
- {
3220
+ inline std::ostream& operator<<(std::ostream& s, const Packet2l& v) {
842
3221
  union {
843
- Packet2l v;
3222
+ Packet2l v;
844
3223
  int64_t n[2];
845
3224
  } vt;
846
3225
  vt.v = v;
@@ -848,10 +3227,9 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
848
3227
  return s;
849
3228
  }
850
3229
 
851
- inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
852
- {
3230
+ inline std::ostream& operator<<(std::ostream& s, const Packet2d& v) {
853
3231
  union {
854
- Packet2d v;
3232
+ Packet2d v;
855
3233
  double n[2];
856
3234
  } vt;
857
3235
  vt.v = v;
@@ -860,202 +3238,492 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
860
3238
  }
861
3239
 
862
3240
  // Need to define them first or we get specialization after instantiation errors
863
- template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
864
- {
3241
+ template <>
3242
+ EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
865
3243
  EIGEN_DEBUG_ALIGNED_LOAD
866
- #ifdef __VSX__
867
- return vec_vsx_ld(0, from);
868
- #else
869
- return vec_ld(0, from);
870
- #endif
3244
+ return vec_xl(0, const_cast<double*>(from)); // cast needed by Clang
3245
+ }
3246
+
3247
+ template <>
3248
+ EIGEN_ALWAYS_INLINE Packet2d pload_partial<Packet2d>(const double* from, const Index n, const Index offset) {
3249
+ return pload_partial_common<Packet2d>(from, n, offset);
871
3250
  }
872
3251
 
873
- template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
874
- {
3252
+ template <>
3253
+ EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
875
3254
  EIGEN_DEBUG_ALIGNED_STORE
876
- #ifdef __VSX__
877
- vec_vsx_st(from, 0, to);
878
- #else
879
- vec_st(from, 0, to);
880
- #endif
3255
+ vec_xst(from, 0, to);
3256
+ }
3257
+
3258
+ template <>
3259
+ EIGEN_ALWAYS_INLINE void pstore_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset) {
3260
+ pstore_partial_common<Packet2d>(to, from, n, offset);
881
3261
  }
882
3262
 
883
- template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
3263
+ template <>
3264
+ EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
884
3265
  Packet2d v = {from, from};
885
3266
  return v;
886
3267
  }
3268
+ template <>
3269
+ EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
3270
+ Packet2l v = {from, from};
3271
+ return v;
3272
+ }
3273
+
3274
+ template <>
3275
+ EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
3276
+ Packet2l v = {static_cast<long long>(from), static_cast<long long>(from)};
3277
+ return reinterpret_cast<Packet2d>(v);
3278
+ }
887
3279
 
888
- template<> EIGEN_STRONG_INLINE void
889
- pbroadcast4<Packet2d>(const double *a,
890
- Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
891
- {
892
- a1 = pload<Packet2d>(a);
893
- a0 = vec_splat_dbl<0>(a1);
894
- a1 = vec_splat_dbl<1>(a1);
895
- a3 = pload<Packet2d>(a+2);
896
- a2 = vec_splat_dbl<0>(a3);
897
- a3 = vec_splat_dbl<1>(a3);
3280
+ template <>
3281
+ EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
3282
+ Packet2d& a3) {
3283
+ // This way is faster than vec_splat (at least for doubles in Power 9)
3284
+ a0 = pset1<Packet2d>(a[0]);
3285
+ a1 = pset1<Packet2d>(a[1]);
3286
+ a2 = pset1<Packet2d>(a[2]);
3287
+ a3 = pset1<Packet2d>(a[3]);
898
3288
  }
899
3289
 
900
- template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
901
- {
902
- double EIGEN_ALIGN16 af[2];
903
- af[0] = from[0*stride];
904
- af[1] = from[1*stride];
905
- return pload<Packet2d>(af);
3290
+ template <>
3291
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
3292
+ return pgather_common<Packet2d>(from, stride);
3293
+ }
3294
+ template <>
3295
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather_partial<double, Packet2d>(const double* from, Index stride,
3296
+ const Index n) {
3297
+ return pgather_common<Packet2d>(from, stride, n);
906
3298
  }
907
- template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
908
- {
909
- double EIGEN_ALIGN16 af[2];
910
- pstore<double>(af, from);
911
- to[0*stride] = af[0];
912
- to[1*stride] = af[1];
3299
+ template <>
3300
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
3301
+ pscatter_common<Packet2d>(to, from, stride);
3302
+ }
3303
+ template <>
3304
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<double, Packet2d>(double* to, const Packet2d& from,
3305
+ Index stride, const Index n) {
3306
+ pscatter_common<Packet2d>(to, from, stride, n);
913
3307
  }
914
3308
 
915
- template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; }
3309
+ template <>
3310
+ EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
3311
+ return pset1<Packet2d>(a) + p2d_COUNTDOWN;
3312
+ }
916
3313
 
917
- template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return a + b; }
3314
+ template <>
3315
+ EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
3316
+ return a + b;
3317
+ }
918
3318
 
919
- template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }
3319
+ template <>
3320
+ EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
3321
+ return a - b;
3322
+ }
920
3323
 
921
- template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; }
3324
+ template <>
3325
+ EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
3326
+ #ifdef __POWER8_VECTOR__
3327
+ return vec_neg(a);
3328
+ #else
3329
+ return vec_xor(a, p2d_MZERO);
3330
+ #endif
3331
+ }
922
3332
 
923
- template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
3333
+ template <>
3334
+ EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
3335
+ return a;
3336
+ }
924
3337
 
925
- template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_MZERO); }
926
- template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); }
3338
+ template <>
3339
+ EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
3340
+ return vec_madd(a, b, p2d_MZERO);
3341
+ }
3342
+ template <>
3343
+ EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
3344
+ return vec_div(a, b);
3345
+ }
927
3346
 
928
3347
  // for some weird raisons, it has to be overloaded for packet of integers
929
- template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
3348
+ template <>
3349
+ EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
3350
+ return vec_madd(a, b, c);
3351
+ }
3352
+ template <>
3353
+ EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
3354
+ return vec_msub(a, b, c);
3355
+ }
3356
+ template <>
3357
+ EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
3358
+ return vec_nmsub(a, b, c);
3359
+ }
3360
+ template <>
3361
+ EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
3362
+ return vec_nmadd(a, b, c);
3363
+ }
930
3364
 
931
- template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
932
- {
3365
+ template <>
3366
+ EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
3367
+ // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
933
3368
  Packet2d ret;
934
- __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
3369
+ __asm__("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
935
3370
  return ret;
936
- }
3371
+ }
937
3372
 
938
- template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
939
- {
3373
+ template <>
3374
+ EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
3375
+ // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
940
3376
  Packet2d ret;
941
- __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
3377
+ __asm__("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
942
3378
  return ret;
943
3379
  }
944
3380
 
945
- template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
3381
+ template <>
3382
+ EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
3383
+ return reinterpret_cast<Packet2d>(vec_cmple(a, b));
3384
+ }
3385
+ template <>
3386
+ EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
3387
+ return reinterpret_cast<Packet2d>(vec_cmplt(a, b));
3388
+ }
3389
+ template <>
3390
+ EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
3391
+ return reinterpret_cast<Packet2d>(vec_cmpeq(a, b));
3392
+ }
3393
+ template <>
3394
+ #ifdef __POWER8_VECTOR__
3395
+ EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
3396
+ return reinterpret_cast<Packet2l>(vec_cmpeq(a, b));
3397
+ }
3398
+ #else
3399
+ EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
3400
+ Packet4i halves = reinterpret_cast<Packet4i>(vec_cmpeq(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(b)));
3401
+ Packet4i flipped = vec_perm(halves, halves, p16uc_COMPLEX32_REV);
3402
+ return reinterpret_cast<Packet2l>(pand(halves, flipped));
3403
+ }
3404
+ #endif
3405
+ template <>
3406
+ EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
3407
+ Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a, b));
3408
+ return vec_nor(c, c);
3409
+ }
3410
+
3411
+ template <>
3412
+ EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
3413
+ return vec_and(a, b);
3414
+ }
3415
+
3416
+ template <>
3417
+ EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
3418
+ return vec_or(a, b);
3419
+ }
946
3420
 
947
- template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
3421
+ template <>
3422
+ EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
3423
+ return vec_xor(a, b);
3424
+ }
948
3425
 
949
- template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
3426
+ template <>
3427
+ EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
3428
+ return vec_and(a, vec_nor(b, b));
3429
+ }
950
3430
 
951
- template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
3431
+ template <>
3432
+ EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
3433
+ Packet2d t = vec_add(
3434
+ reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
3435
+ Packet2d res;
952
3436
 
953
- template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
954
- template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); }
955
- template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
3437
+ __asm__("xvrdpiz %x0, %x1\n\t" : "=&wa"(res) : "wa"(t));
956
3438
 
957
- template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
958
- {
959
- EIGEN_DEBUG_ALIGNED_LOAD
960
- return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from));
3439
+ return res;
3440
+ }
3441
+ template <>
3442
+ EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
3443
+ return vec_ceil(a);
3444
+ }
3445
+ template <>
3446
+ EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
3447
+ return vec_floor(a);
3448
+ }
3449
+ template <>
3450
+ EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
3451
+ return vec_trunc(a);
961
3452
  }
3453
+ template <>
3454
+ EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
3455
+ Packet2d res;
962
3456
 
963
- template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
964
- {
965
- Packet2d p;
966
- if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet2d>(from);
967
- else p = ploadu<Packet2d>(from);
968
- return vec_splat_dbl<0>(p);
3457
+ __asm__("xvrdpic %x0, %x1\n\t" : "=&wa"(res) : "wa"(a));
3458
+
3459
+ return res;
969
3460
  }
970
3461
 
971
- template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
972
- {
973
- EIGEN_DEBUG_ALIGNED_STORE
974
- vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
3462
+ template <>
3463
+ EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
3464
+ EIGEN_DEBUG_UNALIGNED_LOAD
3465
+ return vec_xl(0, const_cast<double*>(from));
3466
+ }
3467
+
3468
+ template <>
3469
+ EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n, const Index offset) {
3470
+ return ploadu_partial_common<Packet2d>(from, n, offset);
975
3471
  }
976
3472
 
977
- template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
3473
+ template <>
3474
+ EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
3475
+ Packet2d p;
3476
+ if ((std::ptrdiff_t(from) % 16) == 0)
3477
+ p = pload<Packet2d>(from);
3478
+ else
3479
+ p = ploadu<Packet2d>(from);
3480
+ return vec_splat_dbl<0>(p);
3481
+ }
978
3482
 
979
- template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; }
3483
+ template <>
3484
+ EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
3485
+ EIGEN_DEBUG_UNALIGNED_STORE
3486
+ vec_xst(from, 0, to);
3487
+ }
980
3488
 
981
- template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
982
- {
983
- return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
3489
+ template <>
3490
+ EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset) {
3491
+ pstoreu_partial_common<Packet2d>(to, from, n, offset);
984
3492
  }
985
- template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
986
3493
 
987
- template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
988
- {
989
- Packet2d b, sum;
990
- b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
991
- sum = a + b;
992
- return pfirst<Packet2d>(sum);
3494
+ template <>
3495
+ EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
3496
+ EIGEN_PPC_PREFETCH(addr);
993
3497
  }
994
3498
 
995
- template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
996
- {
997
- Packet2d v[2], sum;
998
- v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8));
999
- v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8));
3499
+ template <>
3500
+ EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
3501
+ EIGEN_ALIGN16 double x[2];
3502
+ pstore<double>(x, a);
3503
+ return x[0];
3504
+ }
1000
3505
 
3506
+ template <>
3507
+ EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
3508
+ return vec_sld(a, a, 8);
3509
+ }
3510
+ template <>
3511
+ EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
3512
+ return vec_abs(a);
3513
+ }
3514
+ #ifdef __POWER8_VECTOR__
3515
+ template <>
3516
+ EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
3517
+ return (Packet2d)vec_sra((Packet2l)a, vec_splats((unsigned long long)(63)));
3518
+ }
3519
+ #else
1001
3520
  #ifdef _BIG_ENDIAN
1002
- sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8));
3521
+ static Packet16uc p16uc_DUPSIGN = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
1003
3522
  #else
1004
- sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[1]), reinterpret_cast<Packet4f>(v[0]), 8));
3523
+ static Packet16uc p16uc_DUPSIGN = {7, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15};
1005
3524
  #endif
1006
3525
 
1007
- return sum;
3526
+ template <>
3527
+ EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
3528
+ Packet16c tmp = vec_sra(reinterpret_cast<Packet16c>(a), vec_splats((unsigned char)(7)));
3529
+ return reinterpret_cast<Packet2d>(vec_perm(tmp, tmp, p16uc_DUPSIGN));
1008
3530
  }
1009
- // Other reduction functions:
1010
- // mul
1011
- template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
1012
- {
1013
- return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
3531
+ #endif
3532
+
3533
+ template <>
3534
+ inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x);
3535
+
3536
+ template <>
3537
+ inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x);
3538
+
3539
+ // Packet2l shifts.
3540
+ // For POWER8 we simply use vec_sr/l.
3541
+ //
3542
+ // Things are more complicated for POWER7. There is actually a
3543
+ // vec_xxsxdi intrinsic but it is not supported by some gcc versions.
3544
+ // So we need to shift by N % 32 and rearrage bytes.
3545
+ #ifdef __POWER8_VECTOR__
3546
+
3547
+ template <int N>
3548
+ EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
3549
+ const Packet2ul shift = {N, N};
3550
+ return vec_sl(a, shift);
1014
3551
  }
1015
3552
 
1016
- // min
1017
- template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
1018
- {
1019
- return pfirst(pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
3553
+ template <int N>
3554
+ EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
3555
+ const Packet2ul shift = {N, N};
3556
+ return vec_sr(a, shift);
1020
3557
  }
1021
3558
 
1022
- // max
1023
- template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
1024
- {
1025
- return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
3559
+ #else
3560
+
3561
+ // Shifts [A, B, C, D] to [B, 0, D, 0].
3562
+ // Used to implement left shifts for Packet2l.
3563
+ EIGEN_ALWAYS_INLINE Packet4i shift_even_left(const Packet4i& a) {
3564
+ static const Packet16uc perm = {0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
3565
+ 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b};
3566
+ #ifdef _BIG_ENDIAN
3567
+ return vec_perm(p4i_ZERO, a, perm);
3568
+ #else
3569
+ return vec_perm(a, p4i_ZERO, perm);
3570
+ #endif
1026
3571
  }
1027
3572
 
1028
- template<int Offset>
1029
- struct palign_impl<Offset,Packet2d>
1030
- {
1031
- static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
1032
- {
1033
- if (Offset == 1)
3573
+ // Shifts [A, B, C, D] to [0, A, 0, C].
3574
+ // Used to implement right shifts for Packet2l.
3575
+ EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(const Packet4i& a) {
3576
+ static const Packet16uc perm = {0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
3577
+ 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b};
1034
3578
  #ifdef _BIG_ENDIAN
1035
- first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(first), reinterpret_cast<Packet4ui>(second), 8));
3579
+ return vec_perm(p4i_ZERO, a, perm);
1036
3580
  #else
1037
- first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(second), reinterpret_cast<Packet4ui>(first), 8));
3581
+ return vec_perm(a, p4i_ZERO, perm);
1038
3582
  #endif
3583
+ }
3584
+
3585
+ template <int N, typename EnableIf = void>
3586
+ struct plogical_shift_left_impl;
3587
+
3588
+ template <int N>
3589
+ struct plogical_shift_left_impl<N, std::enable_if_t<(N < 32) && (N >= 0)> > {
3590
+ static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
3591
+ static const unsigned n = static_cast<unsigned>(N);
3592
+ const Packet4ui shift = {n, n, n, n};
3593
+ const Packet4i ai = reinterpret_cast<Packet4i>(a);
3594
+ static const unsigned m = static_cast<unsigned>(32 - N);
3595
+ const Packet4ui shift_right = {m, m, m, m};
3596
+ const Packet4i out_hi = vec_sl(ai, shift);
3597
+ const Packet4i out_lo = shift_even_left(vec_sr(ai, shift_right));
3598
+ return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
3599
+ }
3600
+ };
3601
+
3602
+ template <int N>
3603
+ struct plogical_shift_left_impl<N, std::enable_if_t<(N >= 32)> > {
3604
+ static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
3605
+ static const unsigned m = static_cast<unsigned>(N - 32);
3606
+ const Packet4ui shift = {m, m, m, m};
3607
+ const Packet4i ai = reinterpret_cast<Packet4i>(a);
3608
+ return reinterpret_cast<Packet2l>(shift_even_left(vec_sl(ai, shift)));
1039
3609
  }
1040
3610
  };
1041
3611
 
1042
- EIGEN_DEVICE_FUNC inline void
1043
- ptranspose(PacketBlock<Packet2d,2>& kernel) {
3612
+ template <int N>
3613
+ EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
3614
+ return plogical_shift_left_impl<N>::run(a);
3615
+ }
3616
+
3617
+ template <int N, typename EnableIf = void>
3618
+ struct plogical_shift_right_impl;
3619
+
3620
+ template <int N>
3621
+ struct plogical_shift_right_impl<N, std::enable_if_t<(N < 32) && (N >= 0)> > {
3622
+ static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
3623
+ static const unsigned n = static_cast<unsigned>(N);
3624
+ const Packet4ui shift = {n, n, n, n};
3625
+ const Packet4i ai = reinterpret_cast<Packet4i>(a);
3626
+ static const unsigned m = static_cast<unsigned>(32 - N);
3627
+ const Packet4ui shift_left = {m, m, m, m};
3628
+ const Packet4i out_lo = vec_sr(ai, shift);
3629
+ const Packet4i out_hi = shift_odd_right(vec_sl(ai, shift_left));
3630
+ return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
3631
+ }
3632
+ };
3633
+
3634
+ template <int N>
3635
+ struct plogical_shift_right_impl<N, std::enable_if_t<(N >= 32)> > {
3636
+ static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
3637
+ static const unsigned m = static_cast<unsigned>(N - 32);
3638
+ const Packet4ui shift = {m, m, m, m};
3639
+ const Packet4i ai = reinterpret_cast<Packet4i>(a);
3640
+ return reinterpret_cast<Packet2l>(shift_odd_right(vec_sr(ai, shift)));
3641
+ }
3642
+ };
3643
+
3644
+ template <int N>
3645
+ EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
3646
+ return plogical_shift_right_impl<N>::run(a);
3647
+ }
3648
+ #endif
3649
+
3650
+ template <>
3651
+ EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
3652
+ // Clamp exponent to [-2099, 2099]
3653
+ const Packet2d max_exponent = pset1<Packet2d>(2099.0);
3654
+ const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
3655
+
3656
+ // Split 2^e into four factors and multiply:
3657
+ const Packet2l bias = {1023, 1023};
3658
+ Packet2l b = plogical_shift_right<2>(e); // floor(e/4)
3659
+ Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
3660
+ Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
3661
+ b = psub(psub(psub(e, b), b), b); // e - 3b
3662
+ c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); // 2^(e - 3b)
3663
+ out = pmul(out, c); // a * 2^e
3664
+ return out;
3665
+ }
3666
+
3667
+ // Extract exponent without existence of Packet2l.
3668
+ template <>
3669
+ EIGEN_STRONG_INLINE Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
3670
+ return pcast<Packet2l, Packet2d>(plogical_shift_right<52>(reinterpret_cast<Packet2l>(pabs(a))));
3671
+ }
3672
+
3673
+ template <>
3674
+ EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
3675
+ return pfrexp_generic(a, exponent);
3676
+ }
3677
+
3678
+ template <>
3679
+ EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
3680
+ Packet2d b, sum;
3681
+ b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
3682
+ sum = a + b;
3683
+ return pfirst<Packet2d>(sum);
3684
+ }
3685
+
3686
+ // Other reduction functions:
3687
+ // mul
3688
+ template <>
3689
+ EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
3690
+ return pfirst(
3691
+ pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
3692
+ }
3693
+
3694
+ // min
3695
+ template <>
3696
+ EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
3697
+ return pfirst(
3698
+ pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
3699
+ }
3700
+
3701
+ // max
3702
+ template <>
3703
+ EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
3704
+ return pfirst(
3705
+ pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
3706
+ }
3707
+
3708
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
1044
3709
  Packet2d t0, t1;
1045
- t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
1046
- t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
3710
+ t0 = vec_mergeh(kernel.packet[0], kernel.packet[1]);
3711
+ t1 = vec_mergel(kernel.packet[0], kernel.packet[1]);
1047
3712
  kernel.packet[0] = t0;
1048
3713
  kernel.packet[1] = t1;
1049
3714
  }
1050
3715
 
1051
- template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
1052
- Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
1053
- Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
3716
+ template <>
3717
+ EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
3718
+ const Packet2d& elsePacket) {
3719
+ Packet2l select = {ifPacket.select[0], ifPacket.select[1]};
3720
+ Packet2ul mask = reinterpret_cast<Packet2ul>(pnegate(reinterpret_cast<Packet2l>(select)));
1054
3721
  return vec_sel(elsePacket, thenPacket, mask);
1055
3722
  }
1056
- #endif // __VSX__
1057
- } // end namespace internal
1058
3723
 
1059
- } // end namespace Eigen
3724
+ #endif // __VSX__
3725
+ } // end namespace internal
3726
+
3727
+ } // end namespace Eigen
1060
3728
 
1061
- #endif // EIGEN_PACKET_MATH_ALTIVEC_H
3729
+ #endif // EIGEN_PACKET_MATH_ALTIVEC_H