@smake/eigen 1.1.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431) hide show
  1. package/README.md +1 -1
  2. package/eigen/Eigen/AccelerateSupport +52 -0
  3. package/eigen/Eigen/Cholesky +18 -20
  4. package/eigen/Eigen/CholmodSupport +28 -28
  5. package/eigen/Eigen/Core +187 -120
  6. package/eigen/Eigen/Eigenvalues +16 -13
  7. package/eigen/Eigen/Geometry +18 -18
  8. package/eigen/Eigen/Householder +9 -7
  9. package/eigen/Eigen/IterativeLinearSolvers +8 -4
  10. package/eigen/Eigen/Jacobi +14 -13
  11. package/eigen/Eigen/KLUSupport +23 -21
  12. package/eigen/Eigen/LU +15 -16
  13. package/eigen/Eigen/MetisSupport +12 -12
  14. package/eigen/Eigen/OrderingMethods +54 -51
  15. package/eigen/Eigen/PaStiXSupport +23 -21
  16. package/eigen/Eigen/PardisoSupport +17 -14
  17. package/eigen/Eigen/QR +18 -20
  18. package/eigen/Eigen/QtAlignedMalloc +5 -12
  19. package/eigen/Eigen/SPQRSupport +21 -14
  20. package/eigen/Eigen/SVD +23 -17
  21. package/eigen/Eigen/Sparse +1 -2
  22. package/eigen/Eigen/SparseCholesky +18 -15
  23. package/eigen/Eigen/SparseCore +18 -17
  24. package/eigen/Eigen/SparseLU +9 -9
  25. package/eigen/Eigen/SparseQR +16 -14
  26. package/eigen/Eigen/StdDeque +5 -2
  27. package/eigen/Eigen/StdList +5 -2
  28. package/eigen/Eigen/StdVector +5 -2
  29. package/eigen/Eigen/SuperLUSupport +30 -24
  30. package/eigen/Eigen/ThreadPool +80 -0
  31. package/eigen/Eigen/UmfPackSupport +19 -17
  32. package/eigen/Eigen/Version +14 -0
  33. package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
  34. package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
  35. package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
  36. package/eigen/Eigen/src/Cholesky/LDLT.h +366 -405
  37. package/eigen/Eigen/src/Cholesky/LLT.h +323 -367
  38. package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
  39. package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +585 -529
  40. package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
  41. package/eigen/Eigen/src/Core/ArithmeticSequence.h +143 -317
  42. package/eigen/Eigen/src/Core/Array.h +329 -370
  43. package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
  44. package/eigen/Eigen/src/Core/ArrayWrapper.h +126 -170
  45. package/eigen/Eigen/src/Core/Assign.h +30 -40
  46. package/eigen/Eigen/src/Core/AssignEvaluator.h +651 -604
  47. package/eigen/Eigen/src/Core/Assign_MKL.h +125 -120
  48. package/eigen/Eigen/src/Core/BandMatrix.h +267 -282
  49. package/eigen/Eigen/src/Core/Block.h +371 -390
  50. package/eigen/Eigen/src/Core/CommaInitializer.h +85 -100
  51. package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
  52. package/eigen/Eigen/src/Core/CoreEvaluators.h +1214 -937
  53. package/eigen/Eigen/src/Core/CoreIterators.h +72 -63
  54. package/eigen/Eigen/src/Core/CwiseBinaryOp.h +112 -129
  55. package/eigen/Eigen/src/Core/CwiseNullaryOp.h +676 -702
  56. package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
  57. package/eigen/Eigen/src/Core/CwiseUnaryOp.h +55 -67
  58. package/eigen/Eigen/src/Core/CwiseUnaryView.h +127 -92
  59. package/eigen/Eigen/src/Core/DenseBase.h +630 -658
  60. package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -628
  61. package/eigen/Eigen/src/Core/DenseStorage.h +511 -590
  62. package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
  63. package/eigen/Eigen/src/Core/Diagonal.h +168 -207
  64. package/eigen/Eigen/src/Core/DiagonalMatrix.h +346 -317
  65. package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
  66. package/eigen/Eigen/src/Core/Dot.h +167 -217
  67. package/eigen/Eigen/src/Core/EigenBase.h +74 -85
  68. package/eigen/Eigen/src/Core/Fill.h +138 -0
  69. package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
  70. package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -113
  71. package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
  72. package/eigen/Eigen/src/Core/GeneralProduct.h +315 -261
  73. package/eigen/Eigen/src/Core/GenericPacketMath.h +1182 -520
  74. package/eigen/Eigen/src/Core/GlobalFunctions.h +193 -157
  75. package/eigen/Eigen/src/Core/IO.h +131 -156
  76. package/eigen/Eigen/src/Core/IndexedView.h +209 -125
  77. package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
  78. package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
  79. package/eigen/Eigen/src/Core/Inverse.h +50 -59
  80. package/eigen/Eigen/src/Core/Map.h +123 -141
  81. package/eigen/Eigen/src/Core/MapBase.h +255 -282
  82. package/eigen/Eigen/src/Core/MathFunctions.h +1247 -1201
  83. package/eigen/Eigen/src/Core/MathFunctionsImpl.h +162 -99
  84. package/eigen/Eigen/src/Core/Matrix.h +463 -494
  85. package/eigen/Eigen/src/Core/MatrixBase.h +468 -470
  86. package/eigen/Eigen/src/Core/NestByValue.h +58 -52
  87. package/eigen/Eigen/src/Core/NoAlias.h +79 -86
  88. package/eigen/Eigen/src/Core/NumTraits.h +206 -206
  89. package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +163 -142
  90. package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
  91. package/eigen/Eigen/src/Core/PlainObjectBase.h +858 -972
  92. package/eigen/Eigen/src/Core/Product.h +246 -130
  93. package/eigen/Eigen/src/Core/ProductEvaluators.h +779 -671
  94. package/eigen/Eigen/src/Core/Random.h +153 -164
  95. package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
  96. package/eigen/Eigen/src/Core/RealView.h +250 -0
  97. package/eigen/Eigen/src/Core/Redux.h +334 -314
  98. package/eigen/Eigen/src/Core/Ref.h +259 -257
  99. package/eigen/Eigen/src/Core/Replicate.h +92 -104
  100. package/eigen/Eigen/src/Core/Reshaped.h +215 -271
  101. package/eigen/Eigen/src/Core/ReturnByValue.h +47 -55
  102. package/eigen/Eigen/src/Core/Reverse.h +133 -148
  103. package/eigen/Eigen/src/Core/Select.h +68 -140
  104. package/eigen/Eigen/src/Core/SelfAdjointView.h +254 -290
  105. package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
  106. package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
  107. package/eigen/Eigen/src/Core/Solve.h +88 -102
  108. package/eigen/Eigen/src/Core/SolveTriangular.h +126 -124
  109. package/eigen/Eigen/src/Core/SolverBase.h +132 -133
  110. package/eigen/Eigen/src/Core/StableNorm.h +113 -147
  111. package/eigen/Eigen/src/Core/StlIterators.h +404 -248
  112. package/eigen/Eigen/src/Core/Stride.h +90 -92
  113. package/eigen/Eigen/src/Core/Swap.h +70 -39
  114. package/eigen/Eigen/src/Core/Transpose.h +258 -295
  115. package/eigen/Eigen/src/Core/Transpositions.h +270 -333
  116. package/eigen/Eigen/src/Core/TriangularMatrix.h +642 -743
  117. package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
  118. package/eigen/Eigen/src/Core/VectorwiseOp.h +653 -704
  119. package/eigen/Eigen/src/Core/Visitor.h +464 -308
  120. package/eigen/Eigen/src/Core/arch/AVX/Complex.h +380 -187
  121. package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +65 -163
  122. package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2145 -638
  123. package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
  124. package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +253 -60
  125. package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +278 -228
  126. package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
  127. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +48 -269
  128. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
  129. package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1597 -754
  130. package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
  131. package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
  132. package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
  133. package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
  134. package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +229 -41
  135. package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
  136. package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +420 -184
  137. package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +40 -49
  138. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2962 -2213
  139. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +196 -212
  140. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +713 -441
  141. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
  142. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
  143. package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2380 -1362
  144. package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
  145. package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +390 -224
  146. package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +78 -67
  147. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1784 -799
  148. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +167 -50
  149. package/eigen/Eigen/src/Core/arch/Default/Half.h +528 -379
  150. package/eigen/Eigen/src/Core/arch/Default/Settings.h +10 -12
  151. package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
  152. package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +41 -40
  153. package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +550 -523
  154. package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
  155. package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +27 -30
  156. package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +8 -8
  157. package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
  158. package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
  159. package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
  160. package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
  161. package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
  162. package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
  163. package/eigen/Eigen/src/Core/arch/MSA/Complex.h +54 -82
  164. package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +84 -92
  165. package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +51 -47
  166. package/eigen/Eigen/src/Core/arch/NEON/Complex.h +454 -306
  167. package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +175 -115
  168. package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +23 -30
  169. package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4366 -2857
  170. package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +616 -393
  171. package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
  172. package/eigen/Eigen/src/Core/arch/SSE/Complex.h +350 -198
  173. package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +38 -149
  174. package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +1791 -912
  175. package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
  176. package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +128 -40
  177. package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +10 -6
  178. package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +156 -234
  179. package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +6 -3
  180. package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +27 -32
  181. package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +119 -117
  182. package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +325 -419
  183. package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +15 -17
  184. package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +325 -181
  185. package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +94 -83
  186. package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +811 -458
  187. package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +121 -124
  188. package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +576 -370
  189. package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +194 -109
  190. package/eigen/Eigen/src/Core/functors/StlFunctors.h +95 -112
  191. package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
  192. package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1038 -749
  193. package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1883 -1375
  194. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +312 -370
  195. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +189 -176
  196. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +84 -81
  197. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
  198. package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +292 -337
  199. package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
  200. package/eigen/Eigen/src/Core/products/Parallelizer.h +207 -105
  201. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +327 -388
  202. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
  203. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +138 -147
  204. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
  205. package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
  206. package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -47
  207. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
  208. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
  209. package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
  210. package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
  211. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -277
  212. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
  213. package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +68 -94
  214. package/eigen/Eigen/src/Core/util/Assert.h +158 -0
  215. package/eigen/Eigen/src/Core/util/BlasUtil.h +342 -303
  216. package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +348 -317
  217. package/eigen/Eigen/src/Core/util/Constants.h +297 -262
  218. package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -90
  219. package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
  220. package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +449 -247
  221. package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
  222. package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
  223. package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +417 -116
  224. package/eigen/Eigen/src/Core/util/IntegralConstant.h +211 -204
  225. package/eigen/Eigen/src/Core/util/MKL_support.h +39 -37
  226. package/eigen/Eigen/src/Core/util/Macros.h +655 -773
  227. package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
  228. package/eigen/Eigen/src/Core/util/Memory.h +970 -748
  229. package/eigen/Eigen/src/Core/util/Meta.h +581 -633
  230. package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
  231. package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
  232. package/eigen/Eigen/src/Core/util/ReshapedHelper.h +17 -17
  233. package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
  234. package/eigen/Eigen/src/Core/util/StaticAssert.h +50 -166
  235. package/eigen/Eigen/src/Core/util/SymbolicIndex.h +377 -225
  236. package/eigen/Eigen/src/Core/util/XprHelper.h +784 -547
  237. package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
  238. package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
  239. package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
  240. package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
  241. package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
  242. package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
  243. package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
  244. package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
  245. package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +89 -105
  246. package/eigen/Eigen/src/Eigenvalues/RealQZ.h +537 -607
  247. package/eigen/Eigen/src/Eigenvalues/RealSchur.h +342 -381
  248. package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
  249. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +541 -595
  250. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
  251. package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +430 -462
  252. package/eigen/Eigen/src/Geometry/AlignedBox.h +226 -227
  253. package/eigen/Eigen/src/Geometry/AngleAxis.h +131 -133
  254. package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
  255. package/eigen/Eigen/src/Geometry/Homogeneous.h +285 -333
  256. package/eigen/Eigen/src/Geometry/Hyperplane.h +151 -160
  257. package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
  258. package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -146
  259. package/eigen/Eigen/src/Geometry/ParametrizedLine.h +127 -127
  260. package/eigen/Eigen/src/Geometry/Quaternion.h +566 -506
  261. package/eigen/Eigen/src/Geometry/Rotation2D.h +107 -105
  262. package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
  263. package/eigen/Eigen/src/Geometry/Scaling.h +113 -106
  264. package/eigen/Eigen/src/Geometry/Transform.h +858 -936
  265. package/eigen/Eigen/src/Geometry/Translation.h +94 -92
  266. package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
  267. package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +90 -104
  268. package/eigen/Eigen/src/Householder/BlockHouseholder.h +51 -46
  269. package/eigen/Eigen/src/Householder/Householder.h +102 -124
  270. package/eigen/Eigen/src/Householder/HouseholderSequence.h +412 -453
  271. package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
  272. package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +149 -162
  273. package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +124 -119
  274. package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +92 -104
  275. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +251 -243
  276. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +224 -228
  277. package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
  278. package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +178 -227
  279. package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +79 -84
  280. package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +54 -60
  281. package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
  282. package/eigen/Eigen/src/Jacobi/Jacobi.h +252 -308
  283. package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
  284. package/eigen/Eigen/src/KLUSupport/KLUSupport.h +208 -227
  285. package/eigen/Eigen/src/LU/Determinant.h +50 -69
  286. package/eigen/Eigen/src/LU/FullPivLU.h +545 -596
  287. package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
  288. package/eigen/Eigen/src/LU/InverseImpl.h +206 -285
  289. package/eigen/Eigen/src/LU/PartialPivLU.h +390 -428
  290. package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
  291. package/eigen/Eigen/src/LU/arch/InverseSize4.h +72 -70
  292. package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
  293. package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
  294. package/eigen/Eigen/src/OrderingMethods/Amd.h +243 -265
  295. package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +831 -1004
  296. package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
  297. package/eigen/Eigen/src/OrderingMethods/Ordering.h +112 -119
  298. package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
  299. package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
  300. package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
  301. package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -430
  302. package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +479 -479
  303. package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
  304. package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +166 -153
  305. package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +495 -475
  306. package/eigen/Eigen/src/QR/HouseholderQR.h +394 -285
  307. package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
  308. package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
  309. package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
  310. package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +244 -264
  311. package/eigen/Eigen/src/SVD/BDCSVD.h +817 -713
  312. package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
  313. package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
  314. package/eigen/Eigen/src/SVD/JacobiSVD.h +577 -543
  315. package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
  316. package/eigen/Eigen/src/SVD/SVDBase.h +242 -182
  317. package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +200 -235
  318. package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
  319. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +765 -594
  320. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +308 -94
  321. package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
  322. package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -252
  323. package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +134 -178
  324. package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
  325. package/eigen/Eigen/src/SparseCore/SparseAssign.h +149 -140
  326. package/eigen/Eigen/src/SparseCore/SparseBlock.h +403 -440
  327. package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
  328. package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +525 -303
  329. package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +555 -339
  330. package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
  331. package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +169 -197
  332. package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
  333. package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
  334. package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
  335. package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
  336. package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1603 -1245
  337. package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -350
  338. package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
  339. package/eigen/Eigen/src/SparseCore/SparseProduct.h +94 -97
  340. package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
  341. package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
  342. package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +370 -416
  343. package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
  344. package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
  345. package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
  346. package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
  347. package/eigen/Eigen/src/SparseCore/SparseUtil.h +138 -115
  348. package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
  349. package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
  350. package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
  351. package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
  352. package/eigen/Eigen/src/SparseLU/SparseLU.h +756 -710
  353. package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
  354. package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
  355. package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
  356. package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +245 -301
  357. package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
  358. package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
  359. package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +89 -100
  360. package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
  361. package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
  362. package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
  363. package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +124 -132
  364. package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
  365. package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
  366. package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
  367. package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
  368. package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
  369. package/eigen/Eigen/src/SparseQR/SparseQR.h +450 -502
  370. package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -93
  371. package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
  372. package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
  373. package/eigen/Eigen/src/StlSupport/details.h +48 -50
  374. package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
  375. package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -730
  376. package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
  377. package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
  378. package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
  379. package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
  380. package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
  381. package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
  382. package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
  383. package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
  384. package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
  385. package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
  386. package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
  387. package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
  388. package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
  389. package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +428 -464
  390. package/eigen/Eigen/src/misc/Image.h +41 -43
  391. package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
  392. package/eigen/Eigen/src/misc/Kernel.h +39 -41
  393. package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
  394. package/eigen/Eigen/src/misc/blas.h +83 -426
  395. package/eigen/Eigen/src/misc/lapacke.h +9972 -16179
  396. package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
  397. package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
  398. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
  399. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
  400. package/eigen/Eigen/src/plugins/{BlockMethods.h → BlockMethods.inc} +434 -506
  401. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
  402. package/eigen/Eigen/src/plugins/{CommonCwiseUnaryOps.h → CommonCwiseUnaryOps.inc} +58 -68
  403. package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
  404. package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
  405. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
  406. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
  407. package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
  408. package/package.json +1 -1
  409. package/eigen/COPYING.APACHE +0 -203
  410. package/eigen/COPYING.BSD +0 -26
  411. package/eigen/COPYING.GPL +0 -674
  412. package/eigen/COPYING.LGPL +0 -502
  413. package/eigen/COPYING.MINPACK +0 -51
  414. package/eigen/COPYING.MPL2 +0 -373
  415. package/eigen/COPYING.README +0 -18
  416. package/eigen/Eigen/src/Core/BooleanRedux.h +0 -162
  417. package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -258
  418. package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +0 -120
  419. package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +0 -694
  420. package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
  421. package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
  422. package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
  423. package/eigen/Eigen/src/misc/lapack.h +0 -152
  424. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -358
  425. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -696
  426. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
  427. package/eigen/Eigen/src/plugins/IndexedViewMethods.h +0 -262
  428. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
  429. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -95
  430. package/eigen/Eigen/src/plugins/ReshapedMethods.h +0 -149
  431. package/eigen/README.md +0 -5
@@ -10,6 +10,9 @@
10
10
  #ifndef EIGEN_PACKET_MATH_ALTIVEC_H
11
11
  #define EIGEN_PACKET_MATH_ALTIVEC_H
12
12
 
13
+ // IWYU pragma: private
14
+ #include "../../InternalHeaderCheck.h"
15
+
13
16
  namespace Eigen {
14
17
 
15
18
  namespace internal {
@@ -24,127 +27,137 @@ namespace internal {
24
27
 
25
28
  // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
26
29
  #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
27
- #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
30
+ #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
28
31
  #endif
29
32
 
30
- typedef __vector float Packet4f;
31
- typedef __vector int Packet4i;
32
- typedef __vector unsigned int Packet4ui;
33
- typedef __vector __bool int Packet4bi;
34
- typedef __vector short int Packet8s;
35
- typedef __vector unsigned short int Packet8us;
36
- typedef __vector signed char Packet16c;
37
- typedef __vector unsigned char Packet16uc;
38
- typedef eigen_packet_wrapper<__vector unsigned short int,0> Packet8bf;
33
+ typedef __vector float Packet4f;
34
+ typedef __vector int Packet4i;
35
+ typedef __vector unsigned int Packet4ui;
36
+ typedef __vector __bool int Packet4bi;
37
+ typedef __vector short int Packet8s;
38
+ typedef __vector unsigned short int Packet8us;
39
+ typedef __vector __bool short Packet8bi;
40
+ typedef __vector signed char Packet16c;
41
+ typedef __vector unsigned char Packet16uc;
42
+ typedef eigen_packet_wrapper<__vector unsigned short int, 0> Packet8bf;
39
43
 
40
44
  // We don't want to write the same code all the time, but we need to reuse the constants
41
45
  // and it doesn't really work to declare them global, so we define macros instead
42
- #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
43
- Packet4f p4f_##NAME = {X, X, X, X}
46
+ #define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME, X) Packet4f p4f_##NAME = {X, X, X, X}
44
47
 
45
- #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
46
- Packet4i p4i_##NAME = vec_splat_s32(X)
48
+ #define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME, X) Packet4i p4i_##NAME = vec_splat_s32(X)
47
49
 
48
- #define _EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME,X) \
49
- Packet4ui p4ui_##NAME = {X, X, X, X}
50
+ #define EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME, X) Packet4ui p4ui_##NAME = {X, X, X, X}
50
51
 
51
- #define _EIGEN_DECLARE_CONST_FAST_Packet8us(NAME,X) \
52
- Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}
52
+ #define EIGEN_DECLARE_CONST_FAST_Packet8us(NAME, X) Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}
53
53
 
54
- #define _EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME,X) \
54
+ #define EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME, X) \
55
55
  Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}
56
56
 
57
- #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
58
- Packet4f p4f_##NAME = pset1<Packet4f>(X)
57
+ #define EIGEN_DECLARE_CONST_Packet4f(NAME, X) Packet4f p4f_##NAME = pset1<Packet4f>(X)
59
58
 
60
- #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
61
- Packet4i p4i_##NAME = pset1<Packet4i>(X)
59
+ #define EIGEN_DECLARE_CONST_Packet4i(NAME, X) Packet4i p4i_##NAME = pset1<Packet4i>(X)
62
60
 
63
- #define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
64
- Packet2d p2d_##NAME = pset1<Packet2d>(X)
61
+ #define EIGEN_DECLARE_CONST_Packet2d(NAME, X) Packet2d p2d_##NAME = pset1<Packet2d>(X)
65
62
 
66
- #define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
67
- Packet2l p2l_##NAME = pset1<Packet2l>(X)
63
+ #define EIGEN_DECLARE_CONST_Packet2l(NAME, X) Packet2l p2l_##NAME = pset1<Packet2l>(X)
68
64
 
69
- #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
65
+ #define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
70
66
  const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
71
67
 
72
68
  #define DST_CHAN 1
73
69
  #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
74
- #define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type
70
+ #define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type
75
71
 
76
72
  // These constants are endian-agnostic
77
- static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
78
- static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
79
- static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
80
- static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
81
- static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
82
- static _EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u);
83
- static _EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
84
- static _EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1}
85
- static _EIGEN_DECLARE_CONST_FAST_Packet16uc(ONE,1);
86
- static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
73
+ static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
74
+ static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
75
+ static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
76
+ static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16, -16); //{ -16, -16, -16, -16}
77
+ static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1, -1); //{ -1, -1, -1, -1}
78
+ static EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u);
79
+ static EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
80
+ static EIGEN_DECLARE_CONST_FAST_Packet8us(ONE, 1); //{ 1, 1, 1, 1, 1, 1, 1, 1}
81
+ static Packet4f p4f_MZERO =
82
+ (Packet4f)vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
87
83
  #ifndef __VSX__
88
- static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
84
+ static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
89
85
  #endif
90
86
 
91
- static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
92
- static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
93
- static Packet8s p8s_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 };
94
- static Packet8us p8us_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 };
87
+ static Packet4f p4f_COUNTDOWN = {0.0, 1.0, 2.0, 3.0};
88
+ static Packet4i p4i_COUNTDOWN = {0, 1, 2, 3};
89
+ static Packet8s p8s_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7};
90
+ static Packet8us p8us_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7};
91
+
92
+ static Packet16c p16c_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
93
+ static Packet16uc p16uc_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
95
94
 
96
- static Packet16c p16c_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7,
97
- 8, 9, 10, 11, 12, 13, 14, 15};
98
- static Packet16uc p16uc_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7,
99
- 8, 9, 10, 11, 12, 13, 14, 15};
95
+ static Packet16uc p16uc_REVERSE32 = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
96
+ static Packet16uc p16uc_REVERSE16 = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
97
+ static Packet16uc p16uc_REVERSE8 = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
100
98
 
101
- static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
102
- static Packet16uc p16uc_REVERSE16 = { 14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1 };
103
- static Packet16uc p16uc_REVERSE8 = { 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 };
99
+ #ifdef _BIG_ENDIAN
100
+ static Packet16uc p16uc_DUPLICATE32_HI = {0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7};
101
+ #endif
102
+ static const Packet16uc p16uc_DUPLICATE16_EVEN = {0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13};
103
+ static const Packet16uc p16uc_DUPLICATE16_ODD = {2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15};
104
104
 
105
- static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
106
- static Packet16uc p16uc_DUPLICATE16_HI = { 0,1,0,1, 2,3,2,3, 4,5,4,5, 6,7,6,7 };
107
- static Packet16uc p16uc_DUPLICATE8_HI = { 0,0, 1,1, 2,2, 3,3, 4,4, 5,5, 6,6, 7,7 };
108
- static const Packet16uc p16uc_DUPLICATE16_EVEN= { 0,1 ,0,1, 4,5, 4,5, 8,9, 8,9, 12,13, 12,13 };
109
- static const Packet16uc p16uc_DUPLICATE16_ODD = { 2,3 ,2,3, 6,7, 6,7, 10,11, 10,11, 14,15, 14,15 };
105
+ static Packet16uc p16uc_QUADRUPLICATE16_HI = {0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3};
106
+ static Packet16uc p16uc_QUADRUPLICATE16 = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3};
110
107
 
111
- static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3 };
108
+ static Packet16uc p16uc_MERGEE16 = {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
109
+ static Packet16uc p16uc_MERGEO16 = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
110
+ #ifdef _BIG_ENDIAN
111
+ static Packet16uc p16uc_MERGEH16 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
112
+ #else
113
+ static Packet16uc p16uc_MERGEL16 = {2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
114
+ #endif
112
115
 
113
116
  // Handle endianness properly while loading constants
114
117
  // Define global static constants:
115
118
  #ifdef _BIG_ENDIAN
116
119
  static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
117
- #ifdef __VSX__
118
- static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
119
- #endif
120
- static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
121
- static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
122
- static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
120
+ static Packet16uc p16uc_PSET32_WODD =
121
+ vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
122
+ 8); //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
123
+ static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
124
+ 8); //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
125
+ static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc)vec_abs(p4i_MINUS16), 3),
126
+ 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
123
127
  #else
124
128
  static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
125
- static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
126
- static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
127
- static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
128
- static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
129
- #endif // _BIG_ENDIAN
129
+ static Packet16uc p16uc_PSET32_WODD =
130
+ vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
131
+ 8); //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
132
+ static Packet16uc p16uc_PSET32_WEVEN =
133
+ vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
134
+ 8); //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
135
+ static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc)vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO,
136
+ 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
137
+ #endif // _BIG_ENDIAN
138
+
139
+ static Packet16uc p16uc_PSET64_HI = (Packet16uc)vec_mergeh(
140
+ (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
141
+ static Packet16uc p16uc_PSET64_LO = (Packet16uc)vec_mergel(
142
+ (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
143
+ static Packet16uc p16uc_TRANSPOSE64_HI =
144
+ p16uc_PSET64_HI + p16uc_HALF64_0_16; //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
145
+ static Packet16uc p16uc_TRANSPOSE64_LO =
146
+ p16uc_PSET64_LO + p16uc_HALF64_0_16; //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
147
+
148
+ static Packet16uc p16uc_COMPLEX32_REV =
149
+ vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
130
150
 
131
- static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
132
- static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
133
- static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16; //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
134
- static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16; //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
135
-
136
- static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
137
-
138
- #ifdef _BIG_ENDIAN
139
- static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
151
+ #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
152
+ #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
140
153
  #else
141
- static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
142
- #endif // _BIG_ENDIAN
154
+ #define EIGEN_PPC_PREFETCH(ADDR) asm(" dcbt [%[addr]]\n" ::[addr] "r"(ADDR) : "cc");
155
+ #endif
143
156
 
144
- #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
145
- #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
157
+ #if EIGEN_COMP_LLVM
158
+ #define LOAD_STORE_UNROLL_16 _Pragma("unroll 16")
146
159
  #else
147
- #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
160
+ #define LOAD_STORE_UNROLL_16 _Pragma("GCC unroll(16)")
148
161
  #endif
149
162
 
150
163
  template <>
@@ -155,7 +168,6 @@ struct packet_traits<float> : default_packet_traits {
155
168
  Vectorizable = 1,
156
169
  AlignedOnScalar = 1,
157
170
  size = 4,
158
- HasHalfPacket = 1,
159
171
 
160
172
  HasAdd = 1,
161
173
  HasSub = 1,
@@ -166,25 +178,31 @@ struct packet_traits<float> : default_packet_traits {
166
178
  HasAbs = 1,
167
179
  HasSin = EIGEN_FAST_MATH,
168
180
  HasCos = EIGEN_FAST_MATH,
181
+ HasACos = 1,
182
+ HasASin = 1,
183
+ HasATan = 1,
184
+ HasATanh = 1,
169
185
  HasLog = 1,
170
186
  HasExp = 1,
171
- #ifdef __VSX__
187
+ #ifdef EIGEN_VECTORIZE_VSX
188
+ HasCmp = 1,
189
+ HasPow = 1,
172
190
  HasSqrt = 1,
191
+ HasCbrt = 1,
173
192
  #if !EIGEN_COMP_CLANG
174
193
  HasRsqrt = 1,
175
194
  #else
176
195
  HasRsqrt = 0,
177
196
  #endif
197
+ HasTanh = EIGEN_FAST_MATH,
198
+ HasErf = EIGEN_FAST_MATH,
199
+ HasErfc = EIGEN_FAST_MATH,
178
200
  #else
179
201
  HasSqrt = 0,
180
202
  HasRsqrt = 0,
181
- HasTanh = EIGEN_FAST_MATH,
182
- HasErf = EIGEN_FAST_MATH,
203
+ HasTanh = 0,
204
+ HasErf = 0,
183
205
  #endif
184
- HasRound = 1,
185
- HasFloor = 1,
186
- HasCeil = 1,
187
- HasRint = 1,
188
206
  HasNegate = 1,
189
207
  HasBlend = 1
190
208
  };
@@ -197,7 +215,6 @@ struct packet_traits<bfloat16> : default_packet_traits {
197
215
  Vectorizable = 1,
198
216
  AlignedOnScalar = 1,
199
217
  size = 8,
200
- HasHalfPacket = 0,
201
218
 
202
219
  HasAdd = 1,
203
220
  HasSub = 1,
@@ -210,7 +227,7 @@ struct packet_traits<bfloat16> : default_packet_traits {
210
227
  HasCos = EIGEN_FAST_MATH,
211
228
  HasLog = 1,
212
229
  HasExp = 1,
213
- #ifdef __VSX__
230
+ #ifdef EIGEN_VECTORIZE_VSX
214
231
  HasSqrt = 1,
215
232
  #if !EIGEN_COMP_CLANG
216
233
  HasRsqrt = 1,
@@ -220,13 +237,9 @@ struct packet_traits<bfloat16> : default_packet_traits {
220
237
  #else
221
238
  HasSqrt = 0,
222
239
  HasRsqrt = 0,
223
- HasTanh = EIGEN_FAST_MATH,
224
- HasErf = EIGEN_FAST_MATH,
225
240
  #endif
226
- HasRound = 1,
227
- HasFloor = 1,
228
- HasCeil = 1,
229
- HasRint = 1,
241
+ HasTanh = 0,
242
+ HasErf = 0,
230
243
  HasNegate = 1,
231
244
  HasBlend = 1
232
245
  };
@@ -240,14 +253,18 @@ struct packet_traits<int> : default_packet_traits {
240
253
  Vectorizable = 1,
241
254
  AlignedOnScalar = 1,
242
255
  size = 4,
243
- HasHalfPacket = 0,
244
256
 
245
- HasAdd = 1,
246
- HasSub = 1,
257
+ HasAdd = 1,
258
+ HasSub = 1,
247
259
  HasShift = 1,
248
- HasMul = 1,
249
- HasDiv = 0,
250
- HasBlend = 1
260
+ HasMul = 1,
261
+ #if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
262
+ HasDiv = 1,
263
+ #else
264
+ HasDiv = 0,
265
+ #endif
266
+ HasBlend = 1,
267
+ HasCmp = 1
251
268
  };
252
269
  };
253
270
 
@@ -259,13 +276,13 @@ struct packet_traits<short int> : default_packet_traits {
259
276
  Vectorizable = 1,
260
277
  AlignedOnScalar = 1,
261
278
  size = 8,
262
- HasHalfPacket = 0,
263
279
 
264
- HasAdd = 1,
265
- HasSub = 1,
266
- HasMul = 1,
267
- HasDiv = 0,
268
- HasBlend = 1
280
+ HasAdd = 1,
281
+ HasSub = 1,
282
+ HasMul = 1,
283
+ HasDiv = 0,
284
+ HasBlend = 1,
285
+ HasCmp = 1
269
286
  };
270
287
  };
271
288
 
@@ -277,13 +294,13 @@ struct packet_traits<unsigned short int> : default_packet_traits {
277
294
  Vectorizable = 1,
278
295
  AlignedOnScalar = 1,
279
296
  size = 8,
280
- HasHalfPacket = 0,
281
297
 
282
- HasAdd = 1,
283
- HasSub = 1,
284
- HasMul = 1,
285
- HasDiv = 0,
286
- HasBlend = 1
298
+ HasAdd = 1,
299
+ HasSub = 1,
300
+ HasMul = 1,
301
+ HasDiv = 0,
302
+ HasBlend = 1,
303
+ HasCmp = 1
287
304
  };
288
305
  };
289
306
 
@@ -295,13 +312,13 @@ struct packet_traits<signed char> : default_packet_traits {
295
312
  Vectorizable = 1,
296
313
  AlignedOnScalar = 1,
297
314
  size = 16,
298
- HasHalfPacket = 0,
299
315
 
300
- HasAdd = 1,
301
- HasSub = 1,
302
- HasMul = 1,
303
- HasDiv = 0,
304
- HasBlend = 1
316
+ HasAdd = 1,
317
+ HasSub = 1,
318
+ HasMul = 1,
319
+ HasDiv = 0,
320
+ HasBlend = 1,
321
+ HasCmp = 1
305
322
  };
306
323
  };
307
324
 
@@ -313,273 +330,442 @@ struct packet_traits<unsigned char> : default_packet_traits {
313
330
  Vectorizable = 1,
314
331
  AlignedOnScalar = 1,
315
332
  size = 16,
316
- HasHalfPacket = 0,
317
333
 
318
- HasAdd = 1,
319
- HasSub = 1,
320
- HasMul = 1,
321
- HasDiv = 0,
322
- HasBlend = 1
334
+ HasAdd = 1,
335
+ HasSub = 1,
336
+ HasMul = 1,
337
+ HasDiv = 0,
338
+ HasBlend = 1,
339
+ HasCmp = 1
323
340
  };
324
341
  };
325
342
 
326
- template<> struct unpacket_traits<Packet4f>
327
- {
328
- typedef float type;
329
- typedef Packet4f half;
330
- typedef Packet4i integer_packet;
331
- enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
343
+ template <>
344
+ struct unpacket_traits<Packet4f> {
345
+ typedef float type;
346
+ typedef Packet4f half;
347
+ typedef Packet4i integer_packet;
348
+ enum {
349
+ size = 4,
350
+ alignment = Aligned16,
351
+ vectorizable = true,
352
+ masked_load_available = false,
353
+ masked_store_available = false
354
+ };
332
355
  };
333
- template<> struct unpacket_traits<Packet4i>
334
- {
335
- typedef int type;
336
- typedef Packet4i half;
337
- enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
356
+ template <>
357
+ struct unpacket_traits<Packet4i> {
358
+ typedef int type;
359
+ typedef Packet4i half;
360
+ enum {
361
+ size = 4,
362
+ alignment = Aligned16,
363
+ vectorizable = true,
364
+ masked_load_available = false,
365
+ masked_store_available = false
366
+ };
338
367
  };
339
- template<> struct unpacket_traits<Packet8s>
340
- {
368
+ template <>
369
+ struct unpacket_traits<Packet8s> {
341
370
  typedef short int type;
342
- typedef Packet8s half;
343
- enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
371
+ typedef Packet8s half;
372
+ enum {
373
+ size = 8,
374
+ alignment = Aligned16,
375
+ vectorizable = true,
376
+ masked_load_available = false,
377
+ masked_store_available = false
378
+ };
344
379
  };
345
- template<> struct unpacket_traits<Packet8us>
346
- {
380
+ template <>
381
+ struct unpacket_traits<Packet8us> {
347
382
  typedef unsigned short int type;
348
- typedef Packet8us half;
349
- enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
383
+ typedef Packet8us half;
384
+ enum {
385
+ size = 8,
386
+ alignment = Aligned16,
387
+ vectorizable = true,
388
+ masked_load_available = false,
389
+ masked_store_available = false
390
+ };
350
391
  };
351
392
 
352
- template<> struct unpacket_traits<Packet16c>
353
- {
393
+ template <>
394
+ struct unpacket_traits<Packet16c> {
354
395
  typedef signed char type;
355
- typedef Packet16c half;
356
- enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
396
+ typedef Packet16c half;
397
+ enum {
398
+ size = 16,
399
+ alignment = Aligned16,
400
+ vectorizable = true,
401
+ masked_load_available = false,
402
+ masked_store_available = false
403
+ };
357
404
  };
358
- template<> struct unpacket_traits<Packet16uc>
359
- {
405
+ template <>
406
+ struct unpacket_traits<Packet16uc> {
360
407
  typedef unsigned char type;
361
- typedef Packet16uc half;
362
- enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
408
+ typedef Packet16uc half;
409
+ enum {
410
+ size = 16,
411
+ alignment = Aligned16,
412
+ vectorizable = true,
413
+ masked_load_available = false,
414
+ masked_store_available = false
415
+ };
363
416
  };
364
417
 
365
- template<> struct unpacket_traits<Packet8bf>
366
- {
418
+ template <>
419
+ struct unpacket_traits<Packet8bf> {
367
420
  typedef bfloat16 type;
368
- typedef Packet8bf half;
369
- enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
421
+ typedef Packet8bf half;
422
+ enum {
423
+ size = 8,
424
+ alignment = Aligned16,
425
+ vectorizable = true,
426
+ masked_load_available = false,
427
+ masked_store_available = false
428
+ };
370
429
  };
371
- inline std::ostream & operator <<(std::ostream & s, const Packet16c & v)
372
- {
373
- union {
374
- Packet16c v;
375
- signed char n[16];
376
- } vt;
377
- vt.v = v;
378
- for (int i=0; i< 16; i++)
379
- s << vt.n[i] << ", ";
380
- return s;
430
+
431
+ template <typename Packet>
432
+ EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet) * from) {
433
+ // some versions of GCC throw "unused-but-set-parameter".
434
+ // ignoring these warnings for now.
435
+ EIGEN_UNUSED_VARIABLE(from);
436
+ EIGEN_DEBUG_ALIGNED_LOAD
437
+ #ifdef EIGEN_VECTORIZE_VSX
438
+ return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
439
+ #else
440
+ return vec_ld(0, from);
441
+ #endif
381
442
  }
382
443
 
383
- inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
384
- {
385
- union {
386
- Packet16uc v;
387
- unsigned char n[16];
388
- } vt;
389
- vt.v = v;
390
- for (int i=0; i< 16; i++)
391
- s << vt.n[i] << ", ";
392
- return s;
444
+ // Need to define them first or we get specialization after instantiation errors
445
+ template <>
446
+ EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
447
+ return pload_common<Packet4f>(from);
393
448
  }
394
449
 
395
- inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
396
- {
397
- union {
398
- Packet4f v;
399
- float n[4];
400
- } vt;
401
- vt.v = v;
402
- s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
403
- return s;
450
+ template <>
451
+ EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
452
+ return pload_common<Packet4i>(from);
404
453
  }
405
454
 
406
- inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
407
- {
408
- union {
409
- Packet4i v;
410
- int n[4];
411
- } vt;
412
- vt.v = v;
413
- s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
414
- return s;
455
+ template <>
456
+ EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from) {
457
+ return pload_common<Packet8s>(from);
415
458
  }
416
459
 
417
- inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
418
- {
419
- union {
420
- Packet4ui v;
421
- unsigned int n[4];
422
- } vt;
423
- vt.v = v;
424
- s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
425
- return s;
460
+ template <>
461
+ EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from) {
462
+ return pload_common<Packet8us>(from);
463
+ }
464
+
465
+ template <>
466
+ EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const signed char* from) {
467
+ return pload_common<Packet16c>(from);
468
+ }
469
+
470
+ template <>
471
+ EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const unsigned char* from) {
472
+ return pload_common<Packet16uc>(from);
473
+ }
474
+
475
+ template <>
476
+ EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) {
477
+ return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
426
478
  }
427
479
 
428
480
  template <typename Packet>
429
- EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from)
430
- {
481
+ EIGEN_ALWAYS_INLINE Packet pload_ignore(const __UNPACK_TYPE__(Packet) * from) {
431
482
  // some versions of GCC throw "unused-but-set-parameter".
432
483
  // ignoring these warnings for now.
433
484
  EIGEN_UNUSED_VARIABLE(from);
434
485
  EIGEN_DEBUG_ALIGNED_LOAD
435
- #ifdef __VSX__
486
+ // Ignore partial input memory initialized
487
+ #if !EIGEN_COMP_LLVM
488
+ #pragma GCC diagnostic push
489
+ #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
490
+ #endif
491
+ #ifdef EIGEN_VECTORIZE_VSX
436
492
  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
437
493
  #else
438
494
  return vec_ld(0, from);
439
495
  #endif
496
+ #if !EIGEN_COMP_LLVM
497
+ #pragma GCC diagnostic pop
498
+ #endif
440
499
  }
441
500
 
442
- // Need to define them first or we get specialization after instantiation errors
443
- template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
444
- {
445
- return pload_common<Packet4f>(from);
501
+ template <>
502
+ EIGEN_ALWAYS_INLINE Packet8bf pload_ignore<Packet8bf>(const bfloat16* from) {
503
+ return pload_ignore<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
446
504
  }
447
505
 
448
- template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
449
- {
450
- return pload_common<Packet4i>(from);
506
+ template <typename Packet>
507
+ EIGEN_ALWAYS_INLINE Packet pload_partial_common(const __UNPACK_TYPE__(Packet) * from, const Index n,
508
+ const Index offset) {
509
+ // some versions of GCC throw "unused-but-set-parameter".
510
+ // ignoring these warnings for now.
511
+ const Index packet_size = unpacket_traits<Packet>::size;
512
+ eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
513
+ const Index size = sizeof(__UNPACK_TYPE__(Packet));
514
+ #ifdef _ARCH_PWR9
515
+ EIGEN_UNUSED_VARIABLE(packet_size);
516
+ EIGEN_DEBUG_ALIGNED_LOAD
517
+ EIGEN_UNUSED_VARIABLE(from);
518
+ Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
519
+ if (offset) {
520
+ Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
521
+ #ifdef _BIG_ENDIAN
522
+ load = Packet(vec_sro(Packet16uc(load), shift));
523
+ #else
524
+ load = Packet(vec_slo(Packet16uc(load), shift));
525
+ #endif
526
+ }
527
+ return load;
528
+ #else
529
+ if (n) {
530
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
531
+ unsigned char* load2 = reinterpret_cast<unsigned char*>(load + offset);
532
+ unsigned char* from2 = reinterpret_cast<unsigned char*>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
533
+ Index n2 = n * size;
534
+ if (16 <= n2) {
535
+ pstoreu(load2, ploadu<Packet16uc>(from2));
536
+ } else {
537
+ memcpy((void*)load2, (void*)from2, n2);
538
+ }
539
+ return pload_ignore<Packet>(load);
540
+ } else {
541
+ return Packet(pset1<Packet16uc>(0));
542
+ }
543
+ #endif
451
544
  }
452
545
 
453
- template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from)
454
- {
455
- return pload_common<Packet8s>(from);
546
+ template <>
547
+ EIGEN_ALWAYS_INLINE Packet4f pload_partial<Packet4f>(const float* from, const Index n, const Index offset) {
548
+ return pload_partial_common<Packet4f>(from, n, offset);
456
549
  }
457
550
 
458
- template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from)
459
- {
460
- return pload_common<Packet8us>(from);
551
+ template <>
552
+ EIGEN_ALWAYS_INLINE Packet4i pload_partial<Packet4i>(const int* from, const Index n, const Index offset) {
553
+ return pload_partial_common<Packet4i>(from, n, offset);
461
554
  }
462
555
 
463
- template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const signed char* from)
464
- {
465
- return pload_common<Packet16c>(from);
556
+ template <>
557
+ EIGEN_ALWAYS_INLINE Packet8s pload_partial<Packet8s>(const short int* from, const Index n, const Index offset) {
558
+ return pload_partial_common<Packet8s>(from, n, offset);
466
559
  }
467
560
 
468
- template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const unsigned char* from)
469
- {
470
- return pload_common<Packet16uc>(from);
561
+ template <>
562
+ EIGEN_ALWAYS_INLINE Packet8us pload_partial<Packet8us>(const unsigned short int* from, const Index n,
563
+ const Index offset) {
564
+ return pload_partial_common<Packet8us>(from, n, offset);
471
565
  }
472
566
 
473
- template<> EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from)
474
- {
475
- return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
567
+ template <>
568
+ EIGEN_ALWAYS_INLINE Packet8bf pload_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset) {
569
+ return pload_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
570
+ }
571
+
572
+ template <>
573
+ EIGEN_ALWAYS_INLINE Packet16c pload_partial<Packet16c>(const signed char* from, const Index n, const Index offset) {
574
+ return pload_partial_common<Packet16c>(from, n, offset);
575
+ }
576
+
577
+ template <>
578
+ EIGEN_ALWAYS_INLINE Packet16uc pload_partial<Packet16uc>(const unsigned char* from, const Index n, const Index offset) {
579
+ return pload_partial_common<Packet16uc>(from, n, offset);
476
580
  }
477
581
 
478
582
  template <typename Packet>
479
- EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){
583
+ EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet) * to, const Packet& from) {
480
584
  // some versions of GCC throw "unused-but-set-parameter" (float *to).
481
585
  // ignoring these warnings for now.
482
586
  EIGEN_UNUSED_VARIABLE(to);
483
587
  EIGEN_DEBUG_ALIGNED_STORE
484
- #ifdef __VSX__
588
+ #ifdef EIGEN_VECTORIZE_VSX
485
589
  vec_xst(from, 0, to);
486
590
  #else
487
591
  vec_st(from, 0, to);
488
592
  #endif
489
593
  }
490
594
 
491
- template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
492
- {
595
+ template <>
596
+ EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
493
597
  pstore_common<Packet4f>(to, from);
494
598
  }
495
599
 
496
- template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from)
497
- {
600
+ template <>
601
+ EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
498
602
  pstore_common<Packet4i>(to, from);
499
603
  }
500
604
 
501
- template<> EIGEN_STRONG_INLINE void pstore<short int>(short int* to, const Packet8s& from)
502
- {
605
+ template <>
606
+ EIGEN_STRONG_INLINE void pstore<short int>(short int* to, const Packet8s& from) {
503
607
  pstore_common<Packet8s>(to, from);
504
608
  }
505
609
 
506
- template<> EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int* to, const Packet8us& from)
507
- {
610
+ template <>
611
+ EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int* to, const Packet8us& from) {
508
612
  pstore_common<Packet8us>(to, from);
509
613
  }
510
614
 
511
- template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from)
512
- {
513
- pstore_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from);
615
+ template <>
616
+ EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) {
617
+ pstore_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val);
514
618
  }
515
619
 
516
- template<> EIGEN_STRONG_INLINE void pstore<signed char>(signed char* to, const Packet16c& from)
517
- {
620
+ template <>
621
+ EIGEN_STRONG_INLINE void pstore<signed char>(signed char* to, const Packet16c& from) {
518
622
  pstore_common<Packet16c>(to, from);
519
623
  }
520
624
 
521
- template<> EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char* to, const Packet16uc& from)
522
- {
625
+ template <>
626
+ EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char* to, const Packet16uc& from) {
523
627
  pstore_common<Packet16uc>(to, from);
524
628
  }
525
629
 
526
- template<typename Packet>
527
- EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from)
528
- {
630
+ template <typename Packet>
631
+ EIGEN_ALWAYS_INLINE void pstore_partial_common(__UNPACK_TYPE__(Packet) * to, const Packet& from, const Index n,
632
+ const Index offset) {
633
+ // some versions of GCC throw "unused-but-set-parameter" (float *to).
634
+ // ignoring these warnings for now.
635
+ const Index packet_size = unpacket_traits<Packet>::size;
636
+ eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
637
+ const Index size = sizeof(__UNPACK_TYPE__(Packet));
638
+ #ifdef _ARCH_PWR9
639
+ EIGEN_UNUSED_VARIABLE(packet_size);
640
+ EIGEN_UNUSED_VARIABLE(to);
641
+ EIGEN_DEBUG_ALIGNED_STORE
642
+ Packet store = from;
643
+ if (offset) {
644
+ Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
645
+ #ifdef _BIG_ENDIAN
646
+ store = Packet(vec_slo(Packet16uc(store), shift));
647
+ #else
648
+ store = Packet(vec_sro(Packet16uc(store), shift));
649
+ #endif
650
+ }
651
+ vec_xst_len(store, to, n * size);
652
+ #else
653
+ if (n) {
654
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
655
+ pstore(store, from);
656
+ unsigned char* store2 = reinterpret_cast<unsigned char*>(store + offset);
657
+ unsigned char* to2 = reinterpret_cast<unsigned char*>(to);
658
+ Index n2 = n * size;
659
+ if (16 <= n2) {
660
+ pstore(to2, ploadu<Packet16uc>(store2));
661
+ } else {
662
+ memcpy((void*)to2, (void*)store2, n2);
663
+ }
664
+ }
665
+ #endif
666
+ }
667
+
668
+ template <>
669
+ EIGEN_ALWAYS_INLINE void pstore_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset) {
670
+ pstore_partial_common<Packet4f>(to, from, n, offset);
671
+ }
672
+
673
+ template <>
674
+ EIGEN_ALWAYS_INLINE void pstore_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset) {
675
+ pstore_partial_common<Packet4i>(to, from, n, offset);
676
+ }
677
+
678
+ template <>
679
+ EIGEN_ALWAYS_INLINE void pstore_partial<short int>(short int* to, const Packet8s& from, const Index n,
680
+ const Index offset) {
681
+ pstore_partial_common<Packet8s>(to, from, n, offset);
682
+ }
683
+
684
+ template <>
685
+ EIGEN_ALWAYS_INLINE void pstore_partial<unsigned short int>(unsigned short int* to, const Packet8us& from,
686
+ const Index n, const Index offset) {
687
+ pstore_partial_common<Packet8us>(to, from, n, offset);
688
+ }
689
+
690
+ template <>
691
+ EIGEN_ALWAYS_INLINE void pstore_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n,
692
+ const Index offset) {
693
+ pstore_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val, n, offset);
694
+ }
695
+
696
+ template <>
697
+ EIGEN_ALWAYS_INLINE void pstore_partial<signed char>(signed char* to, const Packet16c& from, const Index n,
698
+ const Index offset) {
699
+ pstore_partial_common<Packet16c>(to, from, n, offset);
700
+ }
701
+
702
+ template <>
703
+ EIGEN_ALWAYS_INLINE void pstore_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n,
704
+ const Index offset) {
705
+ pstore_partial_common<Packet16uc>(to, from, n, offset);
706
+ }
707
+
708
+ template <typename Packet>
709
+ EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet) & from) {
529
710
  Packet v = {from, from, from, from};
530
711
  return v;
531
712
  }
532
713
 
533
- template<typename Packet>
534
- EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet)& from)
535
- {
714
+ template <typename Packet>
715
+ EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet) & from) {
536
716
  Packet v = {from, from, from, from, from, from, from, from};
537
717
  return v;
538
718
  }
539
719
 
540
- template<typename Packet>
541
- EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet)& from)
542
- {
720
+ template <typename Packet>
721
+ EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet) & from) {
543
722
  Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from};
544
723
  return v;
545
724
  }
546
725
 
547
- template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
726
+ template <>
727
+ EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
548
728
  return pset1_size4<Packet4f>(from);
549
729
  }
550
730
 
551
- template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
731
+ template <>
732
+ EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
552
733
  return pset1_size4<Packet4i>(from);
553
734
  }
554
735
 
555
- template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int& from) {
736
+ template <>
737
+ EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int& from) {
556
738
  return pset1_size8<Packet8s>(from);
557
739
  }
558
740
 
559
- template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int& from) {
741
+ template <>
742
+ EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int& from) {
560
743
  return pset1_size8<Packet8us>(from);
561
744
  }
562
745
 
563
- template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const signed char& from) {
746
+ template <>
747
+ EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const signed char& from) {
564
748
  return pset1_size16<Packet16c>(from);
565
749
  }
566
750
 
567
- template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const unsigned char& from) {
751
+ template <>
752
+ EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const unsigned char& from) {
568
753
  return pset1_size16<Packet16uc>(from);
569
754
  }
570
755
 
571
- template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
756
+ template <>
757
+ EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
572
758
  return reinterpret_cast<Packet4f>(pset1<Packet4i>(from));
573
759
  }
574
760
 
575
- template<> EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
761
+ template <>
762
+ EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
576
763
  return pset1_size8<Packet8us>(reinterpret_cast<const unsigned short int&>(from));
577
764
  }
578
765
 
579
- template<typename Packet> EIGEN_STRONG_INLINE void
580
- pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a,
581
- Packet& a0, Packet& a1, Packet& a2, Packet& a3)
582
- {
766
+ template <typename Packet>
767
+ EIGEN_STRONG_INLINE void pbroadcast4_common(const __UNPACK_TYPE__(Packet) * a, Packet& a0, Packet& a1, Packet& a2,
768
+ Packet& a3) {
583
769
  a3 = pload<Packet>(a);
584
770
  a0 = vec_splat(a3, 0);
585
771
  a1 = vec_splat(a3, 1);
@@ -587,781 +773,1514 @@ pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a,
587
773
  a3 = vec_splat(a3, 3);
588
774
  }
589
775
 
590
- template<> EIGEN_STRONG_INLINE void
591
- pbroadcast4<Packet4f>(const float *a,
592
- Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
593
- {
776
+ template <>
777
+ EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
594
778
  pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
595
779
  }
596
- template<> EIGEN_STRONG_INLINE void
597
- pbroadcast4<Packet4i>(const int *a,
598
- Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
599
- {
780
+ template <>
781
+ EIGEN_STRONG_INLINE void pbroadcast4<Packet4i>(const int* a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) {
600
782
  pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
601
783
  }
602
784
 
603
- template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride)
604
- {
605
- EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
606
- a[0] = from[0*stride];
607
- a[1] = from[1*stride];
608
- a[2] = from[2*stride];
609
- a[3] = from[3*stride];
610
- return pload<Packet>(a);
785
+ template <typename Packet>
786
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_common(const __UNPACK_TYPE__(Packet) * from, Index stride,
787
+ const Index n = unpacket_traits<Packet>::size) {
788
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
789
+ eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
790
+ if (stride == 1) {
791
+ if (n == unpacket_traits<Packet>::size) {
792
+ return ploadu<Packet>(from);
793
+ } else {
794
+ return ploadu_partial<Packet>(from, n);
795
+ }
796
+ } else {
797
+ LOAD_STORE_UNROLL_16
798
+ for (Index i = 0; i < n; i++) {
799
+ a[i] = from[i * stride];
800
+ }
801
+ // Leave rest of the array uninitialized
802
+ return pload_ignore<Packet>(a);
803
+ }
611
804
  }
612
805
 
613
- template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
614
- {
806
+ template <>
807
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
615
808
  return pgather_common<Packet4f>(from, stride);
616
809
  }
617
810
 
618
- template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
619
- {
811
+ template <>
812
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
620
813
  return pgather_common<Packet4i>(from, stride);
621
814
  }
622
815
 
623
- template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size8(const __UNPACK_TYPE__(Packet)* from, Index stride)
624
- {
625
- EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8];
626
- a[0] = from[0*stride];
627
- a[1] = from[1*stride];
628
- a[2] = from[2*stride];
629
- a[3] = from[3*stride];
630
- a[4] = from[4*stride];
631
- a[5] = from[5*stride];
632
- a[6] = from[6*stride];
633
- a[7] = from[7*stride];
634
- return pload<Packet>(a);
816
+ template <>
817
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather<short int, Packet8s>(const short int* from, Index stride) {
818
+ return pgather_common<Packet8s>(from, stride);
635
819
  }
636
820
 
637
- template<> EIGEN_DEVICE_FUNC inline Packet8s pgather<short int, Packet8s>(const short int* from, Index stride)
638
- {
639
- return pgather_size8<Packet8s>(from, stride);
821
+ template <>
822
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from,
823
+ Index stride) {
824
+ return pgather_common<Packet8us>(from, stride);
640
825
  }
641
826
 
642
- template<> EIGEN_DEVICE_FUNC inline Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from, Index stride)
643
- {
644
- return pgather_size8<Packet8us>(from, stride);
827
+ template <>
828
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride) {
829
+ return pgather_common<Packet8bf>(from, stride);
645
830
  }
646
831
 
647
- template<> EIGEN_DEVICE_FUNC inline Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
648
- {
649
- return pgather_size8<Packet8bf>(from, stride);
832
+ template <>
833
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride) {
834
+ return pgather_common<Packet16c>(from, stride);
650
835
  }
651
836
 
652
- template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size16(const __UNPACK_TYPE__(Packet)* from, Index stride)
653
- {
654
- EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16];
655
- a[0] = from[0*stride];
656
- a[1] = from[1*stride];
657
- a[2] = from[2*stride];
658
- a[3] = from[3*stride];
659
- a[4] = from[4*stride];
660
- a[5] = from[5*stride];
661
- a[6] = from[6*stride];
662
- a[7] = from[7*stride];
663
- a[8] = from[8*stride];
664
- a[9] = from[9*stride];
665
- a[10] = from[10*stride];
666
- a[11] = from[11*stride];
667
- a[12] = from[12*stride];
668
- a[13] = from[13*stride];
669
- a[14] = from[14*stride];
670
- a[15] = from[15*stride];
671
- return pload<Packet>(a);
837
+ template <>
838
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from,
839
+ Index stride) {
840
+ return pgather_common<Packet16uc>(from, stride);
672
841
  }
673
842
 
843
+ template <>
844
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather_partial<float, Packet4f>(const float* from, Index stride,
845
+ const Index n) {
846
+ return pgather_common<Packet4f>(from, stride, n);
847
+ }
674
848
 
675
- template<> EIGEN_DEVICE_FUNC inline Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride)
676
- {
677
- return pgather_size16<Packet16c>(from, stride);
849
+ template <>
850
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather_partial<int, Packet4i>(const int* from, Index stride,
851
+ const Index n) {
852
+ return pgather_common<Packet4i>(from, stride, n);
678
853
  }
679
854
 
680
- template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from, Index stride)
681
- {
682
- return pgather_size16<Packet16uc>(from, stride);
683
- }
684
-
685
- template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
686
- {
687
- EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
688
- pstore<__UNPACK_TYPE__(Packet)>(a, from);
689
- to[0*stride] = a[0];
690
- to[1*stride] = a[1];
691
- to[2*stride] = a[2];
692
- to[3*stride] = a[3];
855
+ template <>
856
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather_partial<short int, Packet8s>(const short int* from, Index stride,
857
+ const Index n) {
858
+ return pgather_common<Packet8s>(from, stride, n);
693
859
  }
694
860
 
695
- template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
696
- {
697
- pscatter_size4<Packet4f>(to, from, stride);
698
- }
699
-
700
- template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
701
- {
702
- pscatter_size4<Packet4i>(to, from, stride);
861
+ template <>
862
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us
863
+ pgather_partial<unsigned short int, Packet8us>(const unsigned short int* from, Index stride, const Index n) {
864
+ return pgather_common<Packet8us>(from, stride, n);
703
865
  }
704
-
705
- template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size8(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
706
- {
707
- EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8];
708
- pstore<__UNPACK_TYPE__(Packet)>(a, from);
709
- to[0*stride] = a[0];
710
- to[1*stride] = a[1];
711
- to[2*stride] = a[2];
712
- to[3*stride] = a[3];
713
- to[4*stride] = a[4];
714
- to[5*stride] = a[5];
715
- to[6*stride] = a[6];
716
- to[7*stride] = a[7];
717
- }
718
-
719
-
720
- template<> EIGEN_DEVICE_FUNC inline void pscatter<short int, Packet8s>(short int* to, const Packet8s& from, Index stride)
721
- {
722
- pscatter_size8<Packet8s>(to, from, stride);
723
- }
724
-
725
- template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride)
726
- {
727
- pscatter_size8<Packet8us>(to, from, stride);
728
- }
729
-
730
- template<> EIGEN_DEVICE_FUNC inline void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
731
- {
732
- pscatter_size8<Packet8bf>(to, from, stride);
866
+
867
+ template <>
868
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather_partial<bfloat16, Packet8bf>(const bfloat16* from, Index stride,
869
+ const Index n) {
870
+ return pgather_common<Packet8bf>(from, stride, n);
733
871
  }
734
872
 
735
- template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size16(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
736
- {
737
- EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16];
738
- pstore<__UNPACK_TYPE__(Packet)>(a, from);
739
- to[0*stride] = a[0];
740
- to[1*stride] = a[1];
741
- to[2*stride] = a[2];
742
- to[3*stride] = a[3];
743
- to[4*stride] = a[4];
744
- to[5*stride] = a[5];
745
- to[6*stride] = a[6];
746
- to[7*stride] = a[7];
747
- to[8*stride] = a[8];
748
- to[9*stride] = a[9];
749
- to[10*stride] = a[10];
750
- to[11*stride] = a[11];
751
- to[12*stride] = a[12];
752
- to[13*stride] = a[13];
753
- to[14*stride] = a[14];
754
- to[15*stride] = a[15];
755
- }
756
-
757
- template<> EIGEN_DEVICE_FUNC inline void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride)
758
- {
759
- pscatter_size16<Packet16c>(to, from, stride);
760
- }
761
-
762
- template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride)
763
- {
764
- pscatter_size16<Packet16uc>(to, from, stride);
873
+ template <>
874
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather_partial<signed char, Packet16c>(const signed char* from,
875
+ Index stride, const Index n) {
876
+ return pgather_common<Packet16c>(from, stride, n);
765
877
  }
766
878
 
767
- template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
768
- template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return pset1<Packet4i>(a) + p4i_COUNTDOWN; }
769
- template<> EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const short int& a) { return pset1<Packet8s>(a) + p8s_COUNTDOWN; }
770
- template<> EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const unsigned short int& a) { return pset1<Packet8us>(a) + p8us_COUNTDOWN; }
771
- template<> EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const signed char& a) { return pset1<Packet16c>(a) + p16c_COUNTDOWN; }
772
- template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const unsigned char& a) { return pset1<Packet16uc>(a) + p16uc_COUNTDOWN; }
879
+ template <>
880
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather_partial<unsigned char, Packet16uc>(const unsigned char* from,
881
+ Index stride,
882
+ const Index n) {
883
+ return pgather_common<Packet16uc>(from, stride, n);
884
+ }
773
885
 
774
- template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f> (const Packet4f& a, const Packet4f& b) { return a + b; }
775
- template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i> (const Packet4i& a, const Packet4i& b) { return a + b; }
776
- template<> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui> (const Packet4ui& a, const Packet4ui& b) { return a + b; }
777
- template<> EIGEN_STRONG_INLINE Packet8s padd<Packet8s> (const Packet8s& a, const Packet8s& b) { return a + b; }
778
- template<> EIGEN_STRONG_INLINE Packet8us padd<Packet8us> (const Packet8us& a, const Packet8us& b) { return a + b; }
779
- template<> EIGEN_STRONG_INLINE Packet16c padd<Packet16c> (const Packet16c& a, const Packet16c& b) { return a + b; }
780
- template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a + b; }
781
-
782
- template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f> (const Packet4f& a, const Packet4f& b) { return a - b; }
783
- template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i> (const Packet4i& a, const Packet4i& b) { return a - b; }
784
- template<> EIGEN_STRONG_INLINE Packet8s psub<Packet8s> (const Packet8s& a, const Packet8s& b) { return a - b; }
785
- template<> EIGEN_STRONG_INLINE Packet8us psub<Packet8us> (const Packet8us& a, const Packet8us& b) { return a - b; }
786
- template<> EIGEN_STRONG_INLINE Packet16c psub<Packet16c> (const Packet16c& a, const Packet16c& b) { return a - b; }
787
- template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a - b; }
788
-
789
- template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; }
790
- template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; }
886
+ template <typename Packet>
887
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_common(__UNPACK_TYPE__(Packet) * to, const Packet& from,
888
+ Index stride,
889
+ const Index n = unpacket_traits<Packet>::size) {
890
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
891
+ eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
892
+ if (stride == 1) {
893
+ if (n == unpacket_traits<Packet>::size) {
894
+ return pstoreu(to, from);
895
+ } else {
896
+ return pstoreu_partial(to, from, n);
897
+ }
898
+ } else {
899
+ pstore<__UNPACK_TYPE__(Packet)>(a, from);
900
+ LOAD_STORE_UNROLL_16
901
+ for (Index i = 0; i < n; i++) {
902
+ to[i * stride] = a[i];
903
+ }
904
+ }
905
+ }
791
906
 
792
- template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
793
- template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
907
+ template <>
908
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
909
+ pscatter_common<Packet4f>(to, from, stride);
910
+ }
794
911
 
795
- template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); }
796
- template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i> (const Packet4i& a, const Packet4i& b) { return a * b; }
797
- template<> EIGEN_STRONG_INLINE Packet8s pmul<Packet8s> (const Packet8s& a, const Packet8s& b) { return vec_mul(a,b); }
798
- template<> EIGEN_STRONG_INLINE Packet8us pmul<Packet8us> (const Packet8us& a, const Packet8us& b) { return vec_mul(a,b); }
799
- template<> EIGEN_STRONG_INLINE Packet16c pmul<Packet16c> (const Packet16c& a, const Packet16c& b) { return vec_mul(a,b); }
800
- template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_mul(a,b); }
801
-
802
-
803
- template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
804
- {
805
- #ifndef __VSX__ // VSX actually provides a div instruction
806
- Packet4f t, y_0, y_1;
912
+ template <>
913
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
914
+ pscatter_common<Packet4i>(to, from, stride);
915
+ }
807
916
 
808
- // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
809
- y_0 = vec_re(b);
917
+ template <>
918
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<short int, Packet8s>(short int* to, const Packet8s& from,
919
+ Index stride) {
920
+ pscatter_common<Packet8s>(to, from, stride);
921
+ }
810
922
 
811
- // Do one Newton-Raphson iteration to get the needed accuracy
812
- t = vec_nmsub(y_0, b, p4f_ONE);
813
- y_1 = vec_madd(y_0, t, y_0);
923
+ template <>
924
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned short int, Packet8us>(unsigned short int* to,
925
+ const Packet8us& from,
926
+ Index stride) {
927
+ pscatter_common<Packet8us>(to, from, stride);
928
+ }
814
929
 
815
- return vec_madd(a, y_1, p4f_MZERO);
930
+ template <>
931
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from,
932
+ Index stride) {
933
+ pscatter_common<Packet8bf>(to, from, stride);
934
+ }
935
+
936
+ template <>
937
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from,
938
+ Index stride) {
939
+ pscatter_common<Packet16c>(to, from, stride);
940
+ }
941
+
942
+ template <>
943
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned char, Packet16uc>(unsigned char* to,
944
+ const Packet16uc& from, Index stride) {
945
+ pscatter_common<Packet16uc>(to, from, stride);
946
+ }
947
+
948
+ template <>
949
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<float, Packet4f>(float* to, const Packet4f& from,
950
+ Index stride, const Index n) {
951
+ pscatter_common<Packet4f>(to, from, stride, n);
952
+ }
953
+
954
+ template <>
955
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<int, Packet4i>(int* to, const Packet4i& from, Index stride,
956
+ const Index n) {
957
+ pscatter_common<Packet4i>(to, from, stride, n);
958
+ }
959
+
960
+ template <>
961
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<short int, Packet8s>(short int* to, const Packet8s& from,
962
+ Index stride, const Index n) {
963
+ pscatter_common<Packet8s>(to, from, stride, n);
964
+ }
965
+
966
+ template <>
967
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned short int, Packet8us>(unsigned short int* to,
968
+ const Packet8us& from,
969
+ Index stride,
970
+ const Index n) {
971
+ pscatter_common<Packet8us>(to, from, stride, n);
972
+ }
973
+
974
+ template <>
975
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from,
976
+ Index stride, const Index n) {
977
+ pscatter_common<Packet8bf>(to, from, stride, n);
978
+ }
979
+
980
+ template <>
981
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<signed char, Packet16c>(signed char* to,
982
+ const Packet16c& from, Index stride,
983
+ const Index n) {
984
+ pscatter_common<Packet16c>(to, from, stride, n);
985
+ }
986
+
987
+ template <>
988
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned char, Packet16uc>(unsigned char* to,
989
+ const Packet16uc& from,
990
+ Index stride, const Index n) {
991
+ pscatter_common<Packet16uc>(to, from, stride, n);
992
+ }
993
+
994
+ template <>
995
+ EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
996
+ return pset1<Packet4f>(a) + p4f_COUNTDOWN;
997
+ }
998
+ template <>
999
+ EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
1000
+ return pset1<Packet4i>(a) + p4i_COUNTDOWN;
1001
+ }
1002
+ template <>
1003
+ EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const short int& a) {
1004
+ return pset1<Packet8s>(a) + p8s_COUNTDOWN;
1005
+ }
1006
+ template <>
1007
+ EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const unsigned short int& a) {
1008
+ return pset1<Packet8us>(a) + p8us_COUNTDOWN;
1009
+ }
1010
+ template <>
1011
+ EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const signed char& a) {
1012
+ return pset1<Packet16c>(a) + p16c_COUNTDOWN;
1013
+ }
1014
+ template <>
1015
+ EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const unsigned char& a) {
1016
+ return pset1<Packet16uc>(a) + p16uc_COUNTDOWN;
1017
+ }
1018
+
1019
+ template <>
1020
+ EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
1021
+ return a + b;
1022
+ }
1023
+ template <>
1024
+ EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
1025
+ return a + b;
1026
+ }
1027
+ template <>
1028
+ EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1029
+ return a + b;
1030
+ }
1031
+ template <>
1032
+ EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) {
1033
+ return a + b;
1034
+ }
1035
+ template <>
1036
+ EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) {
1037
+ return a + b;
1038
+ }
1039
+ template <>
1040
+ EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) {
1041
+ return a + b;
1042
+ }
1043
+ template <>
1044
+ EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1045
+ return a + b;
1046
+ }
1047
+
1048
+ template <>
1049
+ EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
1050
+ return a - b;
1051
+ }
1052
+ template <>
1053
+ EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
1054
+ return a - b;
1055
+ }
1056
+ template <>
1057
+ EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) {
1058
+ return a - b;
1059
+ }
1060
+ template <>
1061
+ EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) {
1062
+ return a - b;
1063
+ }
1064
+ template <>
1065
+ EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) {
1066
+ return a - b;
1067
+ }
1068
+ template <>
1069
+ EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1070
+ return a - b;
1071
+ }
1072
+
1073
+ template <>
1074
+ EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
1075
+ #ifdef __POWER8_VECTOR__
1076
+ return vec_neg(a);
1077
+ #else
1078
+ return vec_xor(a, p4f_MZERO);
1079
+ #endif
1080
+ }
1081
+ template <>
1082
+ EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) {
1083
+ #ifdef __POWER8_VECTOR__
1084
+ return vec_neg(a);
1085
+ #else
1086
+ return reinterpret_cast<Packet16c>(p4i_ZERO) - a;
1087
+ #endif
1088
+ }
1089
+ template <>
1090
+ EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
1091
+ #ifdef __POWER8_VECTOR__
1092
+ return vec_neg(a);
1093
+ #else
1094
+ return reinterpret_cast<Packet8s>(p4i_ZERO) - a;
1095
+ #endif
1096
+ }
1097
+ template <>
1098
+ EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
1099
+ #ifdef __POWER8_VECTOR__
1100
+ return vec_neg(a);
1101
+ #else
1102
+ return p4i_ZERO - a;
1103
+ #endif
1104
+ }
1105
+
1106
+ template <>
1107
+ EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
1108
+ return a;
1109
+ }
1110
+ template <>
1111
+ EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
1112
+ return a;
1113
+ }
1114
+
1115
+ template <>
1116
+ EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
1117
+ return vec_madd(a, b, p4f_MZERO);
1118
+ }
1119
+ template <>
1120
+ EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
1121
+ return a * b;
1122
+ }
1123
+ template <>
1124
+ EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) {
1125
+ return vec_mul(a, b);
1126
+ }
1127
+ template <>
1128
+ EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) {
1129
+ return vec_mul(a, b);
1130
+ }
1131
+ template <>
1132
+ EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) {
1133
+ return vec_mul(a, b);
1134
+ }
1135
+ template <>
1136
+ EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1137
+ return vec_mul(a, b);
1138
+ }
1139
+
1140
+ template <>
1141
+ EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
1142
+ #ifndef __VSX__ // VSX actually provides a div instruction
1143
+ Packet4f t, y_0, y_1;
1144
+
1145
+ // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
1146
+ y_0 = vec_re(b);
1147
+
1148
+ // Do one Newton-Raphson iteration to get the needed accuracy
1149
+ t = vec_nmsub(y_0, b, p4f_ONE);
1150
+ y_1 = vec_madd(y_0, t, y_0);
1151
+
1152
+ return vec_madd(a, y_1, p4f_MZERO);
816
1153
  #else
817
1154
  return vec_div(a, b);
818
1155
  #endif
819
1156
  }
820
1157
 
821
- template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
822
- { eigen_assert(false && "packet integer division are not supported by AltiVec");
1158
+ template <>
1159
+ EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
1160
+ #if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
1161
+ return vec_div(a, b);
1162
+ #else
1163
+ EIGEN_UNUSED_VARIABLE(a);
1164
+ EIGEN_UNUSED_VARIABLE(b);
1165
+ eigen_assert(false && "packet integer division are not supported by AltiVec");
823
1166
  return pset1<Packet4i>(0);
1167
+ #endif
824
1168
  }
825
1169
 
826
1170
  // for some weird raisons, it has to be overloaded for packet of integers
827
- template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
828
- template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }
829
- template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) { return vec_madd(a,b,c); }
830
- template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) { return vec_madd(a,b,c); }
831
-
832
- template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
833
- {
834
- #ifdef __VSX__
1171
+ template <>
1172
+ EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1173
+ return vec_madd(a, b, c);
1174
+ }
1175
+ template <>
1176
+ EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
1177
+ return a * b + c;
1178
+ }
1179
+ template <>
1180
+ EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
1181
+ return vec_madd(a, b, c);
1182
+ }
1183
+ template <>
1184
+ EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) {
1185
+ return vec_madd(a, b, c);
1186
+ }
1187
+
1188
+ #ifdef EIGEN_VECTORIZE_VSX
1189
+ template <>
1190
+ EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1191
+ return vec_msub(a, b, c);
1192
+ }
1193
+ template <>
1194
+ EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1195
+ return vec_nmsub(a, b, c);
1196
+ }
1197
+ template <>
1198
+ EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1199
+ return vec_nmadd(a, b, c);
1200
+ }
1201
+ #endif
1202
+
1203
+ template <>
1204
+ EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
1205
+ #ifdef EIGEN_VECTORIZE_VSX
835
1206
  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
836
1207
  Packet4f ret;
837
- __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
1208
+ __asm__("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
838
1209
  return ret;
839
- #else
1210
+ #else
1211
+ return vec_min(a, b);
1212
+ #endif
1213
+ }
1214
+ template <>
1215
+ EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
1216
+ return vec_min(a, b);
1217
+ }
1218
+ template <>
1219
+ EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) {
1220
+ return vec_min(a, b);
1221
+ }
1222
+ template <>
1223
+ EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) {
1224
+ return vec_min(a, b);
1225
+ }
1226
+ template <>
1227
+ EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) {
1228
+ return vec_min(a, b);
1229
+ }
1230
+ template <>
1231
+ EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
840
1232
  return vec_min(a, b);
841
- #endif
842
1233
  }
843
- template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
844
- template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_min(a, b); }
845
- template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_min(a, b); }
846
- template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_min(a, b); }
847
- template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_min(a, b); }
848
-
849
1234
 
850
- template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
851
- {
852
- #ifdef __VSX__
1235
+ template <>
1236
+ EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
1237
+ #ifdef EIGEN_VECTORIZE_VSX
853
1238
  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
854
1239
  Packet4f ret;
855
- __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
1240
+ __asm__("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
856
1241
  return ret;
857
- #else
1242
+ #else
858
1243
  return vec_max(a, b);
859
- #endif
860
- }
861
- template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
862
- template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_max(a, b); }
863
- template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_max(a, b); }
864
- template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_max(a, b); }
865
- template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); }
866
-
867
- template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmple(a,b)); }
868
- template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmplt(a,b)); }
869
- template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmpeq(a,b)); }
870
- template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
871
- Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a,b));
872
- return vec_nor(c,c);
873
- }
874
-
875
- template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmple(a,b)); }
876
- template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmplt(a,b)); }
877
- template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmpeq(a,b)); }
878
- template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmple(a,b)); }
879
- template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmplt(a,b)); }
880
- template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmpeq(a,b)); }
881
- template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmple(a,b)); }
882
- template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmplt(a,b)); }
883
- template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmpeq(a,b)); }
884
- template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmple(a,b)); }
885
- template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmplt(a,b)); }
886
- template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmpeq(a,b)); }
887
- template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmple(a,b)); }
888
- template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmplt(a,b)); }
889
- template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmpeq(a,b)); }
890
-
891
- template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
892
- template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
893
- template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vec_and(a, b); }
894
- template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_and(a, b); }
895
- template<> EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
896
- return pand<Packet8us>(a, b);
1244
+ #endif
1245
+ }
1246
+ template <>
1247
+ EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
1248
+ return vec_max(a, b);
1249
+ }
1250
+ template <>
1251
+ EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) {
1252
+ return vec_max(a, b);
1253
+ }
1254
+ template <>
1255
+ EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) {
1256
+ return vec_max(a, b);
1257
+ }
1258
+ template <>
1259
+ EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) {
1260
+ return vec_max(a, b);
1261
+ }
1262
+ template <>
1263
+ EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1264
+ return vec_max(a, b);
1265
+ }
1266
+
1267
+ template <>
1268
+ EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
1269
+ return reinterpret_cast<Packet4f>(vec_cmple(a, b));
1270
+ }
1271
+ // To fix bug with vec_cmplt on older versions
1272
+ #ifdef EIGEN_VECTORIZE_VSX
1273
+ template <>
1274
+ EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) {
1275
+ return reinterpret_cast<Packet4f>(vec_cmplt(a, b));
1276
+ }
1277
+ #endif
1278
+ template <>
1279
+ EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) {
1280
+ return reinterpret_cast<Packet4f>(vec_cmpeq(a, b));
1281
+ }
1282
+ template <>
1283
+ EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
1284
+ Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a, b));
1285
+ return vec_nor(c, c);
1286
+ }
1287
+
1288
+ #ifdef EIGEN_VECTORIZE_VSX
1289
+ template <>
1290
+ EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
1291
+ return reinterpret_cast<Packet4i>(vec_cmple(a, b));
1292
+ }
1293
+ #endif
1294
+ template <>
1295
+ EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) {
1296
+ return reinterpret_cast<Packet4i>(vec_cmplt(a, b));
1297
+ }
1298
+ template <>
1299
+ EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) {
1300
+ return reinterpret_cast<Packet4i>(vec_cmpeq(a, b));
1301
+ }
1302
+ #ifdef EIGEN_VECTORIZE_VSX
1303
+ template <>
1304
+ EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) {
1305
+ return reinterpret_cast<Packet8s>(vec_cmple(a, b));
1306
+ }
1307
+ #endif
1308
+ template <>
1309
+ EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) {
1310
+ return reinterpret_cast<Packet8s>(vec_cmplt(a, b));
1311
+ }
1312
+ template <>
1313
+ EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) {
1314
+ return reinterpret_cast<Packet8s>(vec_cmpeq(a, b));
1315
+ }
1316
+ #ifdef EIGEN_VECTORIZE_VSX
1317
+ template <>
1318
+ EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) {
1319
+ return reinterpret_cast<Packet8us>(vec_cmple(a, b));
1320
+ }
1321
+ #endif
1322
+ template <>
1323
+ EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) {
1324
+ return reinterpret_cast<Packet8us>(vec_cmplt(a, b));
1325
+ }
1326
+ template <>
1327
+ EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) {
1328
+ return reinterpret_cast<Packet8us>(vec_cmpeq(a, b));
1329
+ }
1330
+ #ifdef EIGEN_VECTORIZE_VSX
1331
+ template <>
1332
+ EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) {
1333
+ return reinterpret_cast<Packet16c>(vec_cmple(a, b));
1334
+ }
1335
+ #endif
1336
+ template <>
1337
+ EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) {
1338
+ return reinterpret_cast<Packet16c>(vec_cmplt(a, b));
1339
+ }
1340
+ template <>
1341
+ EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) {
1342
+ return reinterpret_cast<Packet16c>(vec_cmpeq(a, b));
1343
+ }
1344
+ #ifdef EIGEN_VECTORIZE_VSX
1345
+ template <>
1346
+ EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) {
1347
+ return reinterpret_cast<Packet16uc>(vec_cmple(a, b));
1348
+ }
1349
+ #endif
1350
+ template <>
1351
+ EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) {
1352
+ return reinterpret_cast<Packet16uc>(vec_cmplt(a, b));
1353
+ }
1354
+ template <>
1355
+ EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) {
1356
+ return reinterpret_cast<Packet16uc>(vec_cmpeq(a, b));
897
1357
  }
898
1358
 
1359
+ template <>
1360
+ EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
1361
+ return vec_and(a, b);
1362
+ }
1363
+ template <>
1364
+ EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
1365
+ return vec_and(a, b);
1366
+ }
1367
+ template <>
1368
+ EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1369
+ return vec_and(a, b);
1370
+ }
1371
+ template <>
1372
+ EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) {
1373
+ return vec_and(a, b);
1374
+ }
1375
+ template <>
1376
+ EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1377
+ return pand<Packet8us>(a, b);
1378
+ }
899
1379
 
900
- template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
901
- template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
902
- template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_or(a, b); }
903
- template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_or(a, b); }
904
- template<> EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1380
+ template <>
1381
+ EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
1382
+ return vec_or(a, b);
1383
+ }
1384
+ template <>
1385
+ EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
1386
+ return vec_or(a, b);
1387
+ }
1388
+ template <>
1389
+ EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) {
1390
+ return vec_or(a, b);
1391
+ }
1392
+ template <>
1393
+ EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) {
1394
+ return vec_or(a, b);
1395
+ }
1396
+ template <>
1397
+ EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
905
1398
  return por<Packet8us>(a, b);
906
1399
  }
907
1400
 
908
- template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
909
- template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
910
- template<> EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1401
+ template <>
1402
+ EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
1403
+ return vec_xor(a, b);
1404
+ }
1405
+ template <>
1406
+ EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
1407
+ return vec_xor(a, b);
1408
+ }
1409
+ template <>
1410
+ EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) {
1411
+ return vec_xor(a, b);
1412
+ }
1413
+ template <>
1414
+ EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
911
1415
  return pxor<Packet8us>(a, b);
912
1416
  }
913
1417
 
914
- template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_andc(a, b); }
915
- template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_andc(a, b); }
1418
+ template <>
1419
+ EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
1420
+ return vec_andc(a, b);
1421
+ }
1422
+ template <>
1423
+ EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
1424
+ return vec_andc(a, b);
1425
+ }
916
1426
 
917
- template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
1427
+ template <>
1428
+ EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
918
1429
  return vec_sel(b, a, reinterpret_cast<Packet4ui>(mask));
919
1430
  }
920
1431
 
921
- template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
922
- {
923
- Packet4f t = vec_add(reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
924
- Packet4f res;
1432
+ template <>
1433
+ EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
1434
+ Packet4f t = vec_add(
1435
+ reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
1436
+ Packet4f res;
925
1437
 
926
- #ifdef __VSX__
927
- __asm__("xvrspiz %x0, %x1\n\t"
928
- : "=&wa" (res)
929
- : "wa" (t));
1438
+ #ifdef EIGEN_VECTORIZE_VSX
1439
+ __asm__("xvrspiz %x0, %x1\n\t" : "=&wa"(res) : "wa"(t));
930
1440
  #else
931
- __asm__("vrfiz %0, %1\n\t"
932
- : "=v" (res)
933
- : "v" (t));
1441
+ __asm__("vrfiz %0, %1\n\t" : "=v"(res) : "v"(t));
934
1442
  #endif
935
1443
 
936
- return res;
1444
+ return res;
1445
+ }
1446
+ template <>
1447
+ EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
1448
+ return vec_ceil(a);
1449
+ }
1450
+ template <>
1451
+ EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
1452
+ return vec_floor(a);
1453
+ }
1454
+ template <>
1455
+ EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
1456
+ return vec_trunc(a);
1457
+ }
1458
+ #ifdef EIGEN_VECTORIZE_VSX
1459
+ template <>
1460
+ EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
1461
+ Packet4f res;
1462
+
1463
+ __asm__("xvrspic %x0, %x1\n\t" : "=&wa"(res) : "wa"(a));
1464
+
1465
+ return res;
937
1466
  }
938
- template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return vec_ceil(a); }
939
- template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
940
- template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
941
- {
942
- Packet4f res;
1467
+ #endif
943
1468
 
944
- __asm__("xvrspic %x0, %x1\n\t"
945
- : "=&wa" (res)
946
- : "wa" (a));
1469
+ template <typename Packet>
1470
+ EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet) * from) {
1471
+ EIGEN_DEBUG_UNALIGNED_LOAD
1472
+ #if defined(EIGEN_VECTORIZE_VSX)
1473
+ return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
1474
+ #else
1475
+ Packet16uc MSQ = vec_ld(0, (unsigned char*)from); // most significant quadword
1476
+ Packet16uc LSQ = vec_ld(15, (unsigned char*)from); // least significant quadword
1477
+ Packet16uc mask = vec_lvsl(0, from); // create the permute mask
1478
+ // TODO: Add static_cast here
1479
+ return (Packet)vec_perm(MSQ, LSQ, mask); // align the data
1480
+ #endif
1481
+ }
947
1482
 
948
- return res;
1483
+ template <>
1484
+ EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
1485
+ return ploadu_common<Packet4f>(from);
1486
+ }
1487
+ template <>
1488
+ EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
1489
+ return ploadu_common<Packet4i>(from);
1490
+ }
1491
+ template <>
1492
+ EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from) {
1493
+ return ploadu_common<Packet8s>(from);
1494
+ }
1495
+ template <>
1496
+ EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from) {
1497
+ return ploadu_common<Packet8us>(from);
1498
+ }
1499
+ template <>
1500
+ EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) {
1501
+ return ploadu_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
1502
+ }
1503
+ template <>
1504
+ EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const signed char* from) {
1505
+ return ploadu_common<Packet16c>(from);
1506
+ }
1507
+ template <>
1508
+ EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from) {
1509
+ return ploadu_common<Packet16uc>(from);
949
1510
  }
950
1511
 
951
- template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from)
952
- {
1512
+ template <typename Packet>
1513
+ EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet) * from, const Index n,
1514
+ const Index offset) {
1515
+ const Index packet_size = unpacket_traits<Packet>::size;
1516
+ eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
1517
+ const Index size = sizeof(__UNPACK_TYPE__(Packet));
1518
+ #ifdef _ARCH_PWR9
1519
+ EIGEN_UNUSED_VARIABLE(packet_size);
953
1520
  EIGEN_DEBUG_ALIGNED_LOAD
1521
+ EIGEN_DEBUG_UNALIGNED_LOAD
1522
+ Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
1523
+ if (offset) {
1524
+ Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
954
1525
  #ifdef _BIG_ENDIAN
955
- Packet16uc MSQ, LSQ;
956
- Packet16uc mask;
957
- MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
958
- LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
959
- mask = vec_lvsl(0, from); // create the permute mask
960
- //TODO: Add static_cast here
961
- return (Packet) vec_perm(MSQ, LSQ, mask); // align the data
1526
+ load = Packet(vec_sro(Packet16uc(load), shift));
962
1527
  #else
963
- EIGEN_DEBUG_UNALIGNED_LOAD
964
- return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
1528
+ load = Packet(vec_slo(Packet16uc(load), shift));
1529
+ #endif
1530
+ }
1531
+ return load;
1532
+ #else
1533
+ if (n) {
1534
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
1535
+ unsigned char* load2 = reinterpret_cast<unsigned char*>(load + offset);
1536
+ unsigned char* from2 = reinterpret_cast<unsigned char*>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
1537
+ Index n2 = n * size;
1538
+ if (16 <= n2) {
1539
+ pstoreu(load2, ploadu<Packet16uc>(from2));
1540
+ } else {
1541
+ memcpy((void*)load2, (void*)from2, n2);
1542
+ }
1543
+ return pload_ignore<Packet>(load);
1544
+ } else {
1545
+ return Packet(pset1<Packet16uc>(0));
1546
+ }
965
1547
  #endif
966
1548
  }
967
1549
 
968
- template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
969
- {
970
- return ploadu_common<Packet4f>(from);
1550
+ template <>
1551
+ EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n, const Index offset) {
1552
+ return ploadu_partial_common<Packet4f>(from, n, offset);
971
1553
  }
972
- template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
973
- {
974
- return ploadu_common<Packet4i>(from);
1554
+ template <>
1555
+ EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n, const Index offset) {
1556
+ return ploadu_partial_common<Packet4i>(from, n, offset);
975
1557
  }
976
- template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from)
977
- {
978
- return ploadu_common<Packet8s>(from);
1558
+ template <>
1559
+ EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n, const Index offset) {
1560
+ return ploadu_partial_common<Packet8s>(from, n, offset);
979
1561
  }
980
- template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from)
981
- {
982
- return ploadu_common<Packet8us>(from);
1562
+ template <>
1563
+ EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n,
1564
+ const Index offset) {
1565
+ return ploadu_partial_common<Packet8us>(from, n, offset);
983
1566
  }
984
- template<> EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from)
985
- {
986
- return ploadu_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
1567
+ template <>
1568
+ EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset) {
1569
+ return ploadu_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
987
1570
  }
988
- template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const signed char* from)
989
- {
990
- return ploadu_common<Packet16c>(from);
1571
+ template <>
1572
+ EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n, const Index offset) {
1573
+ return ploadu_partial_common<Packet16c>(from, n, offset);
991
1574
  }
992
- template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from)
993
- {
994
- return ploadu_common<Packet16uc>(from);
1575
+ template <>
1576
+ EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n,
1577
+ const Index offset) {
1578
+ return ploadu_partial_common<Packet16uc>(from, n, offset);
995
1579
  }
996
1580
 
997
- template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from)
998
- {
1581
+ template <typename Packet>
1582
+ EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet) * from) {
999
1583
  Packet p;
1000
- if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet>(from);
1001
- else p = ploadu<Packet>(from);
1002
- return vec_perm(p, p, p16uc_DUPLICATE32_HI);
1584
+ if ((std::ptrdiff_t(from) % 16) == 0)
1585
+ p = pload<Packet>(from);
1586
+ else
1587
+ p = ploadu<Packet>(from);
1588
+ return vec_mergeh(p, p);
1003
1589
  }
1004
- template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
1005
- {
1590
+ template <>
1591
+ EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
1006
1592
  return ploaddup_common<Packet4f>(from);
1007
1593
  }
1008
- template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
1009
- {
1594
+ template <>
1595
+ EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
1010
1596
  return ploaddup_common<Packet4i>(from);
1011
1597
  }
1012
1598
 
1013
- template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int* from)
1014
- {
1599
+ template <>
1600
+ EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int* from) {
1015
1601
  Packet8s p;
1016
- if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8s>(from);
1017
- else p = ploadu<Packet8s>(from);
1018
- return vec_perm(p, p, p16uc_DUPLICATE16_HI);
1602
+ if ((std::ptrdiff_t(from) % 16) == 0)
1603
+ p = pload<Packet8s>(from);
1604
+ else
1605
+ p = ploadu<Packet8s>(from);
1606
+ return vec_mergeh(p, p);
1019
1607
  }
1020
1608
 
1021
- template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int* from)
1022
- {
1609
+ template <>
1610
+ EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int* from) {
1023
1611
  Packet8us p;
1024
- if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8us>(from);
1025
- else p = ploadu<Packet8us>(from);
1026
- return vec_perm(p, p, p16uc_DUPLICATE16_HI);
1612
+ if ((std::ptrdiff_t(from) % 16) == 0)
1613
+ p = pload<Packet8us>(from);
1614
+ else
1615
+ p = ploadu<Packet8us>(from);
1616
+ return vec_mergeh(p, p);
1027
1617
  }
1028
1618
 
1029
- template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int* from)
1030
- {
1619
+ template <>
1620
+ EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int* from) {
1031
1621
  Packet8s p;
1032
- if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8s>(from);
1033
- else p = ploadu<Packet8s>(from);
1622
+ if ((std::ptrdiff_t(from) % 16) == 0)
1623
+ p = pload<Packet8s>(from);
1624
+ else
1625
+ p = ploadu<Packet8s>(from);
1034
1626
  return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
1035
1627
  }
1036
1628
 
1037
- template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int* from)
1038
- {
1629
+ template <>
1630
+ EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int* from) {
1039
1631
  Packet8us p;
1040
- if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8us>(from);
1041
- else p = ploadu<Packet8us>(from);
1632
+ if ((std::ptrdiff_t(from) % 16) == 0)
1633
+ p = pload<Packet8us>(from);
1634
+ else
1635
+ p = ploadu<Packet8us>(from);
1042
1636
  return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
1043
1637
  }
1044
1638
 
1045
- template<> EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from)
1046
- {
1639
+ template <>
1640
+ EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from) {
1047
1641
  return ploadquad<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
1048
1642
  }
1049
1643
 
1050
- template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char* from)
1051
- {
1644
+ template <>
1645
+ EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char* from) {
1646
+ Packet16c p;
1647
+ if ((std::ptrdiff_t(from) % 16) == 0)
1648
+ p = pload<Packet16c>(from);
1649
+ else
1650
+ p = ploadu<Packet16c>(from);
1651
+ return vec_mergeh(p, p);
1652
+ }
1653
+
1654
+ template <>
1655
+ EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char* from) {
1656
+ Packet16uc p;
1657
+ if ((std::ptrdiff_t(from) % 16) == 0)
1658
+ p = pload<Packet16uc>(from);
1659
+ else
1660
+ p = ploadu<Packet16uc>(from);
1661
+ return vec_mergeh(p, p);
1662
+ }
1663
+
1664
+ template <>
1665
+ EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const signed char* from) {
1052
1666
  Packet16c p;
1053
- if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet16c>(from);
1054
- else p = ploadu<Packet16c>(from);
1055
- return vec_perm(p, p, p16uc_DUPLICATE8_HI);
1667
+ if ((std::ptrdiff_t(from) % 16) == 0)
1668
+ p = pload<Packet16c>(from);
1669
+ else
1670
+ p = ploadu<Packet16c>(from);
1671
+ return vec_perm(p, p, p16uc_QUADRUPLICATE16);
1056
1672
  }
1057
1673
 
1058
- template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char* from)
1059
- {
1674
+ template <>
1675
+ EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const unsigned char* from) {
1060
1676
  Packet16uc p;
1061
- if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet16uc>(from);
1062
- else p = ploadu<Packet16uc>(from);
1063
- return vec_perm(p, p, p16uc_DUPLICATE8_HI);
1677
+ if ((std::ptrdiff_t(from) % 16) == 0)
1678
+ p = pload<Packet16uc>(from);
1679
+ else
1680
+ p = ploadu<Packet16uc>(from);
1681
+ return vec_perm(p, p, p16uc_QUADRUPLICATE16);
1064
1682
  }
1065
1683
 
1066
- template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)* to, const Packet& from)
1067
- {
1684
+ template <typename Packet>
1685
+ EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet) * to, const Packet& from) {
1068
1686
  EIGEN_DEBUG_UNALIGNED_STORE
1069
- #ifdef _BIG_ENDIAN
1687
+ #if defined(EIGEN_VECTORIZE_VSX)
1688
+ vec_xst(from, 0, to);
1689
+ #else
1070
1690
  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
1071
1691
  // Warning: not thread safe!
1072
1692
  Packet16uc MSQ, LSQ, edges;
1073
1693
  Packet16uc edgeAlign, align;
1074
1694
 
1075
- MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword
1076
- LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword
1077
- edgeAlign = vec_lvsl(0, to); // permute map to extract edges
1078
- edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges
1079
- align = vec_lvsr( 0, to ); // permute map to misalign data
1080
- MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ)
1081
- LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ)
1082
- vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
1083
- vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part second
1084
- #else
1085
- vec_xst(from, 0, to);
1695
+ MSQ = vec_ld(0, (unsigned char*)to); // most significant quadword
1696
+ LSQ = vec_ld(15, (unsigned char*)to); // least significant quadword
1697
+ edgeAlign = vec_lvsl(0, to); // permute map to extract edges
1698
+ edges = vec_perm(LSQ, MSQ, edgeAlign); // extract the edges
1699
+ align = vec_lvsr(0, to); // permute map to misalign data
1700
+ MSQ = vec_perm(edges, (Packet16uc)from, align); // misalign the data (MSQ)
1701
+ LSQ = vec_perm((Packet16uc)from, edges, align); // misalign the data (LSQ)
1702
+ vec_st(LSQ, 15, (unsigned char*)to); // Store the LSQ part first
1703
+ vec_st(MSQ, 0, (unsigned char*)to); // Store the MSQ part second
1086
1704
  #endif
1087
1705
  }
1088
- template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
1089
- {
1706
+ template <>
1707
+ EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
1090
1708
  pstoreu_common<Packet4f>(to, from);
1091
1709
  }
1092
- template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
1093
- {
1710
+ template <>
1711
+ EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
1094
1712
  pstoreu_common<Packet4i>(to, from);
1095
1713
  }
1096
- template<> EIGEN_STRONG_INLINE void pstoreu<short int>(short int* to, const Packet8s& from)
1097
- {
1714
+ template <>
1715
+ EIGEN_STRONG_INLINE void pstoreu<short int>(short int* to, const Packet8s& from) {
1098
1716
  pstoreu_common<Packet8s>(to, from);
1099
1717
  }
1100
- template<> EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int* to, const Packet8us& from)
1101
- {
1718
+ template <>
1719
+ EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int* to, const Packet8us& from) {
1102
1720
  pstoreu_common<Packet8us>(to, from);
1103
1721
  }
1104
- template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from)
1105
- {
1106
- pstoreu_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from);
1722
+ template <>
1723
+ EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) {
1724
+ pstoreu_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val);
1107
1725
  }
1108
- template<> EIGEN_STRONG_INLINE void pstoreu<signed char>(signed char* to, const Packet16c& from)
1109
- {
1726
+ template <>
1727
+ EIGEN_STRONG_INLINE void pstoreu<signed char>(signed char* to, const Packet16c& from) {
1110
1728
  pstoreu_common<Packet16c>(to, from);
1111
1729
  }
1112
- template<> EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* to, const Packet16uc& from)
1113
- {
1730
+ template <>
1731
+ EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* to, const Packet16uc& from) {
1114
1732
  pstoreu_common<Packet16uc>(to, from);
1115
1733
  }
1116
1734
 
1117
- template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); }
1118
- template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); }
1735
+ template <typename Packet>
1736
+ EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet) * to, const Packet& from, const Index n,
1737
+ const Index offset) {
1738
+ const Index packet_size = unpacket_traits<Packet>::size;
1739
+ eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
1740
+ const Index size = sizeof(__UNPACK_TYPE__(Packet));
1741
+ #ifdef _ARCH_PWR9
1742
+ EIGEN_UNUSED_VARIABLE(packet_size);
1743
+ EIGEN_DEBUG_UNALIGNED_STORE
1744
+ Packet store = from;
1745
+ if (offset) {
1746
+ Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
1747
+ #ifdef _BIG_ENDIAN
1748
+ store = Packet(vec_slo(Packet16uc(store), shift));
1749
+ #else
1750
+ store = Packet(vec_sro(Packet16uc(store), shift));
1751
+ #endif
1752
+ }
1753
+ vec_xst_len(store, to, n * size);
1754
+ #else
1755
+ if (n) {
1756
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
1757
+ pstore(store, from);
1758
+ unsigned char* store2 = reinterpret_cast<unsigned char*>(store + offset);
1759
+ unsigned char* to2 = reinterpret_cast<unsigned char*>(to);
1760
+ Index n2 = n * size;
1761
+ if (16 <= n2) {
1762
+ pstoreu(to2, ploadu<Packet16uc>(store2));
1763
+ } else {
1764
+ memcpy((void*)to2, (void*)store2, n2);
1765
+ }
1766
+ }
1767
+ #endif
1768
+ }
1769
+
1770
+ template <>
1771
+ EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset) {
1772
+ pstoreu_partial_common<Packet4f>(to, from, n, offset);
1773
+ }
1774
+ template <>
1775
+ EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset) {
1776
+ pstoreu_partial_common<Packet4i>(to, from, n, offset);
1777
+ }
1778
+ template <>
1779
+ EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int* to, const Packet8s& from, const Index n,
1780
+ const Index offset) {
1781
+ pstoreu_partial_common<Packet8s>(to, from, n, offset);
1782
+ }
1783
+ template <>
1784
+ EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int* to, const Packet8us& from,
1785
+ const Index n, const Index offset) {
1786
+ pstoreu_partial_common<Packet8us>(to, from, n, offset);
1787
+ }
1788
+ template <>
1789
+ EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n,
1790
+ const Index offset) {
1791
+ pstoreu_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n, offset);
1792
+ }
1793
+ template <>
1794
+ EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char* to, const Packet16c& from, const Index n,
1795
+ const Index offset) {
1796
+ pstoreu_partial_common<Packet16c>(to, from, n, offset);
1797
+ }
1798
+ template <>
1799
+ EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n,
1800
+ const Index offset) {
1801
+ pstoreu_partial_common<Packet16uc>(to, from, n, offset);
1802
+ }
1803
+
1804
+ template <>
1805
+ EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
1806
+ EIGEN_PPC_PREFETCH(addr);
1807
+ }
1808
+ template <>
1809
+ EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
1810
+ EIGEN_PPC_PREFETCH(addr);
1811
+ }
1119
1812
 
1120
- template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; }
1121
- template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int x; vec_ste(a, 0, &x); return x; }
1813
+ template <>
1814
+ EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
1815
+ EIGEN_ALIGN16 float x;
1816
+ vec_ste(a, 0, &x);
1817
+ return x;
1818
+ }
1819
+ template <>
1820
+ EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
1821
+ EIGEN_ALIGN16 int x;
1822
+ vec_ste(a, 0, &x);
1823
+ return x;
1824
+ }
1122
1825
 
1123
- template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) {
1826
+ template <typename Packet>
1827
+ EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) {
1124
1828
  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x;
1125
1829
  vec_ste(a, 0, &x);
1126
1830
  return x;
1127
1831
  }
1128
1832
 
1129
- template<> EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) {
1833
+ template <>
1834
+ EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) {
1130
1835
  return pfirst_common<Packet8s>(a);
1131
1836
  }
1132
1837
 
1133
- template<> EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) {
1838
+ template <>
1839
+ EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) {
1134
1840
  return pfirst_common<Packet8us>(a);
1135
1841
  }
1136
1842
 
1137
- template<> EIGEN_STRONG_INLINE signed char pfirst<Packet16c>(const Packet16c& a)
1138
- {
1843
+ template <>
1844
+ EIGEN_STRONG_INLINE signed char pfirst<Packet16c>(const Packet16c& a) {
1139
1845
  return pfirst_common<Packet16c>(a);
1140
1846
  }
1141
1847
 
1142
- template<> EIGEN_STRONG_INLINE unsigned char pfirst<Packet16uc>(const Packet16uc& a)
1143
- {
1848
+ template <>
1849
+ EIGEN_STRONG_INLINE unsigned char pfirst<Packet16uc>(const Packet16uc& a) {
1144
1850
  return pfirst_common<Packet16uc>(a);
1145
1851
  }
1146
1852
 
1147
- template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
1148
- {
1149
- return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
1853
+ template <>
1854
+ EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
1855
+ return reinterpret_cast<Packet4f>(
1856
+ vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
1150
1857
  }
1151
- template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
1152
- {
1153
- return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
1858
+ template <>
1859
+ EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
1860
+ return reinterpret_cast<Packet4i>(
1861
+ vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
1154
1862
  }
1155
- template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a)
1156
- {
1157
- return reinterpret_cast<Packet8s>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
1863
+ template <>
1864
+ EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) {
1865
+ return reinterpret_cast<Packet8s>(
1866
+ vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
1158
1867
  }
1159
- template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a)
1160
- {
1161
- return reinterpret_cast<Packet8us>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
1868
+ template <>
1869
+ EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
1870
+ return reinterpret_cast<Packet8us>(
1871
+ vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
1162
1872
  }
1163
- template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a)
1164
- {
1873
+ template <>
1874
+ EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
1165
1875
  return vec_perm(a, a, p16uc_REVERSE8);
1166
1876
  }
1167
- template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a)
1168
- {
1877
+ template <>
1878
+ EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
1169
1879
  return vec_perm(a, a, p16uc_REVERSE8);
1170
1880
  }
1171
- template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a)
1172
- {
1881
+ template <>
1882
+ EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) {
1173
1883
  return preverse<Packet8us>(a);
1174
1884
  }
1175
1885
 
1176
- template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
1177
- template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
1178
- template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vec_abs(a); }
1179
- template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; }
1180
- template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vec_abs(a); }
1181
- template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; }
1182
- template<> EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
1183
- _EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask,0x7FFF);
1886
+ template <>
1887
+ EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
1888
+ return vec_abs(a);
1889
+ }
1890
+ template <>
1891
+ EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
1892
+ return vec_abs(a);
1893
+ }
1894
+ template <>
1895
+ EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) {
1896
+ return vec_abs(a);
1897
+ }
1898
+ template <>
1899
+ EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) {
1900
+ return a;
1901
+ }
1902
+ template <>
1903
+ EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) {
1904
+ return vec_abs(a);
1905
+ }
1906
+ template <>
1907
+ EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) {
1908
+ return a;
1909
+ }
1910
+ template <>
1911
+ EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
1912
+ EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask, 0x7FFF);
1184
1913
  return pand<Packet8us>(p8us_abs_mask, a);
1185
1914
  }
1186
1915
 
1187
- template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a)
1188
- { return vec_sra(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
1189
- template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a)
1190
- { return vec_sr(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
1191
- template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a)
1192
- { return vec_sl(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
1193
- template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a)
1194
- {
1195
- const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1916
+ template <>
1917
+ EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) {
1918
+ return vec_sra(a.m_val, vec_splat_u16(15));
1919
+ }
1920
+ template <>
1921
+ EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
1922
+ return (Packet4f)vec_sra((Packet4i)a, vec_splats((unsigned int)(31)));
1923
+ }
1924
+
1925
+ template <int N>
1926
+ EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
1927
+ return vec_sra(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
1928
+ }
1929
+ template <int N>
1930
+ EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
1931
+ return vec_sr(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
1932
+ }
1933
+ template <int N>
1934
+ EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
1935
+ return vec_sl(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
1936
+ }
1937
+ template <int N>
1938
+ EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a) {
1939
+ const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1196
1940
  Packet4ui r = vec_sl(reinterpret_cast<Packet4ui>(a), p4ui_mask);
1197
1941
  return reinterpret_cast<Packet4f>(r);
1198
1942
  }
1199
1943
 
1200
- template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a)
1201
- {
1202
- const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1944
+ template <int N>
1945
+ EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a) {
1946
+ const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1203
1947
  Packet4ui r = vec_sr(reinterpret_cast<Packet4ui>(a), p4ui_mask);
1204
1948
  return reinterpret_cast<Packet4f>(r);
1205
1949
  }
1206
1950
 
1207
- template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a)
1208
- {
1209
- const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1951
+ template <int N>
1952
+ EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
1953
+ const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1210
1954
  return vec_sr(a, p4ui_mask);
1211
1955
  }
1212
1956
 
1213
- template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a)
1214
- {
1215
- const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1957
+ template <int N>
1958
+ EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
1959
+ const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1216
1960
  return vec_sl(a, p4ui_mask);
1217
1961
  }
1218
1962
 
1219
- template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a)
1220
- {
1221
- const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
1963
+ template <int N>
1964
+ EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a) {
1965
+ const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
1222
1966
  return vec_sl(a, p8us_mask);
1223
1967
  }
1224
- template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a)
1225
- {
1226
- const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
1968
+ template <int N>
1969
+ EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a) {
1970
+ const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
1227
1971
  return vec_sr(a, p8us_mask);
1228
1972
  }
1229
1973
 
1230
- EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf){
1974
+ EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf) {
1231
1975
  return plogical_shift_left<16>(reinterpret_cast<Packet4f>(bf.m_val));
1232
1976
  }
1233
1977
 
1234
- EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf){
1235
- const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
1236
- return pand<Packet4f>(
1237
- reinterpret_cast<Packet4f>(bf.m_val),
1238
- reinterpret_cast<Packet4f>(p4ui_high_mask)
1239
- );
1978
+ EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf) {
1979
+ const EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
1980
+ return pand<Packet4f>(reinterpret_cast<Packet4f>(bf.m_val), reinterpret_cast<Packet4f>(p4ui_high_mask));
1981
+ }
1982
+
1983
+ EIGEN_ALWAYS_INLINE Packet8us pmerge(Packet4ui even, Packet4ui odd) {
1984
+ #ifdef _BIG_ENDIAN
1985
+ return vec_perm(reinterpret_cast<Packet8us>(odd), reinterpret_cast<Packet8us>(even), p16uc_MERGEO16);
1986
+ #else
1987
+ return vec_perm(reinterpret_cast<Packet8us>(even), reinterpret_cast<Packet8us>(odd), p16uc_MERGEE16);
1988
+ #endif
1240
1989
  }
1241
1990
 
1242
1991
  // Simple interleaving of bool masks, prevents true values from being
1243
1992
  // converted to NaNs.
1244
1993
  EIGEN_STRONG_INLINE Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd) {
1245
- const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
1246
- Packet4f bf_odd, bf_even;
1247
- bf_odd = pand(reinterpret_cast<Packet4f>(p4ui_high_mask), odd);
1248
- bf_even = plogical_shift_right<16>(even);
1249
- return reinterpret_cast<Packet8us>(por<Packet4f>(bf_even, bf_odd));
1994
+ return pmerge(reinterpret_cast<Packet4ui>(even), reinterpret_cast<Packet4ui>(odd));
1250
1995
  }
1251
1996
 
1252
- EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f){
1997
+ // #define SUPPORT_BF16_SUBNORMALS
1998
+
1999
+ #ifndef __VEC_CLASS_FP_NAN
2000
+ #define __VEC_CLASS_FP_NAN (1 << 6)
2001
+ #endif
2002
+
2003
+ #if defined(SUPPORT_BF16_SUBNORMALS) && !defined(__VEC_CLASS_FP_SUBNORMAL)
2004
+ #define __VEC_CLASS_FP_SUBNORMAL_P (1 << 1)
2005
+ #define __VEC_CLASS_FP_SUBNORMAL_N (1 << 0)
2006
+
2007
+ #define __VEC_CLASS_FP_SUBNORMAL (__VEC_CLASS_FP_SUBNORMAL_P | __VEC_CLASS_FP_SUBNORMAL_N)
2008
+ #endif
2009
+
2010
+ EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f) {
2011
+ #ifdef _ARCH_PWR10
2012
+ return reinterpret_cast<Packet8us>(__builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(p4f)));
2013
+ #else
1253
2014
  Packet4ui input = reinterpret_cast<Packet4ui>(p4f);
1254
2015
  Packet4ui lsb = plogical_shift_right<16>(input);
1255
2016
  lsb = pand<Packet4ui>(lsb, reinterpret_cast<Packet4ui>(p4i_ONE));
1256
2017
 
1257
- _EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS,0x7FFFu);
2018
+ EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS, 0x7FFFu);
1258
2019
  Packet4ui rounding_bias = padd<Packet4ui>(lsb, p4ui_BIAS);
1259
2020
  input = padd<Packet4ui>(input, rounding_bias);
1260
2021
 
1261
- //Test NaN and Subnormal - Begin
1262
- const _EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000);
2022
+ const EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000);
2023
+ #if defined(_ARCH_PWR9) && defined(EIGEN_VECTORIZE_VSX)
2024
+ Packet4bi nan_selector = vec_test_data_class(p4f, __VEC_CLASS_FP_NAN);
2025
+ input = vec_sel(input, p4ui_nan, nan_selector);
2026
+
2027
+ #ifdef SUPPORT_BF16_SUBNORMALS
2028
+ Packet4bi subnormal_selector = vec_test_data_class(p4f, __VEC_CLASS_FP_SUBNORMAL);
2029
+ input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
2030
+ #endif
2031
+ #else
2032
+ #ifdef SUPPORT_BF16_SUBNORMALS
2033
+ // Test NaN and Subnormal
2034
+ const EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000);
1263
2035
  Packet4ui exp = pand<Packet4ui>(p4ui_exp_mask, reinterpret_cast<Packet4ui>(p4f));
1264
2036
 
1265
- const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF);
2037
+ const EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF);
1266
2038
  Packet4ui mantissa = pand<Packet4ui>(p4ui_mantissa_mask, reinterpret_cast<Packet4ui>(p4f));
1267
2039
 
1268
- const _EIGEN_DECLARE_CONST_FAST_Packet4ui(max_exp, 0x7F800000);
1269
- Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_max_exp);
1270
- Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet4ui>(p4i_ZERO));
1271
-
2040
+ Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_exp_mask);
1272
2041
  Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet4ui>(p4i_ZERO));
1273
- Packet4ui nan_selector = pandnot<Packet4ui>(
1274
- reinterpret_cast<Packet4ui>(is_max_exp),
1275
- reinterpret_cast<Packet4ui>(is_mant_zero)
1276
- );
1277
2042
 
1278
- Packet4ui subnormal_selector = pandnot<Packet4ui>(
1279
- reinterpret_cast<Packet4ui>(is_zero_exp),
1280
- reinterpret_cast<Packet4ui>(is_mant_zero)
1281
- );
2043
+ Packet4ui nan_selector =
2044
+ pandnot<Packet4ui>(reinterpret_cast<Packet4ui>(is_max_exp), reinterpret_cast<Packet4ui>(is_mant_zero));
2045
+
2046
+ Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet4ui>(p4i_ZERO));
2047
+
2048
+ Packet4ui subnormal_selector =
2049
+ pandnot<Packet4ui>(reinterpret_cast<Packet4ui>(is_zero_exp), reinterpret_cast<Packet4ui>(is_mant_zero));
1282
2050
 
1283
- const _EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000);
1284
2051
  input = vec_sel(input, p4ui_nan, nan_selector);
1285
2052
  input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
1286
- //Test NaN and Subnormal - End
2053
+ #else
2054
+ // Test only NaN
2055
+ Packet4bi nan_selector = vec_cmpeq(p4f, p4f);
2056
+
2057
+ input = vec_sel(p4ui_nan, input, nan_selector);
2058
+ #endif
2059
+ #endif
1287
2060
 
1288
2061
  input = plogical_shift_right<16>(input);
1289
2062
  return reinterpret_cast<Packet8us>(input);
2063
+ #endif
2064
+ }
2065
+
2066
+ #ifdef _BIG_ENDIAN
2067
+ /**
2068
+ * Pack the high portion of two float Packets into one bfloat16 Packet
2069
+ *
2070
+ * @tparam lohi to expect either a low & high OR odd & even order
2071
+ */
2072
+ template <bool lohi>
2073
+ EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f lo, Packet4f hi) {
2074
+ if (lohi) {
2075
+ return vec_perm(reinterpret_cast<Packet8us>(lo), reinterpret_cast<Packet8us>(hi), p16uc_MERGEH16);
2076
+ } else {
2077
+ return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEE16);
2078
+ }
2079
+ }
2080
+
2081
+ /**
2082
+ * Pack the low portion of two float Packets into one bfloat16 Packet
2083
+ *
2084
+ * @param lohi to expect either a low & high OR odd & even order
2085
+ */
2086
+ template <bool lohi>
2087
+ EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f lo, Packet4f hi) {
2088
+ if (lohi) {
2089
+ return vec_pack(reinterpret_cast<Packet4ui>(lo), reinterpret_cast<Packet4ui>(hi));
2090
+ } else {
2091
+ return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEO16);
2092
+ }
2093
+ }
2094
+ #else
2095
+ template <bool lohi>
2096
+ EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f hi, Packet4f lo) {
2097
+ if (lohi) {
2098
+ return vec_pack(reinterpret_cast<Packet4ui>(hi), reinterpret_cast<Packet4ui>(lo));
2099
+ } else {
2100
+ return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEE16);
2101
+ }
2102
+ }
2103
+
2104
+ template <bool lohi>
2105
+ EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f hi, Packet4f lo) {
2106
+ if (lohi) {
2107
+ return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEL16);
2108
+ } else {
2109
+ return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEO16);
2110
+ }
2111
+ }
2112
+ #endif
2113
+
2114
+ /**
2115
+ * Convert and pack two float Packets into one bfloat16 Packet
2116
+ *
2117
+ * @tparam lohi to expect either a low & high OR odd & even order
2118
+ */
2119
+ template <bool lohi = true>
2120
+ EIGEN_ALWAYS_INLINE Packet8bf F32ToBf16Two(Packet4f lo, Packet4f hi) {
2121
+ Packet8us p4f = Bf16PackHigh<lohi>(lo, hi);
2122
+ Packet8us p4f2 = Bf16PackLow<lohi>(lo, hi);
2123
+
2124
+ Packet8us lsb = pand<Packet8us>(p4f, p8us_ONE);
2125
+ EIGEN_DECLARE_CONST_FAST_Packet8us(BIAS, 0x7FFFu);
2126
+ lsb = padd<Packet8us>(lsb, p8us_BIAS);
2127
+ lsb = padd<Packet8us>(lsb, p4f2);
2128
+
2129
+ Packet8bi rounding_bias = vec_cmplt(lsb, p4f2);
2130
+ Packet8us input = psub<Packet8us>(p4f, reinterpret_cast<Packet8us>(rounding_bias));
2131
+
2132
+ #if defined(_ARCH_PWR9) && defined(EIGEN_VECTORIZE_VSX)
2133
+ Packet4bi nan_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_NAN);
2134
+ Packet4bi nan_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_NAN);
2135
+ Packet8us nan_selector =
2136
+ Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));
2137
+
2138
+ input = vec_sel(input, p8us_BIAS, nan_selector);
2139
+
2140
+ #ifdef SUPPORT_BF16_SUBNORMALS
2141
+ Packet4bi subnormal_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_SUBNORMAL);
2142
+ Packet4bi subnormal_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_SUBNORMAL);
2143
+ Packet8us subnormal_selector = Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(subnormal_selector_lo),
2144
+ reinterpret_cast<Packet4f>(subnormal_selector_hi));
2145
+
2146
+ input = vec_sel(input, reinterpret_cast<Packet8us>(p4f), subnormal_selector);
2147
+ #endif
2148
+ #else
2149
+ #ifdef SUPPORT_BF16_SUBNORMALS
2150
+ // Test NaN and Subnormal
2151
+ const EIGEN_DECLARE_CONST_FAST_Packet8us(exp_mask, 0x7F80);
2152
+ Packet8us exp = pand<Packet8us>(p8us_exp_mask, p4f);
2153
+
2154
+ const EIGEN_DECLARE_CONST_FAST_Packet8us(mantissa_mask, 0x7Fu);
2155
+ Packet8us mantissa = pand<Packet8us>(p8us_mantissa_mask, p4f);
2156
+
2157
+ Packet8bi is_max_exp = vec_cmpeq(exp, p8us_exp_mask);
2158
+ Packet8bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet8us>(p4i_ZERO));
2159
+
2160
+ Packet8us nan_selector =
2161
+ pandnot<Packet8us>(reinterpret_cast<Packet8us>(is_max_exp), reinterpret_cast<Packet8us>(is_mant_zero));
2162
+
2163
+ Packet8bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet8us>(p4i_ZERO));
2164
+
2165
+ Packet8us subnormal_selector =
2166
+ pandnot<Packet8us>(reinterpret_cast<Packet8us>(is_zero_exp), reinterpret_cast<Packet8us>(is_mant_zero));
2167
+
2168
+ // Using BIAS as NaN (since any or all of the last 7 bits can be set)
2169
+ input = vec_sel(input, p8us_BIAS, nan_selector);
2170
+ input = vec_sel(input, reinterpret_cast<Packet8us>(p4f), subnormal_selector);
2171
+ #else
2172
+ // Test only NaN
2173
+ Packet4bi nan_selector_lo = vec_cmpeq(lo, lo);
2174
+ Packet4bi nan_selector_hi = vec_cmpeq(hi, hi);
2175
+ Packet8us nan_selector =
2176
+ Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));
2177
+
2178
+ input = vec_sel(p8us_BIAS, input, nan_selector);
2179
+ #endif
2180
+ #endif
2181
+
2182
+ return input;
2183
+ }
2184
+
2185
+ /**
2186
+ * Convert and pack two float Packets into one bfloat16 Packet - low & high order
2187
+ */
2188
+ EIGEN_STRONG_INLINE Packet8bf F32ToBf16Both(Packet4f lo, Packet4f hi) {
2189
+ #ifdef _ARCH_PWR10
2190
+ Packet8bf fp16_0 = F32ToBf16(lo);
2191
+ Packet8bf fp16_1 = F32ToBf16(hi);
2192
+ return vec_pack(reinterpret_cast<Packet4ui>(fp16_0.m_val), reinterpret_cast<Packet4ui>(fp16_1.m_val));
2193
+ #else
2194
+ return F32ToBf16Two(lo, hi);
2195
+ #endif
1290
2196
  }
1291
2197
 
1292
- EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd){
1293
- Packet4f bf_odd, bf_even;
1294
- bf_odd = reinterpret_cast<Packet4f>(F32ToBf16(odd).m_val);
1295
- bf_odd = plogical_shift_left<16>(bf_odd);
1296
- bf_even = reinterpret_cast<Packet4f>(F32ToBf16(even).m_val);
1297
- return reinterpret_cast<Packet8us>(por<Packet4f>(bf_even, bf_odd));
2198
+ /**
2199
+ * Convert and pack two float Packets into one bfloat16 Packet - odd & even order
2200
+ */
2201
+ EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd) {
2202
+ #ifdef _ARCH_PWR10
2203
+ return pmerge(reinterpret_cast<Packet4ui>(F32ToBf16(even).m_val), reinterpret_cast<Packet4ui>(F32ToBf16(odd).m_val));
2204
+ #else
2205
+ return F32ToBf16Two<false>(even, odd);
2206
+ #endif
1298
2207
  }
1299
2208
  #define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A) \
1300
- Packet4f a_even = Bf16ToF32Even(A);\
1301
- Packet4f a_odd = Bf16ToF32Odd(A);\
1302
- Packet4f op_even = OP(a_even);\
1303
- Packet4f op_odd = OP(a_odd);\
1304
- return F32ToBf16(op_even, op_odd);\
2209
+ Packet4f a_even = Bf16ToF32Even(A); \
2210
+ Packet4f a_odd = Bf16ToF32Odd(A); \
2211
+ Packet4f op_even = OP(a_even); \
2212
+ Packet4f op_odd = OP(a_odd); \
2213
+ return F32ToBf16(op_even, op_odd);
1305
2214
 
1306
2215
  #define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \
1307
- Packet4f a_even = Bf16ToF32Even(A);\
1308
- Packet4f a_odd = Bf16ToF32Odd(A);\
1309
- Packet4f b_even = Bf16ToF32Even(B);\
1310
- Packet4f b_odd = Bf16ToF32Odd(B);\
1311
- Packet4f op_even = OP(a_even, b_even);\
1312
- Packet4f op_odd = OP(a_odd, b_odd);\
1313
- return F32ToBf16(op_even, op_odd);\
2216
+ Packet4f a_even = Bf16ToF32Even(A); \
2217
+ Packet4f a_odd = Bf16ToF32Odd(A); \
2218
+ Packet4f b_even = Bf16ToF32Even(B); \
2219
+ Packet4f b_odd = Bf16ToF32Odd(B); \
2220
+ Packet4f op_even = OP(a_even, b_even); \
2221
+ Packet4f op_odd = OP(a_odd, b_odd); \
2222
+ return F32ToBf16(op_even, op_odd);
1314
2223
 
1315
2224
  #define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \
1316
- Packet4f a_even = Bf16ToF32Even(A);\
1317
- Packet4f a_odd = Bf16ToF32Odd(A);\
1318
- Packet4f b_even = Bf16ToF32Even(B);\
1319
- Packet4f b_odd = Bf16ToF32Odd(B);\
1320
- Packet4f op_even = OP(a_even, b_even);\
1321
- Packet4f op_odd = OP(a_odd, b_odd);\
1322
- return F32ToBf16Bool(op_even, op_odd);\
1323
-
1324
- template<> EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2225
+ Packet4f a_even = Bf16ToF32Even(A); \
2226
+ Packet4f a_odd = Bf16ToF32Odd(A); \
2227
+ Packet4f b_even = Bf16ToF32Even(B); \
2228
+ Packet4f b_odd = Bf16ToF32Odd(B); \
2229
+ Packet4f op_even = OP(a_even, b_even); \
2230
+ Packet4f op_odd = OP(a_odd, b_odd); \
2231
+ return F32ToBf16Bool(op_even, op_odd);
2232
+
2233
+ template <>
2234
+ EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1325
2235
  BF16_TO_F32_BINARY_OP_WRAPPER(padd<Packet4f>, a, b);
1326
2236
  }
1327
2237
 
1328
- template<> EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2238
+ template <>
2239
+ EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1329
2240
  BF16_TO_F32_BINARY_OP_WRAPPER(pmul<Packet4f>, a, b);
1330
2241
  }
1331
2242
 
1332
- template<> EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2243
+ template <>
2244
+ EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1333
2245
  BF16_TO_F32_BINARY_OP_WRAPPER(pdiv<Packet4f>, a, b);
1334
2246
  }
1335
2247
 
1336
- template<> EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(const Packet8bf& a) {
1337
- BF16_TO_F32_UNARY_OP_WRAPPER(pnegate<Packet4f>, a);
2248
+ template <>
2249
+ EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(const Packet8bf& a) {
2250
+ EIGEN_DECLARE_CONST_FAST_Packet8us(neg_mask, 0x8000);
2251
+ return pxor<Packet8us>(p8us_neg_mask, a);
1338
2252
  }
1339
2253
 
1340
- template<> EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2254
+ template <>
2255
+ EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1341
2256
  BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b);
1342
2257
  }
1343
2258
 
1344
- template<> EIGEN_STRONG_INLINE Packet8bf psqrt<Packet8bf> (const Packet8bf& a){
1345
- BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a);
1346
- }
1347
- template<> EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf> (const Packet8bf& a){
1348
- BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt<Packet4f>, a);
1349
- }
1350
- template<> EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf> (const Packet8bf& a){
2259
+ template <>
2260
+ EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf>(const Packet8bf& a) {
1351
2261
  BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
1352
2262
  }
1353
2263
 
1354
- template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
1355
- return pldexp_generic(a,exponent);
2264
+ template <>
2265
+ EIGEN_STRONG_INLINE Packet8bf pexp2<Packet8bf>(const Packet8bf& a) {
2266
+ BF16_TO_F32_UNARY_OP_WRAPPER(generic_exp2, a);
2267
+ }
2268
+
2269
+ template <>
2270
+ EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
2271
+ return pldexp_generic(a, exponent);
1356
2272
  }
1357
- template<> EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf> (const Packet8bf& a, const Packet8bf& exponent){
2273
+ template <>
2274
+ EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf>(const Packet8bf& a, const Packet8bf& exponent) {
1358
2275
  BF16_TO_F32_BINARY_OP_WRAPPER(pldexp<Packet4f>, a, exponent);
1359
2276
  }
1360
2277
 
1361
- template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
1362
- return pfrexp_generic(a,exponent);
2278
+ template <>
2279
+ EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
2280
+ return pfrexp_generic(a, exponent);
1363
2281
  }
1364
- template<> EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf> (const Packet8bf& a, Packet8bf& e){
2282
+ template <>
2283
+ EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf>(const Packet8bf& a, Packet8bf& e) {
1365
2284
  Packet4f a_even = Bf16ToF32Even(a);
1366
2285
  Packet4f a_odd = Bf16ToF32Odd(a);
1367
2286
  Packet4f e_even;
@@ -1372,28 +2291,42 @@ template<> EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf> (const Packet8bf& a,
1372
2291
  return F32ToBf16(op_even, op_odd);
1373
2292
  }
1374
2293
 
1375
- template<> EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf> (const Packet8bf& a){
2294
+ template <>
2295
+ EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf>(const Packet8bf& a) {
1376
2296
  BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a);
1377
2297
  }
1378
- template<> EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf> (const Packet8bf& a){
2298
+ template <>
2299
+ EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf>(const Packet8bf& a) {
1379
2300
  BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a);
1380
2301
  }
1381
- template<> EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf> (const Packet8bf& a){
2302
+ template <>
2303
+ EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf>(const Packet8bf& a) {
1382
2304
  BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a);
1383
2305
  }
1384
- template<> EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf> (const Packet8bf& a){
2306
+ template <>
2307
+ EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf>(const Packet8bf& a) {
1385
2308
  BF16_TO_F32_UNARY_OP_WRAPPER(pfloor<Packet4f>, a);
1386
2309
  }
1387
- template<> EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf> (const Packet8bf& a){
2310
+ template <>
2311
+ EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf>(const Packet8bf& a) {
1388
2312
  BF16_TO_F32_UNARY_OP_WRAPPER(pceil<Packet4f>, a);
1389
2313
  }
1390
- template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf> (const Packet8bf& a){
2314
+ template <>
2315
+ EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf>(const Packet8bf& a) {
1391
2316
  BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
1392
2317
  }
1393
- template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf> (const Packet8bf& a){
2318
+ template <>
2319
+ EIGEN_STRONG_INLINE Packet8bf ptrunc<Packet8bf>(const Packet8bf& a) {
2320
+ BF16_TO_F32_UNARY_OP_WRAPPER(ptrunc<Packet4f>, a);
2321
+ }
2322
+ #ifdef EIGEN_VECTORIZE_VSX
2323
+ template <>
2324
+ EIGEN_STRONG_INLINE Packet8bf print<Packet8bf>(const Packet8bf& a) {
1394
2325
  BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
1395
2326
  }
1396
- template<> EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2327
+ #endif
2328
+ template <>
2329
+ EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
1397
2330
  Packet4f a_even = Bf16ToF32Even(a);
1398
2331
  Packet4f a_odd = Bf16ToF32Odd(a);
1399
2332
  Packet4f b_even = Bf16ToF32Even(b);
@@ -1405,147 +2338,191 @@ template<> EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8
1405
2338
  return F32ToBf16(pmadd_even, pmadd_odd);
1406
2339
  }
1407
2340
 
1408
- template<> EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2341
+ template <>
2342
+ EIGEN_STRONG_INLINE Packet8bf pmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2343
+ Packet4f a_even = Bf16ToF32Even(a);
2344
+ Packet4f a_odd = Bf16ToF32Odd(a);
2345
+ Packet4f b_even = Bf16ToF32Even(b);
2346
+ Packet4f b_odd = Bf16ToF32Odd(b);
2347
+ Packet4f c_even = Bf16ToF32Even(c);
2348
+ Packet4f c_odd = Bf16ToF32Odd(c);
2349
+ Packet4f pmadd_even = pmsub<Packet4f>(a_even, b_even, c_even);
2350
+ Packet4f pmadd_odd = pmsub<Packet4f>(a_odd, b_odd, c_odd);
2351
+ return F32ToBf16(pmadd_even, pmadd_odd);
2352
+ }
2353
+ template <>
2354
+ EIGEN_STRONG_INLINE Packet8bf pnmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2355
+ Packet4f a_even = Bf16ToF32Even(a);
2356
+ Packet4f a_odd = Bf16ToF32Odd(a);
2357
+ Packet4f b_even = Bf16ToF32Even(b);
2358
+ Packet4f b_odd = Bf16ToF32Odd(b);
2359
+ Packet4f c_even = Bf16ToF32Even(c);
2360
+ Packet4f c_odd = Bf16ToF32Odd(c);
2361
+ Packet4f pmadd_even = pnmadd<Packet4f>(a_even, b_even, c_even);
2362
+ Packet4f pmadd_odd = pnmadd<Packet4f>(a_odd, b_odd, c_odd);
2363
+ return F32ToBf16(pmadd_even, pmadd_odd);
2364
+ }
2365
+
2366
+ template <>
2367
+ EIGEN_STRONG_INLINE Packet8bf pnmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2368
+ Packet4f a_even = Bf16ToF32Even(a);
2369
+ Packet4f a_odd = Bf16ToF32Odd(a);
2370
+ Packet4f b_even = Bf16ToF32Even(b);
2371
+ Packet4f b_odd = Bf16ToF32Odd(b);
2372
+ Packet4f c_even = Bf16ToF32Even(c);
2373
+ Packet4f c_odd = Bf16ToF32Odd(c);
2374
+ Packet4f pmadd_even = pnmsub<Packet4f>(a_even, b_even, c_even);
2375
+ Packet4f pmadd_odd = pnmsub<Packet4f>(a_odd, b_odd, c_odd);
2376
+ return F32ToBf16(pmadd_even, pmadd_odd);
2377
+ }
2378
+
2379
+ template <>
2380
+ EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1409
2381
  BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b);
1410
2382
  }
1411
2383
 
1412
- template<> EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2384
+ template <>
2385
+ EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1413
2386
  BF16_TO_F32_BINARY_OP_WRAPPER(pmax<Packet4f>, a, b);
1414
2387
  }
1415
2388
 
1416
- template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
2389
+ template <>
2390
+ EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
1417
2391
  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt<Packet4f>, a, b);
1418
2392
  }
1419
- template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
2393
+ template <>
2394
+ EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
1420
2395
  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan<Packet4f>, a, b);
1421
2396
  }
1422
- template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
2397
+ template <>
2398
+ EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
1423
2399
  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le<Packet4f>, a, b);
1424
2400
  }
1425
- template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
2401
+ template <>
2402
+ EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
1426
2403
  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_eq<Packet4f>, a, b);
1427
2404
  }
1428
2405
 
1429
- template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) {
2406
+ template <>
2407
+ EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) {
1430
2408
  return Eigen::bfloat16_impl::raw_uint16_to_bfloat16((pfirst<Packet8us>(a)));
1431
2409
  }
1432
2410
 
1433
- template<> EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from)
1434
- {
2411
+ template <>
2412
+ EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from) {
1435
2413
  return ploaddup<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
1436
2414
  }
1437
2415
 
1438
- template<> EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
1439
- bfloat16 countdown[8] = { bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3),
1440
- bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7) };
2416
+ template <>
2417
+ EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
2418
+ bfloat16 countdown[8] = {bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3),
2419
+ bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7)};
1441
2420
  return padd<Packet8bf>(pset1<Packet8bf>(a), pload<Packet8bf>(countdown));
1442
2421
  }
1443
2422
 
1444
- template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
1445
- {
2423
+ template <>
2424
+ EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
1446
2425
  Packet4f b, sum;
1447
- b = vec_sld(a, a, 8);
2426
+ b = vec_sld(a, a, 8);
1448
2427
  sum = a + b;
1449
- b = vec_sld(sum, sum, 4);
2428
+ b = vec_sld(sum, sum, 4);
1450
2429
  sum += b;
1451
2430
  return pfirst(sum);
1452
2431
  }
1453
2432
 
1454
- template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
1455
- {
1456
- Packet4i sum;
1457
- sum = vec_sums(a, p4i_ZERO);
1458
- #ifdef _BIG_ENDIAN
1459
- sum = vec_sld(sum, p4i_ZERO, 12);
1460
- #else
1461
- sum = vec_sld(p4i_ZERO, sum, 4);
1462
- #endif
2433
+ template <>
2434
+ EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
2435
+ Packet4i b, sum;
2436
+ b = vec_sld(a, a, 8);
2437
+ sum = a + b;
2438
+ b = vec_sld(sum, sum, 4);
2439
+ sum += b;
1463
2440
  return pfirst(sum);
1464
2441
  }
1465
2442
 
1466
- template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a)
1467
- {
2443
+ template <>
2444
+ EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) {
1468
2445
  float redux_even = predux<Packet4f>(Bf16ToF32Even(a));
1469
- float redux_odd = predux<Packet4f>(Bf16ToF32Odd(a));
2446
+ float redux_odd = predux<Packet4f>(Bf16ToF32Odd(a));
1470
2447
  float f32_result = redux_even + redux_odd;
1471
2448
  return bfloat16(f32_result);
1472
2449
  }
1473
- template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a)
1474
- {
1475
- union{
2450
+ template <typename Packet>
2451
+ EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a) {
2452
+ union {
1476
2453
  Packet v;
1477
2454
  __UNPACK_TYPE__(Packet) n[8];
1478
2455
  } vt;
1479
2456
  vt.v = a;
1480
2457
 
1481
- EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
1482
- EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
1483
- Packet4i first_half = pload<Packet4i>(first_loader);
2458
+ EIGEN_ALIGN16 int first_loader[4] = {vt.n[0], vt.n[1], vt.n[2], vt.n[3]};
2459
+ EIGEN_ALIGN16 int second_loader[4] = {vt.n[4], vt.n[5], vt.n[6], vt.n[7]};
2460
+ Packet4i first_half = pload<Packet4i>(first_loader);
1484
2461
  Packet4i second_half = pload<Packet4i>(second_loader);
1485
2462
 
1486
2463
  return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_half) + predux(second_half));
1487
2464
  }
1488
2465
 
1489
- template<> EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a)
1490
- {
2466
+ template <>
2467
+ EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a) {
1491
2468
  return predux_size8<Packet8s>(a);
1492
2469
  }
1493
2470
 
1494
- template<> EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a)
1495
- {
2471
+ template <>
2472
+ EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a) {
1496
2473
  return predux_size8<Packet8us>(a);
1497
2474
  }
1498
2475
 
1499
- template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a)
1500
- {
1501
- union{
2476
+ template <typename Packet>
2477
+ EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a) {
2478
+ union {
1502
2479
  Packet v;
1503
2480
  __UNPACK_TYPE__(Packet) n[16];
1504
2481
  } vt;
1505
2482
  vt.v = a;
1506
2483
 
1507
- EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
1508
- EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
1509
- EIGEN_ALIGN16 int third_loader[4] = { vt.n[8], vt.n[9], vt.n[10], vt.n[11] };
1510
- EIGEN_ALIGN16 int fourth_loader[4] = { vt.n[12], vt.n[13], vt.n[14], vt.n[15] };
2484
+ EIGEN_ALIGN16 int first_loader[4] = {vt.n[0], vt.n[1], vt.n[2], vt.n[3]};
2485
+ EIGEN_ALIGN16 int second_loader[4] = {vt.n[4], vt.n[5], vt.n[6], vt.n[7]};
2486
+ EIGEN_ALIGN16 int third_loader[4] = {vt.n[8], vt.n[9], vt.n[10], vt.n[11]};
2487
+ EIGEN_ALIGN16 int fourth_loader[4] = {vt.n[12], vt.n[13], vt.n[14], vt.n[15]};
1511
2488
 
1512
2489
  Packet4i first_quarter = pload<Packet4i>(first_loader);
1513
2490
  Packet4i second_quarter = pload<Packet4i>(second_loader);
1514
2491
  Packet4i third_quarter = pload<Packet4i>(third_loader);
1515
2492
  Packet4i fourth_quarter = pload<Packet4i>(fourth_loader);
1516
2493
 
1517
- return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter)
1518
- + predux(third_quarter) + predux(fourth_quarter));
2494
+ return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter) + predux(third_quarter) +
2495
+ predux(fourth_quarter));
1519
2496
  }
1520
2497
 
1521
- template<> EIGEN_STRONG_INLINE signed char predux<Packet16c>(const Packet16c& a)
1522
- {
2498
+ template <>
2499
+ EIGEN_STRONG_INLINE signed char predux<Packet16c>(const Packet16c& a) {
1523
2500
  return predux_size16<Packet16c>(a);
1524
2501
  }
1525
2502
 
1526
- template<> EIGEN_STRONG_INLINE unsigned char predux<Packet16uc>(const Packet16uc& a)
1527
- {
2503
+ template <>
2504
+ EIGEN_STRONG_INLINE unsigned char predux<Packet16uc>(const Packet16uc& a) {
1528
2505
  return predux_size16<Packet16uc>(a);
1529
2506
  }
1530
2507
 
1531
2508
  // Other reduction functions:
1532
2509
  // mul
1533
- template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
1534
- {
2510
+ template <>
2511
+ EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
1535
2512
  Packet4f prod;
1536
2513
  prod = pmul(a, vec_sld(a, a, 8));
1537
2514
  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
1538
2515
  }
1539
2516
 
1540
- template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
1541
- {
2517
+ template <>
2518
+ EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
1542
2519
  EIGEN_ALIGN16 int aux[4];
1543
2520
  pstore(aux, a);
1544
2521
  return aux[0] * aux[1] * aux[2] * aux[3];
1545
2522
  }
1546
2523
 
1547
- template<> EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a)
1548
- {
2524
+ template <>
2525
+ EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a) {
1549
2526
  Packet8s pair, quad, octo;
1550
2527
 
1551
2528
  pair = vec_mul(a, vec_sld(a, a, 8));
@@ -1555,8 +2532,8 @@ template<> EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a)
1555
2532
  return pfirst(octo);
1556
2533
  }
1557
2534
 
1558
- template<> EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a)
1559
- {
2535
+ template <>
2536
+ EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a) {
1560
2537
  Packet8us pair, quad, octo;
1561
2538
 
1562
2539
  pair = vec_mul(a, vec_sld(a, a, 8));
@@ -1566,17 +2543,16 @@ template<> EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Pa
1566
2543
  return pfirst(octo);
1567
2544
  }
1568
2545
 
1569
- template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a)
1570
- {
2546
+ template <>
2547
+ EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) {
1571
2548
  float redux_even = predux_mul<Packet4f>(Bf16ToF32Even(a));
1572
- float redux_odd = predux_mul<Packet4f>(Bf16ToF32Odd(a));
2549
+ float redux_odd = predux_mul<Packet4f>(Bf16ToF32Odd(a));
1573
2550
  float f32_result = redux_even * redux_odd;
1574
2551
  return bfloat16(f32_result);
1575
2552
  }
1576
2553
 
1577
-
1578
- template<> EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c& a)
1579
- {
2554
+ template <>
2555
+ EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c& a) {
1580
2556
  Packet16c pair, quad, octo, result;
1581
2557
 
1582
2558
  pair = vec_mul(a, vec_sld(a, a, 8));
@@ -1587,8 +2563,8 @@ template<> EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c
1587
2563
  return pfirst(result);
1588
2564
  }
1589
2565
 
1590
- template<> EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet16uc& a)
1591
- {
2566
+ template <>
2567
+ EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet16uc& a) {
1592
2568
  Packet16uc pair, quad, octo, result;
1593
2569
 
1594
2570
  pair = vec_mul(a, vec_sld(a, a, 8));
@@ -1600,66 +2576,64 @@ template<> EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet
1600
2576
  }
1601
2577
 
1602
2578
  // min
1603
- template<typename Packet> EIGEN_STRONG_INLINE
1604
- __UNPACK_TYPE__(Packet) predux_min4(const Packet& a)
1605
- {
2579
+ template <typename Packet>
2580
+ EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_min4(const Packet& a) {
1606
2581
  Packet b, res;
1607
2582
  b = vec_min(a, vec_sld(a, a, 8));
1608
2583
  res = vec_min(b, vec_sld(b, b, 4));
1609
2584
  return pfirst(res);
1610
2585
  }
1611
2586
 
1612
-
1613
- template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
1614
- {
2587
+ template <>
2588
+ EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
1615
2589
  return predux_min4<Packet4f>(a);
1616
2590
  }
1617
2591
 
1618
- template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
1619
- {
2592
+ template <>
2593
+ EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
1620
2594
  return predux_min4<Packet4i>(a);
1621
2595
  }
1622
2596
 
1623
- template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a)
1624
- {
2597
+ template <>
2598
+ EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) {
1625
2599
  float redux_even = predux_min<Packet4f>(Bf16ToF32Even(a));
1626
- float redux_odd = predux_min<Packet4f>(Bf16ToF32Odd(a));
2600
+ float redux_odd = predux_min<Packet4f>(Bf16ToF32Odd(a));
1627
2601
  float f32_result = (std::min)(redux_even, redux_odd);
1628
2602
  return bfloat16(f32_result);
1629
2603
  }
1630
2604
 
1631
- template<> EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a)
1632
- {
2605
+ template <>
2606
+ EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a) {
1633
2607
  Packet8s pair, quad, octo;
1634
-
1635
- //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
1636
- pair = vec_min(a, vec_sld(a, a, 8));
1637
2608
 
1638
- //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
2609
+ // pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
2610
+ pair = vec_min(a, vec_sld(a, a, 8));
2611
+
2612
+ // quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
1639
2613
  quad = vec_min(pair, vec_sld(pair, pair, 4));
1640
2614
 
1641
- //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
2615
+ // octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
1642
2616
  octo = vec_min(quad, vec_sld(quad, quad, 2));
1643
2617
  return pfirst(octo);
1644
2618
  }
1645
2619
 
1646
- template<> EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a)
1647
- {
2620
+ template <>
2621
+ EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a) {
1648
2622
  Packet8us pair, quad, octo;
1649
-
1650
- //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
1651
- pair = vec_min(a, vec_sld(a, a, 8));
1652
2623
 
1653
- //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
2624
+ // pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
2625
+ pair = vec_min(a, vec_sld(a, a, 8));
2626
+
2627
+ // quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
1654
2628
  quad = vec_min(pair, vec_sld(pair, pair, 4));
1655
2629
 
1656
- //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
2630
+ // octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
1657
2631
  octo = vec_min(quad, vec_sld(quad, quad, 2));
1658
2632
  return pfirst(octo);
1659
2633
  }
1660
2634
 
1661
- template<> EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c& a)
1662
- {
2635
+ template <>
2636
+ EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c& a) {
1663
2637
  Packet16c pair, quad, octo, result;
1664
2638
 
1665
2639
  pair = vec_min(a, vec_sld(a, a, 8));
@@ -1670,8 +2644,8 @@ template<> EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c
1670
2644
  return pfirst(result);
1671
2645
  }
1672
2646
 
1673
- template<> EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet16uc& a)
1674
- {
2647
+ template <>
2648
+ EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet16uc& a) {
1675
2649
  Packet16uc pair, quad, octo, result;
1676
2650
 
1677
2651
  pair = vec_min(a, vec_sld(a, a, 8));
@@ -1682,64 +2656,64 @@ template<> EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet
1682
2656
  return pfirst(result);
1683
2657
  }
1684
2658
  // max
1685
- template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a)
1686
- {
2659
+ template <typename Packet>
2660
+ EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a) {
1687
2661
  Packet b, res;
1688
2662
  b = vec_max(a, vec_sld(a, a, 8));
1689
2663
  res = vec_max(b, vec_sld(b, b, 4));
1690
2664
  return pfirst(res);
1691
2665
  }
1692
2666
 
1693
- template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
1694
- {
2667
+ template <>
2668
+ EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
1695
2669
  return predux_max4<Packet4f>(a);
1696
2670
  }
1697
2671
 
1698
- template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
1699
- {
2672
+ template <>
2673
+ EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
1700
2674
  return predux_max4<Packet4i>(a);
1701
2675
  }
1702
2676
 
1703
- template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a)
1704
- {
2677
+ template <>
2678
+ EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) {
1705
2679
  float redux_even = predux_max<Packet4f>(Bf16ToF32Even(a));
1706
- float redux_odd = predux_max<Packet4f>(Bf16ToF32Odd(a));
2680
+ float redux_odd = predux_max<Packet4f>(Bf16ToF32Odd(a));
1707
2681
  float f32_result = (std::max)(redux_even, redux_odd);
1708
2682
  return bfloat16(f32_result);
1709
2683
  }
1710
2684
 
1711
- template<> EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a)
1712
- {
2685
+ template <>
2686
+ EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a) {
1713
2687
  Packet8s pair, quad, octo;
1714
-
1715
- //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
1716
- pair = vec_max(a, vec_sld(a, a, 8));
1717
2688
 
1718
- //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
2689
+ // pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
2690
+ pair = vec_max(a, vec_sld(a, a, 8));
2691
+
2692
+ // quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
1719
2693
  quad = vec_max(pair, vec_sld(pair, pair, 4));
1720
2694
 
1721
- //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
2695
+ // octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
1722
2696
  octo = vec_max(quad, vec_sld(quad, quad, 2));
1723
2697
  return pfirst(octo);
1724
2698
  }
1725
2699
 
1726
- template<> EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a)
1727
- {
2700
+ template <>
2701
+ EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a) {
1728
2702
  Packet8us pair, quad, octo;
1729
-
1730
- //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
1731
- pair = vec_max(a, vec_sld(a, a, 8));
1732
2703
 
1733
- //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
2704
+ // pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
2705
+ pair = vec_max(a, vec_sld(a, a, 8));
2706
+
2707
+ // quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
1734
2708
  quad = vec_max(pair, vec_sld(pair, pair, 4));
1735
2709
 
1736
- //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
2710
+ // octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
1737
2711
  octo = vec_max(quad, vec_sld(quad, quad, 2));
1738
2712
  return pfirst(octo);
1739
2713
  }
1740
2714
 
1741
- template<> EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c& a)
1742
- {
2715
+ template <>
2716
+ EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c& a) {
1743
2717
  Packet16c pair, quad, octo, result;
1744
2718
 
1745
2719
  pair = vec_max(a, vec_sld(a, a, 8));
@@ -1750,8 +2724,8 @@ template<> EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c
1750
2724
  return pfirst(result);
1751
2725
  }
1752
2726
 
1753
- template<> EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet16uc& a)
1754
- {
2727
+ template <>
2728
+ EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet16uc& a) {
1755
2729
  Packet16uc pair, quad, octo, result;
1756
2730
 
1757
2731
  pair = vec_max(a, vec_sld(a, a, 8));
@@ -1762,13 +2736,13 @@ template<> EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet
1762
2736
  return pfirst(result);
1763
2737
  }
1764
2738
 
1765
- template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
1766
- {
2739
+ template <>
2740
+ EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
1767
2741
  return vec_any_ne(x, pzero(x));
1768
2742
  }
1769
2743
 
1770
- template <typename T> EIGEN_DEVICE_FUNC inline void
1771
- ptranpose_common(PacketBlock<T,4>& kernel){
2744
+ template <typename T>
2745
+ EIGEN_DEVICE_FUNC inline void ptranpose_common(PacketBlock<T, 4>& kernel) {
1772
2746
  T t0, t1, t2, t3;
1773
2747
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1774
2748
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
@@ -1780,18 +2754,11 @@ ptranpose_common(PacketBlock<T,4>& kernel){
1780
2754
  kernel.packet[3] = vec_mergel(t1, t3);
1781
2755
  }
1782
2756
 
1783
- EIGEN_DEVICE_FUNC inline void
1784
- ptranspose(PacketBlock<Packet4f,4>& kernel) {
1785
- ptranpose_common<Packet4f>(kernel);
1786
- }
2757
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) { ptranpose_common<Packet4f>(kernel); }
1787
2758
 
1788
- EIGEN_DEVICE_FUNC inline void
1789
- ptranspose(PacketBlock<Packet4i,4>& kernel) {
1790
- ptranpose_common<Packet4i>(kernel);
1791
- }
2759
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) { ptranpose_common<Packet4i>(kernel); }
1792
2760
 
1793
- EIGEN_DEVICE_FUNC inline void
1794
- ptranspose(PacketBlock<Packet8s,4>& kernel) {
2761
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
1795
2762
  Packet8s t0, t1, t2, t3;
1796
2763
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1797
2764
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
@@ -1803,8 +2770,7 @@ ptranspose(PacketBlock<Packet8s,4>& kernel) {
1803
2770
  kernel.packet[3] = vec_mergel(t1, t3);
1804
2771
  }
1805
2772
 
1806
- EIGEN_DEVICE_FUNC inline void
1807
- ptranspose(PacketBlock<Packet8us,4>& kernel) {
2773
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
1808
2774
  Packet8us t0, t1, t2, t3;
1809
2775
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1810
2776
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
@@ -1816,9 +2782,7 @@ ptranspose(PacketBlock<Packet8us,4>& kernel) {
1816
2782
  kernel.packet[3] = vec_mergel(t1, t3);
1817
2783
  }
1818
2784
 
1819
-
1820
- EIGEN_DEVICE_FUNC inline void
1821
- ptranspose(PacketBlock<Packet8bf,4>& kernel) {
2785
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8bf, 4>& kernel) {
1822
2786
  Packet8us t0, t1, t2, t3;
1823
2787
 
1824
2788
  t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val);
@@ -1831,8 +2795,7 @@ ptranspose(PacketBlock<Packet8bf,4>& kernel) {
1831
2795
  kernel.packet[3] = vec_mergel(t1, t3);
1832
2796
  }
1833
2797
 
1834
- EIGEN_DEVICE_FUNC inline void
1835
- ptranspose(PacketBlock<Packet16c,4>& kernel) {
2798
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
1836
2799
  Packet16c t0, t1, t2, t3;
1837
2800
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1838
2801
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
@@ -1844,9 +2807,7 @@ ptranspose(PacketBlock<Packet16c,4>& kernel) {
1844
2807
  kernel.packet[3] = vec_mergel(t1, t3);
1845
2808
  }
1846
2809
 
1847
-
1848
- EIGEN_DEVICE_FUNC inline void
1849
- ptranspose(PacketBlock<Packet16uc,4>& kernel) {
2810
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
1850
2811
  Packet16uc t0, t1, t2, t3;
1851
2812
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1852
2813
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
@@ -1858,8 +2819,7 @@ ptranspose(PacketBlock<Packet16uc,4>& kernel) {
1858
2819
  kernel.packet[3] = vec_mergel(t1, t3);
1859
2820
  }
1860
2821
 
1861
- EIGEN_DEVICE_FUNC inline void
1862
- ptranspose(PacketBlock<Packet8s,8>& kernel) {
2822
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
1863
2823
  Packet8s v[8], sum[8];
1864
2824
 
1865
2825
  v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
@@ -1889,8 +2849,7 @@ ptranspose(PacketBlock<Packet8s,8>& kernel) {
1889
2849
  kernel.packet[7] = vec_mergel(sum[3], sum[7]);
1890
2850
  }
1891
2851
 
1892
- EIGEN_DEVICE_FUNC inline void
1893
- ptranspose(PacketBlock<Packet8us,8>& kernel) {
2852
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
1894
2853
  Packet8us v[8], sum[8];
1895
2854
 
1896
2855
  v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
@@ -1920,8 +2879,7 @@ ptranspose(PacketBlock<Packet8us,8>& kernel) {
1920
2879
  kernel.packet[7] = vec_mergel(sum[3], sum[7]);
1921
2880
  }
1922
2881
 
1923
- EIGEN_DEVICE_FUNC inline void
1924
- ptranspose(PacketBlock<Packet8bf,8>& kernel) {
2882
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8bf, 8>& kernel) {
1925
2883
  Packet8bf v[8], sum[8];
1926
2884
 
1927
2885
  v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val);
@@ -1951,8 +2909,7 @@ ptranspose(PacketBlock<Packet8bf,8>& kernel) {
1951
2909
  kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val);
1952
2910
  }
1953
2911
 
1954
- EIGEN_DEVICE_FUNC inline void
1955
- ptranspose(PacketBlock<Packet16c,16>& kernel) {
2912
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
1956
2913
  Packet16c step1[16], step2[16], step3[16];
1957
2914
 
1958
2915
  step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
@@ -1972,16 +2929,16 @@ ptranspose(PacketBlock<Packet16c,16>& kernel) {
1972
2929
  step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
1973
2930
  step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
1974
2931
 
1975
- step2[0] = vec_mergeh(step1[0], step1[8]);
1976
- step2[1] = vec_mergel(step1[0], step1[8]);
1977
- step2[2] = vec_mergeh(step1[1], step1[9]);
1978
- step2[3] = vec_mergel(step1[1], step1[9]);
1979
- step2[4] = vec_mergeh(step1[2], step1[10]);
1980
- step2[5] = vec_mergel(step1[2], step1[10]);
1981
- step2[6] = vec_mergeh(step1[3], step1[11]);
1982
- step2[7] = vec_mergel(step1[3], step1[11]);
1983
- step2[8] = vec_mergeh(step1[4], step1[12]);
1984
- step2[9] = vec_mergel(step1[4], step1[12]);
2932
+ step2[0] = vec_mergeh(step1[0], step1[8]);
2933
+ step2[1] = vec_mergel(step1[0], step1[8]);
2934
+ step2[2] = vec_mergeh(step1[1], step1[9]);
2935
+ step2[3] = vec_mergel(step1[1], step1[9]);
2936
+ step2[4] = vec_mergeh(step1[2], step1[10]);
2937
+ step2[5] = vec_mergel(step1[2], step1[10]);
2938
+ step2[6] = vec_mergeh(step1[3], step1[11]);
2939
+ step2[7] = vec_mergel(step1[3], step1[11]);
2940
+ step2[8] = vec_mergeh(step1[4], step1[12]);
2941
+ step2[9] = vec_mergel(step1[4], step1[12]);
1985
2942
  step2[10] = vec_mergeh(step1[5], step1[13]);
1986
2943
  step2[11] = vec_mergel(step1[5], step1[13]);
1987
2944
  step2[12] = vec_mergeh(step1[6], step1[14]);
@@ -1989,16 +2946,16 @@ ptranspose(PacketBlock<Packet16c,16>& kernel) {
1989
2946
  step2[14] = vec_mergeh(step1[7], step1[15]);
1990
2947
  step2[15] = vec_mergel(step1[7], step1[15]);
1991
2948
 
1992
- step3[0] = vec_mergeh(step2[0], step2[8]);
1993
- step3[1] = vec_mergel(step2[0], step2[8]);
1994
- step3[2] = vec_mergeh(step2[1], step2[9]);
1995
- step3[3] = vec_mergel(step2[1], step2[9]);
1996
- step3[4] = vec_mergeh(step2[2], step2[10]);
1997
- step3[5] = vec_mergel(step2[2], step2[10]);
1998
- step3[6] = vec_mergeh(step2[3], step2[11]);
1999
- step3[7] = vec_mergel(step2[3], step2[11]);
2000
- step3[8] = vec_mergeh(step2[4], step2[12]);
2001
- step3[9] = vec_mergel(step2[4], step2[12]);
2949
+ step3[0] = vec_mergeh(step2[0], step2[8]);
2950
+ step3[1] = vec_mergel(step2[0], step2[8]);
2951
+ step3[2] = vec_mergeh(step2[1], step2[9]);
2952
+ step3[3] = vec_mergel(step2[1], step2[9]);
2953
+ step3[4] = vec_mergeh(step2[2], step2[10]);
2954
+ step3[5] = vec_mergel(step2[2], step2[10]);
2955
+ step3[6] = vec_mergeh(step2[3], step2[11]);
2956
+ step3[7] = vec_mergel(step2[3], step2[11]);
2957
+ step3[8] = vec_mergeh(step2[4], step2[12]);
2958
+ step3[9] = vec_mergel(step2[4], step2[12]);
2002
2959
  step3[10] = vec_mergeh(step2[5], step2[13]);
2003
2960
  step3[11] = vec_mergel(step2[5], step2[13]);
2004
2961
  step3[12] = vec_mergeh(step2[6], step2[14]);
@@ -2006,16 +2963,16 @@ ptranspose(PacketBlock<Packet16c,16>& kernel) {
2006
2963
  step3[14] = vec_mergeh(step2[7], step2[15]);
2007
2964
  step3[15] = vec_mergel(step2[7], step2[15]);
2008
2965
 
2009
- kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
2010
- kernel.packet[1] = vec_mergel(step3[0], step3[8]);
2011
- kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
2012
- kernel.packet[3] = vec_mergel(step3[1], step3[9]);
2013
- kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
2014
- kernel.packet[5] = vec_mergel(step3[2], step3[10]);
2015
- kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
2016
- kernel.packet[7] = vec_mergel(step3[3], step3[11]);
2017
- kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
2018
- kernel.packet[9] = vec_mergel(step3[4], step3[12]);
2966
+ kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
2967
+ kernel.packet[1] = vec_mergel(step3[0], step3[8]);
2968
+ kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
2969
+ kernel.packet[3] = vec_mergel(step3[1], step3[9]);
2970
+ kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
2971
+ kernel.packet[5] = vec_mergel(step3[2], step3[10]);
2972
+ kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
2973
+ kernel.packet[7] = vec_mergel(step3[3], step3[11]);
2974
+ kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
2975
+ kernel.packet[9] = vec_mergel(step3[4], step3[12]);
2019
2976
  kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
2020
2977
  kernel.packet[11] = vec_mergel(step3[5], step3[13]);
2021
2978
  kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
@@ -2024,8 +2981,7 @@ ptranspose(PacketBlock<Packet16c,16>& kernel) {
2024
2981
  kernel.packet[15] = vec_mergel(step3[7], step3[15]);
2025
2982
  }
2026
2983
 
2027
- EIGEN_DEVICE_FUNC inline void
2028
- ptranspose(PacketBlock<Packet16uc,16>& kernel) {
2984
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
2029
2985
  Packet16uc step1[16], step2[16], step3[16];
2030
2986
 
2031
2987
  step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
@@ -2045,16 +3001,16 @@ ptranspose(PacketBlock<Packet16uc,16>& kernel) {
2045
3001
  step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
2046
3002
  step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
2047
3003
 
2048
- step2[0] = vec_mergeh(step1[0], step1[8]);
2049
- step2[1] = vec_mergel(step1[0], step1[8]);
2050
- step2[2] = vec_mergeh(step1[1], step1[9]);
2051
- step2[3] = vec_mergel(step1[1], step1[9]);
2052
- step2[4] = vec_mergeh(step1[2], step1[10]);
2053
- step2[5] = vec_mergel(step1[2], step1[10]);
2054
- step2[6] = vec_mergeh(step1[3], step1[11]);
2055
- step2[7] = vec_mergel(step1[3], step1[11]);
2056
- step2[8] = vec_mergeh(step1[4], step1[12]);
2057
- step2[9] = vec_mergel(step1[4], step1[12]);
3004
+ step2[0] = vec_mergeh(step1[0], step1[8]);
3005
+ step2[1] = vec_mergel(step1[0], step1[8]);
3006
+ step2[2] = vec_mergeh(step1[1], step1[9]);
3007
+ step2[3] = vec_mergel(step1[1], step1[9]);
3008
+ step2[4] = vec_mergeh(step1[2], step1[10]);
3009
+ step2[5] = vec_mergel(step1[2], step1[10]);
3010
+ step2[6] = vec_mergeh(step1[3], step1[11]);
3011
+ step2[7] = vec_mergel(step1[3], step1[11]);
3012
+ step2[8] = vec_mergeh(step1[4], step1[12]);
3013
+ step2[9] = vec_mergel(step1[4], step1[12]);
2058
3014
  step2[10] = vec_mergeh(step1[5], step1[13]);
2059
3015
  step2[11] = vec_mergel(step1[5], step1[13]);
2060
3016
  step2[12] = vec_mergeh(step1[6], step1[14]);
@@ -2062,16 +3018,16 @@ ptranspose(PacketBlock<Packet16uc,16>& kernel) {
2062
3018
  step2[14] = vec_mergeh(step1[7], step1[15]);
2063
3019
  step2[15] = vec_mergel(step1[7], step1[15]);
2064
3020
 
2065
- step3[0] = vec_mergeh(step2[0], step2[8]);
2066
- step3[1] = vec_mergel(step2[0], step2[8]);
2067
- step3[2] = vec_mergeh(step2[1], step2[9]);
2068
- step3[3] = vec_mergel(step2[1], step2[9]);
2069
- step3[4] = vec_mergeh(step2[2], step2[10]);
2070
- step3[5] = vec_mergel(step2[2], step2[10]);
2071
- step3[6] = vec_mergeh(step2[3], step2[11]);
2072
- step3[7] = vec_mergel(step2[3], step2[11]);
2073
- step3[8] = vec_mergeh(step2[4], step2[12]);
2074
- step3[9] = vec_mergel(step2[4], step2[12]);
3021
+ step3[0] = vec_mergeh(step2[0], step2[8]);
3022
+ step3[1] = vec_mergel(step2[0], step2[8]);
3023
+ step3[2] = vec_mergeh(step2[1], step2[9]);
3024
+ step3[3] = vec_mergel(step2[1], step2[9]);
3025
+ step3[4] = vec_mergeh(step2[2], step2[10]);
3026
+ step3[5] = vec_mergel(step2[2], step2[10]);
3027
+ step3[6] = vec_mergeh(step2[3], step2[11]);
3028
+ step3[7] = vec_mergel(step2[3], step2[11]);
3029
+ step3[8] = vec_mergeh(step2[4], step2[12]);
3030
+ step3[9] = vec_mergel(step2[4], step2[12]);
2075
3031
  step3[10] = vec_mergeh(step2[5], step2[13]);
2076
3032
  step3[11] = vec_mergel(step2[5], step2[13]);
2077
3033
  step3[12] = vec_mergeh(step2[6], step2[14]);
@@ -2079,16 +3035,16 @@ ptranspose(PacketBlock<Packet16uc,16>& kernel) {
2079
3035
  step3[14] = vec_mergeh(step2[7], step2[15]);
2080
3036
  step3[15] = vec_mergel(step2[7], step2[15]);
2081
3037
 
2082
- kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
2083
- kernel.packet[1] = vec_mergel(step3[0], step3[8]);
2084
- kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
2085
- kernel.packet[3] = vec_mergel(step3[1], step3[9]);
2086
- kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
2087
- kernel.packet[5] = vec_mergel(step3[2], step3[10]);
2088
- kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
2089
- kernel.packet[7] = vec_mergel(step3[3], step3[11]);
2090
- kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
2091
- kernel.packet[9] = vec_mergel(step3[4], step3[12]);
3038
+ kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
3039
+ kernel.packet[1] = vec_mergel(step3[0], step3[8]);
3040
+ kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
3041
+ kernel.packet[3] = vec_mergel(step3[1], step3[9]);
3042
+ kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
3043
+ kernel.packet[5] = vec_mergel(step3[2], step3[10]);
3044
+ kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
3045
+ kernel.packet[7] = vec_mergel(step3[3], step3[11]);
3046
+ kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
3047
+ kernel.packet[9] = vec_mergel(step3[4], step3[12]);
2092
3048
  kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
2093
3049
  kernel.packet[11] = vec_mergel(step3[5], step3[13]);
2094
3050
  kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
@@ -2097,229 +3053,173 @@ ptranspose(PacketBlock<Packet16uc,16>& kernel) {
2097
3053
  kernel.packet[15] = vec_mergel(step3[7], step3[15]);
2098
3054
  }
2099
3055
 
2100
- template<typename Packet> EIGEN_STRONG_INLINE
2101
- Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
2102
- Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
2103
- Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
3056
+ template <typename Packet>
3057
+ EIGEN_STRONG_INLINE Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
3058
+ Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
3059
+ Packet4ui mask = reinterpret_cast<Packet4ui>(pnegate(reinterpret_cast<Packet4i>(select)));
2104
3060
  return vec_sel(elsePacket, thenPacket, mask);
2105
3061
  }
2106
3062
 
2107
- template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
3063
+ template <>
3064
+ EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
3065
+ const Packet4i& elsePacket) {
2108
3066
  return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket);
2109
3067
  }
2110
3068
 
2111
- template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
3069
+ template <>
3070
+ EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
3071
+ const Packet4f& elsePacket) {
2112
3072
  return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket);
2113
3073
  }
2114
3074
 
2115
- template<> EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket, const Packet8s& elsePacket) {
2116
- Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
2117
- ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
2118
- Packet8us mask = reinterpret_cast<Packet8us>(vec_cmpeq(select, p8us_ONE));
3075
+ template <>
3076
+ EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket,
3077
+ const Packet8s& elsePacket) {
3078
+ Packet8us select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
3079
+ ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7]};
3080
+ Packet8us mask = reinterpret_cast<Packet8us>(pnegate(reinterpret_cast<Packet8s>(select)));
2119
3081
  Packet8s result = vec_sel(elsePacket, thenPacket, mask);
2120
3082
  return result;
2121
3083
  }
2122
3084
 
2123
- template<> EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket, const Packet8us& elsePacket) {
2124
- Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
2125
- ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
2126
- Packet8us mask = reinterpret_cast<Packet8us>(vec_cmpeq(reinterpret_cast<Packet8us>(select), p8us_ONE));
3085
+ template <>
3086
+ EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket,
3087
+ const Packet8us& elsePacket) {
3088
+ Packet8us select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
3089
+ ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7]};
3090
+ Packet8us mask = reinterpret_cast<Packet8us>(pnegate(reinterpret_cast<Packet8s>(select)));
2127
3091
  return vec_sel(elsePacket, thenPacket, mask);
2128
3092
  }
2129
3093
 
2130
- template<> EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket, const Packet8bf& elsePacket) {
3094
+ template <>
3095
+ EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket,
3096
+ const Packet8bf& elsePacket) {
2131
3097
  return pblend<Packet8us>(ifPacket, thenPacket, elsePacket);
2132
3098
  }
2133
3099
 
2134
- template<> EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket, const Packet16c& elsePacket) {
2135
- Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
2136
- ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
2137
- ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
2138
- ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
2139
-
2140
- Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE));
2141
- return vec_sel(elsePacket, thenPacket, mask);
2142
- }
2143
-
2144
- template<> EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket, const Packet16uc& elsePacket) {
2145
- Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
2146
- ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
2147
- ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
2148
- ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
2149
-
2150
- Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE));
3100
+ template <>
3101
+ EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket,
3102
+ const Packet16c& elsePacket) {
3103
+ Packet16uc select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
3104
+ ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
3105
+ ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
3106
+ ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15]};
3107
+
3108
+ Packet16uc mask = reinterpret_cast<Packet16uc>(pnegate(reinterpret_cast<Packet16c>(select)));
2151
3109
  return vec_sel(elsePacket, thenPacket, mask);
2152
3110
  }
2153
3111
 
2154
3112
  template <>
2155
- struct type_casting_traits<float, int> {
2156
- enum {
2157
- VectorizedCast = 1,
2158
- SrcCoeffRatio = 1,
2159
- TgtCoeffRatio = 1
2160
- };
2161
- };
2162
-
2163
- template <>
2164
- struct type_casting_traits<int, float> {
2165
- enum {
2166
- VectorizedCast = 1,
2167
- SrcCoeffRatio = 1,
2168
- TgtCoeffRatio = 1
2169
- };
2170
- };
2171
-
2172
- template <>
2173
- struct type_casting_traits<bfloat16, unsigned short int> {
2174
- enum {
2175
- VectorizedCast = 1,
2176
- SrcCoeffRatio = 1,
2177
- TgtCoeffRatio = 1
2178
- };
2179
- };
2180
-
2181
- template <>
2182
- struct type_casting_traits<unsigned short int, bfloat16> {
2183
- enum {
2184
- VectorizedCast = 1,
2185
- SrcCoeffRatio = 1,
2186
- TgtCoeffRatio = 1
2187
- };
2188
- };
2189
-
2190
- template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
2191
- return vec_cts(a,0);
2192
- }
2193
-
2194
- template<> EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f, Packet4ui>(const Packet4f& a) {
2195
- return vec_ctu(a,0);
2196
- }
2197
-
2198
- template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
2199
- return vec_ctf(a,0);
2200
- }
2201
-
2202
- template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui, Packet4f>(const Packet4ui& a) {
2203
- return vec_ctf(a,0);
2204
- }
2205
-
2206
- template<> EIGEN_STRONG_INLINE Packet8us pcast<Packet8bf, Packet8us>(const Packet8bf& a) {
2207
- Packet4f float_even = Bf16ToF32Even(a);
2208
- Packet4f float_odd = Bf16ToF32Odd(a);
2209
- Packet4ui int_even = pcast<Packet4f, Packet4ui>(float_even);
2210
- Packet4ui int_odd = pcast<Packet4f, Packet4ui>(float_odd);
2211
- const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF);
2212
- Packet4ui low_even = pand<Packet4ui>(int_even, p4ui_low_mask);
2213
- Packet4ui low_odd = pand<Packet4ui>(int_odd, p4ui_low_mask);
2214
-
2215
- //Check values that are bigger than USHRT_MAX (0xFFFF)
2216
- Packet4bi overflow_selector;
2217
- if(vec_any_gt(int_even, p4ui_low_mask)){
2218
- overflow_selector = vec_cmpgt(int_even, p4ui_low_mask);
2219
- low_even = vec_sel(low_even, p4ui_low_mask, overflow_selector);
2220
- }
2221
- if(vec_any_gt(int_odd, p4ui_low_mask)){
2222
- overflow_selector = vec_cmpgt(int_odd, p4ui_low_mask);
2223
- low_odd = vec_sel(low_even, p4ui_low_mask, overflow_selector);
2224
- }
2225
-
2226
- low_odd = plogical_shift_left<16>(low_odd);
2227
-
2228
- Packet4ui int_final = por<Packet4ui>(low_even, low_odd);
2229
- return reinterpret_cast<Packet8us>(int_final);
2230
- }
2231
-
2232
- template<> EIGEN_STRONG_INLINE Packet8bf pcast<Packet8us, Packet8bf>(const Packet8us& a) {
2233
- //short -> int -> float -> bfloat16
2234
- const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF);
2235
- Packet4ui int_cast = reinterpret_cast<Packet4ui>(a);
2236
- Packet4ui int_even = pand<Packet4ui>(int_cast, p4ui_low_mask);
2237
- Packet4ui int_odd = plogical_shift_right<16>(int_cast);
2238
- Packet4f float_even = pcast<Packet4ui, Packet4f>(int_even);
2239
- Packet4f float_odd = pcast<Packet4ui, Packet4f>(int_odd);
2240
- return F32ToBf16(float_even, float_odd);
2241
- }
2242
-
2243
-
2244
- template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
2245
- return reinterpret_cast<Packet4i>(a);
2246
- }
2247
-
2248
- template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
2249
- return reinterpret_cast<Packet4f>(a);
3113
+ EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket,
3114
+ const Packet16uc& elsePacket) {
3115
+ Packet16uc select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
3116
+ ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
3117
+ ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
3118
+ ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15]};
3119
+
3120
+ Packet16uc mask = reinterpret_cast<Packet16uc>(pnegate(reinterpret_cast<Packet16c>(select)));
3121
+ return vec_sel(elsePacket, thenPacket, mask);
2250
3122
  }
2251
3123
 
2252
-
2253
-
2254
3124
  //---------- double ----------
2255
- #ifdef __VSX__
2256
- typedef __vector double Packet2d;
2257
- typedef __vector unsigned long long Packet2ul;
2258
- typedef __vector long long Packet2l;
3125
+ #ifdef EIGEN_VECTORIZE_VSX
3126
+ typedef __vector double Packet2d;
3127
+ typedef __vector unsigned long long Packet2ul;
3128
+ typedef __vector long long Packet2l;
2259
3129
  #if EIGEN_COMP_CLANG
2260
- typedef Packet2ul Packet2bl;
3130
+ typedef Packet2ul Packet2bl;
2261
3131
  #else
2262
- typedef __vector __bool long Packet2bl;
3132
+ typedef __vector __bool long Packet2bl;
2263
3133
  #endif
2264
3134
 
2265
- static Packet2l p2l_ONE = { 1, 1 };
2266
- static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
2267
- static Packet2ul p2ul_SIGN = { 0x8000000000000000ull, 0x8000000000000000ull };
2268
- static Packet2ul p2ul_PREV0DOT5 = { 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull };
2269
- static Packet2d p2d_ONE = { 1.0, 1.0 };
2270
- static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
2271
- static Packet2d p2d_MZERO = { numext::bit_cast<double>(0x8000000000000000ull),
2272
- numext::bit_cast<double>(0x8000000000000000ull) };
3135
+ static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
3136
+ static Packet2ul p2ul_SIGN = {0x8000000000000000ull, 0x8000000000000000ull};
3137
+ static Packet2ul p2ul_PREV0DOT5 = {0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull};
3138
+ static Packet2d p2d_ONE = {1.0, 1.0};
3139
+ static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
3140
+ static Packet2d p2d_MZERO = {numext::bit_cast<double>(0x8000000000000000ull),
3141
+ numext::bit_cast<double>(0x8000000000000000ull)};
2273
3142
 
2274
3143
  #ifdef _BIG_ENDIAN
2275
- static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
3144
+ static Packet2d p2d_COUNTDOWN =
3145
+ reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
2276
3146
  #else
2277
- static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
3147
+ static Packet2d p2d_COUNTDOWN =
3148
+ reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
2278
3149
  #endif
2279
3150
 
2280
- template<int index> Packet2d vec_splat_dbl(Packet2d& a)
2281
- {
3151
+ template <int index>
3152
+ Packet2d vec_splat_dbl(Packet2d& a) {
2282
3153
  return vec_splat(a, index);
2283
3154
  }
2284
3155
 
2285
- template<> struct packet_traits<double> : default_packet_traits
2286
- {
3156
+ template <>
3157
+ struct packet_traits<double> : default_packet_traits {
2287
3158
  typedef Packet2d type;
2288
3159
  typedef Packet2d half;
2289
3160
  enum {
2290
3161
  Vectorizable = 1,
2291
3162
  AlignedOnScalar = 1,
2292
- size=2,
2293
- HasHalfPacket = 1,
2294
-
2295
- HasAdd = 1,
2296
- HasSub = 1,
2297
- HasMul = 1,
2298
- HasDiv = 1,
2299
- HasMin = 1,
2300
- HasMax = 1,
2301
- HasAbs = 1,
2302
- HasSin = 0,
2303
- HasCos = 0,
2304
- HasLog = 0,
2305
- HasExp = 1,
3163
+ size = 2,
3164
+
3165
+ HasAdd = 1,
3166
+ HasSub = 1,
3167
+ HasMul = 1,
3168
+ HasDiv = 1,
3169
+ HasMin = 1,
3170
+ HasMax = 1,
3171
+ HasAbs = 1,
3172
+ HasSin = EIGEN_FAST_MATH,
3173
+ HasCos = EIGEN_FAST_MATH,
3174
+ HasTanh = EIGEN_FAST_MATH,
3175
+ HasErf = EIGEN_FAST_MATH,
3176
+ HasErfc = EIGEN_FAST_MATH,
3177
+ HasATanh = 1,
3178
+ HasATan = 0,
3179
+ HasLog = 0,
3180
+ HasCmp = 1,
3181
+ HasExp = 1,
2306
3182
  HasSqrt = 1,
3183
+ HasCbrt = 1,
3184
+ #if !EIGEN_COMP_CLANG
2307
3185
  HasRsqrt = 1,
2308
- HasRound = 1,
2309
- HasFloor = 1,
2310
- HasCeil = 1,
2311
- HasRint = 1,
3186
+ #else
3187
+ HasRsqrt = 0,
3188
+ #endif
2312
3189
  HasNegate = 1,
2313
3190
  HasBlend = 1
2314
3191
  };
2315
3192
  };
2316
3193
 
2317
- template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; };
3194
+ template <>
3195
+ struct unpacket_traits<Packet2d> {
3196
+ typedef double type;
3197
+ typedef Packet2l integer_packet;
3198
+ enum {
3199
+ size = 2,
3200
+ alignment = Aligned16,
3201
+ vectorizable = true,
3202
+ masked_load_available = false,
3203
+ masked_store_available = false
3204
+ };
3205
+ typedef Packet2d half;
3206
+ };
3207
+ template <>
3208
+ struct unpacket_traits<Packet2l> {
3209
+ typedef int64_t type;
3210
+ typedef Packet2l half;
3211
+ enum {
3212
+ size = 2,
3213
+ alignment = Aligned16,
3214
+ vectorizable = false,
3215
+ masked_load_available = false,
3216
+ masked_store_available = false
3217
+ };
3218
+ };
2318
3219
 
2319
- inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
2320
- {
3220
+ inline std::ostream& operator<<(std::ostream& s, const Packet2l& v) {
2321
3221
  union {
2322
- Packet2l v;
3222
+ Packet2l v;
2323
3223
  int64_t n[2];
2324
3224
  } vt;
2325
3225
  vt.v = v;
@@ -2327,10 +3227,9 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
2327
3227
  return s;
2328
3228
  }
2329
3229
 
2330
- inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
2331
- {
3230
+ inline std::ostream& operator<<(std::ostream& s, const Packet2d& v) {
2332
3231
  union {
2333
- Packet2d v;
3232
+ Packet2d v;
2334
3233
  double n[2];
2335
3234
  } vt;
2336
3235
  vt.v = v;
@@ -2339,204 +3238,322 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
2339
3238
  }
2340
3239
 
2341
3240
  // Need to define them first or we get specialization after instantiation errors
2342
- template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
2343
- {
3241
+ template <>
3242
+ EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
2344
3243
  EIGEN_DEBUG_ALIGNED_LOAD
2345
- return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang
3244
+ return vec_xl(0, const_cast<double*>(from)); // cast needed by Clang
3245
+ }
3246
+
3247
+ template <>
3248
+ EIGEN_ALWAYS_INLINE Packet2d pload_partial<Packet2d>(const double* from, const Index n, const Index offset) {
3249
+ return pload_partial_common<Packet2d>(from, n, offset);
2346
3250
  }
2347
3251
 
2348
- template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
2349
- {
3252
+ template <>
3253
+ EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
2350
3254
  EIGEN_DEBUG_ALIGNED_STORE
2351
3255
  vec_xst(from, 0, to);
2352
3256
  }
2353
3257
 
2354
- template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
3258
+ template <>
3259
+ EIGEN_ALWAYS_INLINE void pstore_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset) {
3260
+ pstore_partial_common<Packet2d>(to, from, n, offset);
3261
+ }
3262
+
3263
+ template <>
3264
+ EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
2355
3265
  Packet2d v = {from, from};
2356
3266
  return v;
2357
3267
  }
3268
+ template <>
3269
+ EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
3270
+ Packet2l v = {from, from};
3271
+ return v;
3272
+ }
2358
3273
 
2359
- template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
3274
+ template <>
3275
+ EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
2360
3276
  Packet2l v = {static_cast<long long>(from), static_cast<long long>(from)};
2361
3277
  return reinterpret_cast<Packet2d>(v);
2362
3278
  }
2363
3279
 
2364
- template<> EIGEN_STRONG_INLINE void
2365
- pbroadcast4<Packet2d>(const double *a,
2366
- Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
2367
- {
2368
- //This way is faster than vec_splat (at least for doubles in Power 9)
3280
+ template <>
3281
+ EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
3282
+ Packet2d& a3) {
3283
+ // This way is faster than vec_splat (at least for doubles in Power 9)
2369
3284
  a0 = pset1<Packet2d>(a[0]);
2370
3285
  a1 = pset1<Packet2d>(a[1]);
2371
3286
  a2 = pset1<Packet2d>(a[2]);
2372
3287
  a3 = pset1<Packet2d>(a[3]);
2373
3288
  }
2374
3289
 
2375
- template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
2376
- {
2377
- EIGEN_ALIGN16 double af[2];
2378
- af[0] = from[0*stride];
2379
- af[1] = from[1*stride];
2380
- return pload<Packet2d>(af);
3290
+ template <>
3291
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
3292
+ return pgather_common<Packet2d>(from, stride);
3293
+ }
3294
+ template <>
3295
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather_partial<double, Packet2d>(const double* from, Index stride,
3296
+ const Index n) {
3297
+ return pgather_common<Packet2d>(from, stride, n);
3298
+ }
3299
+ template <>
3300
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
3301
+ pscatter_common<Packet2d>(to, from, stride);
2381
3302
  }
2382
- template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
2383
- {
2384
- EIGEN_ALIGN16 double af[2];
2385
- pstore<double>(af, from);
2386
- to[0*stride] = af[0];
2387
- to[1*stride] = af[1];
3303
+ template <>
3304
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<double, Packet2d>(double* to, const Packet2d& from,
3305
+ Index stride, const Index n) {
3306
+ pscatter_common<Packet2d>(to, from, stride, n);
2388
3307
  }
2389
3308
 
2390
- template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; }
3309
+ template <>
3310
+ EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
3311
+ return pset1<Packet2d>(a) + p2d_COUNTDOWN;
3312
+ }
2391
3313
 
2392
- template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return a + b; }
3314
+ template <>
3315
+ EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
3316
+ return a + b;
3317
+ }
2393
3318
 
2394
- template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }
3319
+ template <>
3320
+ EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
3321
+ return a - b;
3322
+ }
2395
3323
 
2396
- template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; }
3324
+ template <>
3325
+ EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
3326
+ #ifdef __POWER8_VECTOR__
3327
+ return vec_neg(a);
3328
+ #else
3329
+ return vec_xor(a, p2d_MZERO);
3330
+ #endif
3331
+ }
2397
3332
 
2398
- template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
3333
+ template <>
3334
+ EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
3335
+ return a;
3336
+ }
2399
3337
 
2400
- template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_MZERO); }
2401
- template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); }
3338
+ template <>
3339
+ EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
3340
+ return vec_madd(a, b, p2d_MZERO);
3341
+ }
3342
+ template <>
3343
+ EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
3344
+ return vec_div(a, b);
3345
+ }
2402
3346
 
2403
3347
  // for some weird raisons, it has to be overloaded for packet of integers
2404
- template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
3348
+ template <>
3349
+ EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
3350
+ return vec_madd(a, b, c);
3351
+ }
3352
+ template <>
3353
+ EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
3354
+ return vec_msub(a, b, c);
3355
+ }
3356
+ template <>
3357
+ EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
3358
+ return vec_nmsub(a, b, c);
3359
+ }
3360
+ template <>
3361
+ EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
3362
+ return vec_nmadd(a, b, c);
3363
+ }
2405
3364
 
2406
- template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
2407
- {
3365
+ template <>
3366
+ EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
2408
3367
  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
2409
3368
  Packet2d ret;
2410
- __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
3369
+ __asm__("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
2411
3370
  return ret;
2412
- }
3371
+ }
2413
3372
 
2414
- template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
2415
- {
3373
+ template <>
3374
+ EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
2416
3375
  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
2417
3376
  Packet2d ret;
2418
- __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
3377
+ __asm__("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
2419
3378
  return ret;
2420
3379
  }
2421
3380
 
2422
- template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmple(a,b)); }
2423
- template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmplt(a,b)); }
2424
- template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmpeq(a,b)); }
2425
- template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
2426
- Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a,b));
2427
- return vec_nor(c,c);
3381
+ template <>
3382
+ EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
3383
+ return reinterpret_cast<Packet2d>(vec_cmple(a, b));
3384
+ }
3385
+ template <>
3386
+ EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
3387
+ return reinterpret_cast<Packet2d>(vec_cmplt(a, b));
3388
+ }
3389
+ template <>
3390
+ EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
3391
+ return reinterpret_cast<Packet2d>(vec_cmpeq(a, b));
3392
+ }
3393
+ template <>
3394
+ #ifdef __POWER8_VECTOR__
3395
+ EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
3396
+ return reinterpret_cast<Packet2l>(vec_cmpeq(a, b));
3397
+ }
3398
+ #else
3399
+ EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
3400
+ Packet4i halves = reinterpret_cast<Packet4i>(vec_cmpeq(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(b)));
3401
+ Packet4i flipped = vec_perm(halves, halves, p16uc_COMPLEX32_REV);
3402
+ return reinterpret_cast<Packet2l>(pand(halves, flipped));
3403
+ }
3404
+ #endif
3405
+ template <>
3406
+ EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
3407
+ Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a, b));
3408
+ return vec_nor(c, c);
2428
3409
  }
2429
3410
 
2430
- template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
3411
+ template <>
3412
+ EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
3413
+ return vec_and(a, b);
3414
+ }
2431
3415
 
2432
- template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
3416
+ template <>
3417
+ EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
3418
+ return vec_or(a, b);
3419
+ }
2433
3420
 
2434
- template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
3421
+ template <>
3422
+ EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
3423
+ return vec_xor(a, b);
3424
+ }
2435
3425
 
2436
- template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
3426
+ template <>
3427
+ EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
3428
+ return vec_and(a, vec_nor(b, b));
3429
+ }
2437
3430
 
2438
- template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
2439
- {
2440
- Packet2d t = vec_add(reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
2441
- Packet2d res;
3431
+ template <>
3432
+ EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
3433
+ Packet2d t = vec_add(
3434
+ reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
3435
+ Packet2d res;
2442
3436
 
2443
- __asm__("xvrdpiz %x0, %x1\n\t"
2444
- : "=&wa" (res)
2445
- : "wa" (t));
3437
+ __asm__("xvrdpiz %x0, %x1\n\t" : "=&wa"(res) : "wa"(t));
2446
3438
 
2447
- return res;
3439
+ return res;
3440
+ }
3441
+ template <>
3442
+ EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
3443
+ return vec_ceil(a);
3444
+ }
3445
+ template <>
3446
+ EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
3447
+ return vec_floor(a);
3448
+ }
3449
+ template <>
3450
+ EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
3451
+ return vec_trunc(a);
2448
3452
  }
2449
- template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); }
2450
- template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
2451
- template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a)
2452
- {
2453
- Packet2d res;
3453
+ template <>
3454
+ EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
3455
+ Packet2d res;
2454
3456
 
2455
- __asm__("xvrdpic %x0, %x1\n\t"
2456
- : "=&wa" (res)
2457
- : "wa" (a));
3457
+ __asm__("xvrdpic %x0, %x1\n\t" : "=&wa"(res) : "wa"(a));
2458
3458
 
2459
- return res;
3459
+ return res;
2460
3460
  }
2461
3461
 
2462
- template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
2463
- {
3462
+ template <>
3463
+ EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
2464
3464
  EIGEN_DEBUG_UNALIGNED_LOAD
2465
3465
  return vec_xl(0, const_cast<double*>(from));
2466
3466
  }
2467
3467
 
2468
- template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
2469
- {
3468
+ template <>
3469
+ EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n, const Index offset) {
3470
+ return ploadu_partial_common<Packet2d>(from, n, offset);
3471
+ }
3472
+
3473
+ template <>
3474
+ EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
2470
3475
  Packet2d p;
2471
- if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet2d>(from);
2472
- else p = ploadu<Packet2d>(from);
3476
+ if ((std::ptrdiff_t(from) % 16) == 0)
3477
+ p = pload<Packet2d>(from);
3478
+ else
3479
+ p = ploadu<Packet2d>(from);
2473
3480
  return vec_splat_dbl<0>(p);
2474
3481
  }
2475
3482
 
2476
- template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
2477
- {
3483
+ template <>
3484
+ EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
2478
3485
  EIGEN_DEBUG_UNALIGNED_STORE
2479
3486
  vec_xst(from, 0, to);
2480
3487
  }
2481
3488
 
2482
- template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
3489
+ template <>
3490
+ EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset) {
3491
+ pstoreu_partial_common<Packet2d>(to, from, n, offset);
3492
+ }
2483
3493
 
2484
- template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }
3494
+ template <>
3495
+ EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
3496
+ EIGEN_PPC_PREFETCH(addr);
3497
+ }
2485
3498
 
2486
- template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
2487
- {
2488
- return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
3499
+ template <>
3500
+ EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
3501
+ EIGEN_ALIGN16 double x[2];
3502
+ pstore<double>(x, a);
3503
+ return x[0];
2489
3504
  }
2490
- template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
2491
3505
 
2492
- // VSX support varies between different compilers and even different
2493
- // versions of the same compiler. For gcc version >= 4.9.3, we can use
2494
- // vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use
2495
- // a slow version that works with older compilers.
2496
- // Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
2497
- // are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
2498
- template<>
2499
- inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x) {
2500
- #if EIGEN_GNUC_AT_LEAST(5, 4) || \
2501
- (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
2502
- return vec_cts(x, 0); // TODO: check clang version.
3506
+ template <>
3507
+ EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
3508
+ return vec_sld(a, a, 8);
3509
+ }
3510
+ template <>
3511
+ EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
3512
+ return vec_abs(a);
3513
+ }
3514
+ #ifdef __POWER8_VECTOR__
3515
+ template <>
3516
+ EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
3517
+ return (Packet2d)vec_sra((Packet2l)a, vec_splats((unsigned long long)(63)));
3518
+ }
2503
3519
  #else
2504
- double tmp[2];
2505
- memcpy(tmp, &x, sizeof(tmp));
2506
- Packet2l l = { static_cast<long long>(tmp[0]),
2507
- static_cast<long long>(tmp[1]) };
2508
- return l;
3520
+ #ifdef _BIG_ENDIAN
3521
+ static Packet16uc p16uc_DUPSIGN = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
3522
+ #else
3523
+ static Packet16uc p16uc_DUPSIGN = {7, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15};
2509
3524
  #endif
2510
- }
2511
3525
 
2512
- template<>
2513
- inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x) {
2514
- unsigned long long tmp[2];
2515
- memcpy(tmp, &x, sizeof(tmp));
2516
- Packet2d d = { static_cast<double>(tmp[0]),
2517
- static_cast<double>(tmp[1]) };
2518
- return d;
3526
+ template <>
3527
+ EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
3528
+ Packet16c tmp = vec_sra(reinterpret_cast<Packet16c>(a), vec_splats((unsigned char)(7)));
3529
+ return reinterpret_cast<Packet2d>(vec_perm(tmp, tmp, p16uc_DUPSIGN));
2519
3530
  }
3531
+ #endif
3532
+
3533
+ template <>
3534
+ inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x);
2520
3535
 
3536
+ template <>
3537
+ inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x);
2521
3538
 
2522
3539
  // Packet2l shifts.
2523
- // For POWER8 we simply use vec_sr/l.
3540
+ // For POWER8 we simply use vec_sr/l.
2524
3541
  //
2525
3542
  // Things are more complicated for POWER7. There is actually a
2526
3543
  // vec_xxsxdi intrinsic but it is not supported by some gcc versions.
2527
3544
  // So we need to shift by N % 32 and rearrange bytes.
2528
3545
  #ifdef __POWER8_VECTOR__
2529
3546
 
2530
- template<int N>
3547
+ template <int N>
2531
3548
  EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
2532
- const Packet2ul shift = { N, N };
2533
- return vec_sl(a, shift);
3549
+ const Packet2ul shift = {N, N};
3550
+ return vec_sl(a, shift);
2534
3551
  }
2535
3552
 
2536
- template<int N>
3553
+ template <int N>
2537
3554
  EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
2538
- const Packet2ul shift = { N, N };
2539
- return vec_sr(a, shift);
3555
+ const Packet2ul shift = {N, N};
3556
+ return vec_sr(a, shift);
2540
3557
  }
2541
3558
 
2542
3559
  #else
@@ -2544,34 +3561,32 @@ EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
2544
3561
  // Shifts [A, B, C, D] to [B, 0, D, 0].
2545
3562
  // Used to implement left shifts for Packet2l.
2546
3563
  EIGEN_ALWAYS_INLINE Packet4i shift_even_left(const Packet4i& a) {
2547
- static const Packet16uc perm = {
2548
- 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
2549
- 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
2550
- #ifdef _BIG_ENDIAN
2551
- return vec_perm(p4i_ZERO, a, perm);
2552
- #else
2553
- return vec_perm(a, p4i_ZERO, perm);
2554
- #endif
3564
+ static const Packet16uc perm = {0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
3565
+ 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b};
3566
+ #ifdef _BIG_ENDIAN
3567
+ return vec_perm(p4i_ZERO, a, perm);
3568
+ #else
3569
+ return vec_perm(a, p4i_ZERO, perm);
3570
+ #endif
2555
3571
  }
2556
3572
 
2557
3573
  // Shifts [A, B, C, D] to [0, A, 0, C].
2558
3574
  // Used to implement right shifts for Packet2l.
2559
3575
  EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(const Packet4i& a) {
2560
- static const Packet16uc perm = {
2561
- 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
2562
- 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b };
2563
- #ifdef _BIG_ENDIAN
2564
- return vec_perm(p4i_ZERO, a, perm);
2565
- #else
2566
- return vec_perm(a, p4i_ZERO, perm);
2567
- #endif
3576
+ static const Packet16uc perm = {0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
3577
+ 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b};
3578
+ #ifdef _BIG_ENDIAN
3579
+ return vec_perm(p4i_ZERO, a, perm);
3580
+ #else
3581
+ return vec_perm(a, p4i_ZERO, perm);
3582
+ #endif
2568
3583
  }
2569
3584
 
2570
- template<int N, typename EnableIf = void>
3585
+ template <int N, typename EnableIf = void>
2571
3586
  struct plogical_shift_left_impl;
2572
3587
 
2573
- template<int N>
2574
- struct plogical_shift_left_impl<N, typename enable_if<(N < 32) && (N >= 0)>::type> {
3588
+ template <int N>
3589
+ struct plogical_shift_left_impl<N, std::enable_if_t<(N < 32) && (N >= 0)> > {
2575
3590
  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
2576
3591
  static const unsigned n = static_cast<unsigned>(N);
2577
3592
  const Packet4ui shift = {n, n, n, n};
@@ -2584,8 +3599,8 @@ struct plogical_shift_left_impl<N, typename enable_if<(N < 32) && (N >= 0)>::typ
2584
3599
  }
2585
3600
  };
2586
3601
 
2587
- template<int N>
2588
- struct plogical_shift_left_impl<N, typename enable_if<(N >= 32)>::type> {
3602
+ template <int N>
3603
+ struct plogical_shift_left_impl<N, std::enable_if_t<(N >= 32)> > {
2589
3604
  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
2590
3605
  static const unsigned m = static_cast<unsigned>(N - 32);
2591
3606
  const Packet4ui shift = {m, m, m, m};
@@ -2594,16 +3609,16 @@ struct plogical_shift_left_impl<N, typename enable_if<(N >= 32)>::type> {
2594
3609
  }
2595
3610
  };
2596
3611
 
2597
- template<int N>
3612
+ template <int N>
2598
3613
  EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
2599
- return plogical_shift_left_impl<N>::run(a);
3614
+ return plogical_shift_left_impl<N>::run(a);
2600
3615
  }
2601
3616
 
2602
- template<int N, typename EnableIf = void>
3617
+ template <int N, typename EnableIf = void>
2603
3618
  struct plogical_shift_right_impl;
2604
3619
 
2605
- template<int N>
2606
- struct plogical_shift_right_impl<N, typename enable_if<(N < 32) && (N >= 0)>::type> {
3620
+ template <int N>
3621
+ struct plogical_shift_right_impl<N, std::enable_if_t<(N < 32) && (N >= 0)> > {
2607
3622
  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
2608
3623
  static const unsigned n = static_cast<unsigned>(N);
2609
3624
  const Packet4ui shift = {n, n, n, n};
@@ -2616,8 +3631,8 @@ struct plogical_shift_right_impl<N, typename enable_if<(N < 32) && (N >= 0)>::ty
2616
3631
  }
2617
3632
  };
2618
3633
 
2619
- template<int N>
2620
- struct plogical_shift_right_impl<N, typename enable_if<(N >= 32)>::type> {
3634
+ template <int N>
3635
+ struct plogical_shift_right_impl<N, std::enable_if_t<(N >= 32)> > {
2621
3636
  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
2622
3637
  static const unsigned m = static_cast<unsigned>(N - 32);
2623
3638
  const Packet4ui shift = {m, m, m, m};
@@ -2626,86 +3641,89 @@ struct plogical_shift_right_impl<N, typename enable_if<(N >= 32)>::type> {
2626
3641
  }
2627
3642
  };
2628
3643
 
2629
- template<int N>
3644
+ template <int N>
2630
3645
  EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
2631
- return plogical_shift_right_impl<N>::run(a);
3646
+ return plogical_shift_right_impl<N>::run(a);
2632
3647
  }
2633
3648
  #endif
2634
3649
 
2635
- template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
3650
+ template <>
3651
+ EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
2636
3652
  // Clamp exponent to [-2099, 2099]
2637
3653
  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
2638
3654
  const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
2639
3655
 
2640
3656
  // Split 2^e into four factors and multiply:
2641
- const Packet2l bias = { 1023, 1023 };
3657
+ const Packet2l bias = {1023, 1023};
2642
3658
  Packet2l b = plogical_shift_right<2>(e); // floor(e/4)
2643
3659
  Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
2644
- Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
2645
- b = psub(psub(psub(e, b), b), b); // e - 3b
2646
- c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); // 2^(e - 3b)
2647
- out = pmul(out, c); // a * 2^e
3660
+ Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
3661
+ b = psub(psub(psub(e, b), b), b); // e - 3b
3662
+ c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); // 2^(e - 3b)
3663
+ out = pmul(out, c); // a * 2^e
2648
3664
  return out;
2649
3665
  }
2650
3666
 
2651
-
2652
3667
  // Extract exponent without existence of Packet2l.
2653
- template<>
2654
- EIGEN_STRONG_INLINE
2655
- Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
3668
+ template <>
3669
+ EIGEN_STRONG_INLINE Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
2656
3670
  return pcast<Packet2l, Packet2d>(plogical_shift_right<52>(reinterpret_cast<Packet2l>(pabs(a))));
2657
3671
  }
2658
3672
 
2659
- template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d> (const Packet2d& a, Packet2d& exponent) {
3673
+ template <>
3674
+ EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
2660
3675
  return pfrexp_generic(a, exponent);
2661
3676
  }
2662
3677
 
2663
- template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
2664
- {
3678
+ template <>
3679
+ EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
2665
3680
  Packet2d b, sum;
2666
- b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
3681
+ b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
2667
3682
  sum = a + b;
2668
3683
  return pfirst<Packet2d>(sum);
2669
3684
  }
2670
3685
 
2671
3686
  // Other reduction functions:
2672
3687
  // mul
2673
- template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
2674
- {
2675
- return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
3688
+ template <>
3689
+ EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
3690
+ return pfirst(
3691
+ pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
2676
3692
  }
2677
3693
 
2678
3694
  // min
2679
- template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
2680
- {
2681
- return pfirst(pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
3695
+ template <>
3696
+ EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
3697
+ return pfirst(
3698
+ pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
2682
3699
  }
2683
3700
 
2684
3701
  // max
2685
- template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
2686
- {
2687
- return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
3702
+ template <>
3703
+ EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
3704
+ return pfirst(
3705
+ pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
2688
3706
  }
2689
3707
 
2690
- EIGEN_DEVICE_FUNC inline void
2691
- ptranspose(PacketBlock<Packet2d,2>& kernel) {
3708
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
2692
3709
  Packet2d t0, t1;
2693
- t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
2694
- t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
3710
+ t0 = vec_mergeh(kernel.packet[0], kernel.packet[1]);
3711
+ t1 = vec_mergel(kernel.packet[0], kernel.packet[1]);
2695
3712
  kernel.packet[0] = t0;
2696
3713
  kernel.packet[1] = t1;
2697
3714
  }
2698
3715
 
2699
- template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
2700
- Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
2701
- Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
3716
+ template <>
3717
+ EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
3718
+ const Packet2d& elsePacket) {
3719
+ Packet2l select = {ifPacket.select[0], ifPacket.select[1]};
3720
+ Packet2ul mask = reinterpret_cast<Packet2ul>(pnegate(reinterpret_cast<Packet2l>(select)));
2702
3721
  return vec_sel(elsePacket, thenPacket, mask);
2703
3722
  }
2704
3723
 
3724
+ #endif // __VSX__
3725
+ } // end namespace internal
2705
3726
 
2706
- #endif // __VSX__
2707
- } // end namespace internal
2708
-
2709
- } // end namespace Eigen
3727
+ } // end namespace Eigen
2710
3728
 
2711
- #endif // EIGEN_PACKET_MATH_ALTIVEC_H
3729
+ #endif // EIGEN_PACKET_MATH_ALTIVEC_H