tomoto 0.2.3 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (347) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/README.md +8 -10
  4. data/ext/tomoto/extconf.rb +6 -2
  5. data/ext/tomoto/{ext.cpp → tomoto.cpp} +1 -1
  6. data/lib/tomoto/version.rb +1 -1
  7. data/lib/tomoto.rb +5 -1
  8. data/vendor/EigenRand/EigenRand/Core.h +10 -10
  9. data/vendor/EigenRand/EigenRand/Dists/Basic.h +208 -9
  10. data/vendor/EigenRand/EigenRand/Dists/Discrete.h +52 -31
  11. data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +9 -8
  12. data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +28 -21
  13. data/vendor/EigenRand/EigenRand/EigenRand +11 -6
  14. data/vendor/EigenRand/EigenRand/Macro.h +13 -7
  15. data/vendor/EigenRand/EigenRand/MorePacketMath.h +348 -740
  16. data/vendor/EigenRand/EigenRand/MvDists/Multinomial.h +5 -3
  17. data/vendor/EigenRand/EigenRand/MvDists/MvNormal.h +9 -3
  18. data/vendor/EigenRand/EigenRand/PacketFilter.h +11 -253
  19. data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +21 -47
  20. data/vendor/EigenRand/EigenRand/RandUtils.h +50 -344
  21. data/vendor/EigenRand/EigenRand/arch/AVX/MorePacketMath.h +619 -0
  22. data/vendor/EigenRand/EigenRand/arch/AVX/PacketFilter.h +149 -0
  23. data/vendor/EigenRand/EigenRand/arch/AVX/RandUtils.h +228 -0
  24. data/vendor/EigenRand/EigenRand/arch/NEON/MorePacketMath.h +473 -0
  25. data/vendor/EigenRand/EigenRand/arch/NEON/PacketFilter.h +142 -0
  26. data/vendor/EigenRand/EigenRand/arch/NEON/RandUtils.h +126 -0
  27. data/vendor/EigenRand/EigenRand/arch/SSE/MorePacketMath.h +501 -0
  28. data/vendor/EigenRand/EigenRand/arch/SSE/PacketFilter.h +133 -0
  29. data/vendor/EigenRand/EigenRand/arch/SSE/RandUtils.h +120 -0
  30. data/vendor/EigenRand/EigenRand/doc.h +24 -12
  31. data/vendor/EigenRand/README.md +57 -4
  32. data/vendor/eigen/COPYING.APACHE +203 -0
  33. data/vendor/eigen/COPYING.BSD +1 -1
  34. data/vendor/eigen/COPYING.MINPACK +51 -52
  35. data/vendor/eigen/Eigen/Cholesky +0 -1
  36. data/vendor/eigen/Eigen/Core +112 -265
  37. data/vendor/eigen/Eigen/Eigenvalues +2 -3
  38. data/vendor/eigen/Eigen/Geometry +5 -8
  39. data/vendor/eigen/Eigen/Householder +0 -1
  40. data/vendor/eigen/Eigen/Jacobi +0 -1
  41. data/vendor/eigen/Eigen/KLUSupport +41 -0
  42. data/vendor/eigen/Eigen/LU +2 -5
  43. data/vendor/eigen/Eigen/OrderingMethods +0 -3
  44. data/vendor/eigen/Eigen/PaStiXSupport +1 -0
  45. data/vendor/eigen/Eigen/PardisoSupport +0 -0
  46. data/vendor/eigen/Eigen/QR +2 -3
  47. data/vendor/eigen/Eigen/QtAlignedMalloc +0 -1
  48. data/vendor/eigen/Eigen/SVD +0 -1
  49. data/vendor/eigen/Eigen/Sparse +0 -2
  50. data/vendor/eigen/Eigen/SparseCholesky +0 -8
  51. data/vendor/eigen/Eigen/SparseLU +4 -0
  52. data/vendor/eigen/Eigen/SparseQR +0 -1
  53. data/vendor/eigen/Eigen/src/Cholesky/LDLT.h +42 -27
  54. data/vendor/eigen/Eigen/src/Cholesky/LLT.h +39 -23
  55. data/vendor/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +90 -47
  56. data/vendor/eigen/Eigen/src/Core/ArithmeticSequence.h +413 -0
  57. data/vendor/eigen/Eigen/src/Core/Array.h +99 -11
  58. data/vendor/eigen/Eigen/src/Core/ArrayBase.h +3 -3
  59. data/vendor/eigen/Eigen/src/Core/ArrayWrapper.h +21 -21
  60. data/vendor/eigen/Eigen/src/Core/Assign.h +1 -1
  61. data/vendor/eigen/Eigen/src/Core/AssignEvaluator.h +125 -50
  62. data/vendor/eigen/Eigen/src/Core/Assign_MKL.h +10 -10
  63. data/vendor/eigen/Eigen/src/Core/BandMatrix.h +16 -16
  64. data/vendor/eigen/Eigen/src/Core/Block.h +56 -60
  65. data/vendor/eigen/Eigen/src/Core/BooleanRedux.h +29 -31
  66. data/vendor/eigen/Eigen/src/Core/CommaInitializer.h +7 -3
  67. data/vendor/eigen/Eigen/src/Core/CoreEvaluators.h +325 -272
  68. data/vendor/eigen/Eigen/src/Core/CoreIterators.h +5 -0
  69. data/vendor/eigen/Eigen/src/Core/CwiseBinaryOp.h +21 -22
  70. data/vendor/eigen/Eigen/src/Core/CwiseNullaryOp.h +153 -18
  71. data/vendor/eigen/Eigen/src/Core/CwiseUnaryOp.h +6 -6
  72. data/vendor/eigen/Eigen/src/Core/CwiseUnaryView.h +14 -10
  73. data/vendor/eigen/Eigen/src/Core/DenseBase.h +132 -42
  74. data/vendor/eigen/Eigen/src/Core/DenseCoeffsBase.h +25 -21
  75. data/vendor/eigen/Eigen/src/Core/DenseStorage.h +153 -71
  76. data/vendor/eigen/Eigen/src/Core/Diagonal.h +21 -23
  77. data/vendor/eigen/Eigen/src/Core/DiagonalMatrix.h +50 -2
  78. data/vendor/eigen/Eigen/src/Core/DiagonalProduct.h +1 -1
  79. data/vendor/eigen/Eigen/src/Core/Dot.h +10 -10
  80. data/vendor/eigen/Eigen/src/Core/EigenBase.h +10 -9
  81. data/vendor/eigen/Eigen/src/Core/ForceAlignedAccess.h +8 -4
  82. data/vendor/eigen/Eigen/src/Core/Fuzzy.h +3 -3
  83. data/vendor/eigen/Eigen/src/Core/GeneralProduct.h +20 -10
  84. data/vendor/eigen/Eigen/src/Core/GenericPacketMath.h +599 -152
  85. data/vendor/eigen/Eigen/src/Core/GlobalFunctions.h +40 -33
  86. data/vendor/eigen/Eigen/src/Core/IO.h +40 -7
  87. data/vendor/eigen/Eigen/src/Core/IndexedView.h +237 -0
  88. data/vendor/eigen/Eigen/src/Core/Inverse.h +9 -10
  89. data/vendor/eigen/Eigen/src/Core/Map.h +7 -7
  90. data/vendor/eigen/Eigen/src/Core/MapBase.h +10 -3
  91. data/vendor/eigen/Eigen/src/Core/MathFunctions.h +767 -125
  92. data/vendor/eigen/Eigen/src/Core/MathFunctionsImpl.h +118 -19
  93. data/vendor/eigen/Eigen/src/Core/Matrix.h +131 -25
  94. data/vendor/eigen/Eigen/src/Core/MatrixBase.h +21 -3
  95. data/vendor/eigen/Eigen/src/Core/NestByValue.h +25 -50
  96. data/vendor/eigen/Eigen/src/Core/NoAlias.h +4 -3
  97. data/vendor/eigen/Eigen/src/Core/NumTraits.h +107 -20
  98. data/vendor/eigen/Eigen/src/Core/PartialReduxEvaluator.h +232 -0
  99. data/vendor/eigen/Eigen/src/Core/PermutationMatrix.h +3 -31
  100. data/vendor/eigen/Eigen/src/Core/PlainObjectBase.h +152 -59
  101. data/vendor/eigen/Eigen/src/Core/Product.h +30 -25
  102. data/vendor/eigen/Eigen/src/Core/ProductEvaluators.h +192 -125
  103. data/vendor/eigen/Eigen/src/Core/Random.h +37 -1
  104. data/vendor/eigen/Eigen/src/Core/Redux.h +180 -170
  105. data/vendor/eigen/Eigen/src/Core/Ref.h +121 -23
  106. data/vendor/eigen/Eigen/src/Core/Replicate.h +8 -8
  107. data/vendor/eigen/Eigen/src/Core/Reshaped.h +454 -0
  108. data/vendor/eigen/Eigen/src/Core/ReturnByValue.h +7 -5
  109. data/vendor/eigen/Eigen/src/Core/Reverse.h +18 -12
  110. data/vendor/eigen/Eigen/src/Core/Select.h +8 -6
  111. data/vendor/eigen/Eigen/src/Core/SelfAdjointView.h +33 -20
  112. data/vendor/eigen/Eigen/src/Core/Solve.h +14 -14
  113. data/vendor/eigen/Eigen/src/Core/SolveTriangular.h +16 -16
  114. data/vendor/eigen/Eigen/src/Core/SolverBase.h +41 -3
  115. data/vendor/eigen/Eigen/src/Core/StableNorm.h +100 -70
  116. data/vendor/eigen/Eigen/src/Core/StlIterators.h +463 -0
  117. data/vendor/eigen/Eigen/src/Core/Stride.h +9 -4
  118. data/vendor/eigen/Eigen/src/Core/Swap.h +5 -4
  119. data/vendor/eigen/Eigen/src/Core/Transpose.h +88 -27
  120. data/vendor/eigen/Eigen/src/Core/Transpositions.h +26 -47
  121. data/vendor/eigen/Eigen/src/Core/TriangularMatrix.h +93 -75
  122. data/vendor/eigen/Eigen/src/Core/VectorBlock.h +5 -5
  123. data/vendor/eigen/Eigen/src/Core/VectorwiseOp.h +159 -70
  124. data/vendor/eigen/Eigen/src/Core/Visitor.h +137 -29
  125. data/vendor/eigen/Eigen/src/Core/arch/AVX/Complex.h +50 -129
  126. data/vendor/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +126 -337
  127. data/vendor/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +1092 -155
  128. data/vendor/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +65 -1
  129. data/vendor/eigen/Eigen/src/Core/arch/AVX512/Complex.h +422 -0
  130. data/vendor/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +207 -236
  131. data/vendor/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1482 -495
  132. data/vendor/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +89 -0
  133. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +152 -165
  134. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +19 -251
  135. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2937 -0
  136. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +221 -0
  137. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +629 -0
  138. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2042 -392
  139. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Complex.h +235 -80
  140. data/vendor/eigen/Eigen/src/Core/arch/Default/BFloat16.h +700 -0
  141. data/vendor/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +102 -14
  142. data/vendor/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1649 -0
  143. data/vendor/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +110 -0
  144. data/vendor/eigen/Eigen/src/Core/arch/Default/Half.h +942 -0
  145. data/vendor/eigen/Eigen/src/Core/arch/Default/Settings.h +1 -1
  146. data/vendor/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +120 -0
  147. data/vendor/eigen/Eigen/src/Core/arch/{CUDA → GPU}/MathFunctions.h +16 -4
  148. data/vendor/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1685 -0
  149. data/vendor/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +80 -0
  150. data/vendor/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  151. data/vendor/eigen/Eigen/src/Core/arch/MSA/Complex.h +648 -0
  152. data/vendor/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +387 -0
  153. data/vendor/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1233 -0
  154. data/vendor/eigen/Eigen/src/Core/arch/NEON/Complex.h +313 -219
  155. data/vendor/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +183 -0
  156. data/vendor/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +54 -70
  157. data/vendor/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4376 -549
  158. data/vendor/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1419 -0
  159. data/vendor/eigen/Eigen/src/Core/arch/SSE/Complex.h +59 -179
  160. data/vendor/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +65 -428
  161. data/vendor/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +893 -283
  162. data/vendor/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +65 -0
  163. data/vendor/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +44 -0
  164. data/vendor/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +752 -0
  165. data/vendor/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +49 -0
  166. data/vendor/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +232 -0
  167. data/vendor/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +301 -0
  168. data/vendor/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +670 -0
  169. data/vendor/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +694 -0
  170. data/vendor/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +85 -0
  171. data/vendor/eigen/Eigen/src/Core/arch/ZVector/Complex.h +212 -183
  172. data/vendor/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +101 -5
  173. data/vendor/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +510 -395
  174. data/vendor/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +11 -2
  175. data/vendor/eigen/Eigen/src/Core/functors/BinaryFunctors.h +112 -46
  176. data/vendor/eigen/Eigen/src/Core/functors/NullaryFunctors.h +31 -30
  177. data/vendor/eigen/Eigen/src/Core/functors/StlFunctors.h +32 -2
  178. data/vendor/eigen/Eigen/src/Core/functors/UnaryFunctors.h +355 -16
  179. data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1075 -586
  180. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +49 -24
  181. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +41 -35
  182. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +6 -6
  183. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +4 -2
  184. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +382 -483
  185. data/vendor/eigen/Eigen/src/Core/products/Parallelizer.h +22 -5
  186. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +53 -30
  187. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +16 -8
  188. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +8 -6
  189. data/vendor/eigen/Eigen/src/Core/products/SelfadjointProduct.h +4 -4
  190. data/vendor/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +5 -4
  191. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +33 -27
  192. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +14 -12
  193. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +36 -34
  194. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +8 -4
  195. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverVector.h +13 -10
  196. data/vendor/eigen/Eigen/src/Core/util/BlasUtil.h +304 -119
  197. data/vendor/eigen/Eigen/src/Core/util/ConfigureVectorization.h +512 -0
  198. data/vendor/eigen/Eigen/src/Core/util/Constants.h +25 -9
  199. data/vendor/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +26 -3
  200. data/vendor/eigen/Eigen/src/Core/util/ForwardDeclarations.h +29 -9
  201. data/vendor/eigen/Eigen/src/Core/util/IndexedViewHelper.h +186 -0
  202. data/vendor/eigen/Eigen/src/Core/util/IntegralConstant.h +272 -0
  203. data/vendor/eigen/Eigen/src/Core/util/MKL_support.h +8 -1
  204. data/vendor/eigen/Eigen/src/Core/util/Macros.h +709 -246
  205. data/vendor/eigen/Eigen/src/Core/util/Memory.h +222 -52
  206. data/vendor/eigen/Eigen/src/Core/util/Meta.h +355 -77
  207. data/vendor/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +5 -1
  208. data/vendor/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  209. data/vendor/eigen/Eigen/src/Core/util/StaticAssert.h +8 -5
  210. data/vendor/eigen/Eigen/src/Core/util/SymbolicIndex.h +293 -0
  211. data/vendor/eigen/Eigen/src/Core/util/XprHelper.h +65 -30
  212. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +1 -1
  213. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +7 -4
  214. data/vendor/eigen/Eigen/src/Eigenvalues/EigenSolver.h +2 -2
  215. data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +1 -1
  216. data/vendor/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +2 -2
  217. data/vendor/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +2 -2
  218. data/vendor/eigen/Eigen/src/Eigenvalues/RealQZ.h +9 -6
  219. data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur.h +21 -9
  220. data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +77 -43
  221. data/vendor/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +20 -15
  222. data/vendor/eigen/Eigen/src/Geometry/AlignedBox.h +99 -5
  223. data/vendor/eigen/Eigen/src/Geometry/AngleAxis.h +4 -4
  224. data/vendor/eigen/Eigen/src/Geometry/EulerAngles.h +3 -3
  225. data/vendor/eigen/Eigen/src/Geometry/Homogeneous.h +15 -11
  226. data/vendor/eigen/Eigen/src/Geometry/Hyperplane.h +1 -1
  227. data/vendor/eigen/Eigen/src/Geometry/OrthoMethods.h +3 -2
  228. data/vendor/eigen/Eigen/src/Geometry/ParametrizedLine.h +39 -2
  229. data/vendor/eigen/Eigen/src/Geometry/Quaternion.h +70 -14
  230. data/vendor/eigen/Eigen/src/Geometry/Rotation2D.h +3 -3
  231. data/vendor/eigen/Eigen/src/Geometry/Scaling.h +23 -5
  232. data/vendor/eigen/Eigen/src/Geometry/Transform.h +88 -67
  233. data/vendor/eigen/Eigen/src/Geometry/Translation.h +6 -12
  234. data/vendor/eigen/Eigen/src/Geometry/Umeyama.h +1 -1
  235. data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +168 -0
  236. data/vendor/eigen/Eigen/src/Householder/BlockHouseholder.h +9 -2
  237. data/vendor/eigen/Eigen/src/Householder/Householder.h +8 -4
  238. data/vendor/eigen/Eigen/src/Householder/HouseholderSequence.h +123 -48
  239. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +15 -15
  240. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +7 -23
  241. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +5 -22
  242. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +41 -47
  243. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +51 -60
  244. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +70 -20
  245. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +2 -20
  246. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +11 -9
  247. data/vendor/eigen/Eigen/src/Jacobi/Jacobi.h +31 -10
  248. data/vendor/eigen/Eigen/src/KLUSupport/KLUSupport.h +358 -0
  249. data/vendor/eigen/Eigen/src/LU/Determinant.h +35 -19
  250. data/vendor/eigen/Eigen/src/LU/FullPivLU.h +29 -43
  251. data/vendor/eigen/Eigen/src/LU/InverseImpl.h +25 -8
  252. data/vendor/eigen/Eigen/src/LU/PartialPivLU.h +71 -58
  253. data/vendor/eigen/Eigen/src/LU/arch/InverseSize4.h +351 -0
  254. data/vendor/eigen/Eigen/src/OrderingMethods/Amd.h +7 -17
  255. data/vendor/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +297 -277
  256. data/vendor/eigen/Eigen/src/OrderingMethods/Ordering.h +6 -10
  257. data/vendor/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +1 -1
  258. data/vendor/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +9 -7
  259. data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR.h +41 -20
  260. data/vendor/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +100 -27
  261. data/vendor/eigen/Eigen/src/QR/FullPivHouseholderQR.h +59 -22
  262. data/vendor/eigen/Eigen/src/QR/HouseholderQR.h +48 -23
  263. data/vendor/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +25 -3
  264. data/vendor/eigen/Eigen/src/SVD/BDCSVD.h +183 -63
  265. data/vendor/eigen/Eigen/src/SVD/JacobiSVD.h +22 -14
  266. data/vendor/eigen/Eigen/src/SVD/SVDBase.h +83 -22
  267. data/vendor/eigen/Eigen/src/SVD/UpperBidiagonalization.h +3 -3
  268. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +17 -9
  269. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +12 -37
  270. data/vendor/eigen/Eigen/src/SparseCore/AmbiVector.h +3 -2
  271. data/vendor/eigen/Eigen/src/SparseCore/CompressedStorage.h +16 -0
  272. data/vendor/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +6 -6
  273. data/vendor/eigen/Eigen/src/SparseCore/SparseAssign.h +81 -27
  274. data/vendor/eigen/Eigen/src/SparseCore/SparseBlock.h +25 -57
  275. data/vendor/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +40 -11
  276. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +11 -15
  277. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +4 -2
  278. data/vendor/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +30 -8
  279. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrix.h +126 -11
  280. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +5 -12
  281. data/vendor/eigen/Eigen/src/SparseCore/SparseProduct.h +13 -1
  282. data/vendor/eigen/Eigen/src/SparseCore/SparseRef.h +7 -7
  283. data/vendor/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +5 -2
  284. data/vendor/eigen/Eigen/src/SparseCore/SparseUtil.h +8 -0
  285. data/vendor/eigen/Eigen/src/SparseCore/SparseVector.h +1 -1
  286. data/vendor/eigen/Eigen/src/SparseCore/SparseView.h +1 -0
  287. data/vendor/eigen/Eigen/src/SparseLU/SparseLU.h +162 -12
  288. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +1 -1
  289. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +76 -2
  290. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +2 -2
  291. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +1 -1
  292. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +1 -1
  293. data/vendor/eigen/Eigen/src/SparseQR/SparseQR.h +19 -6
  294. data/vendor/eigen/Eigen/src/StlSupport/StdDeque.h +2 -12
  295. data/vendor/eigen/Eigen/src/StlSupport/StdList.h +2 -2
  296. data/vendor/eigen/Eigen/src/StlSupport/StdVector.h +2 -2
  297. data/vendor/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +6 -8
  298. data/vendor/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +175 -39
  299. data/vendor/eigen/Eigen/src/misc/lapacke.h +5 -4
  300. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +28 -2
  301. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +155 -11
  302. data/vendor/eigen/Eigen/src/plugins/BlockMethods.h +626 -242
  303. data/vendor/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +14 -0
  304. data/vendor/eigen/Eigen/src/plugins/IndexedViewMethods.h +262 -0
  305. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +4 -4
  306. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +10 -0
  307. data/vendor/eigen/Eigen/src/plugins/ReshapedMethods.h +149 -0
  308. data/vendor/eigen/README.md +2 -0
  309. data/vendor/eigen/bench/btl/README +1 -1
  310. data/vendor/eigen/bench/tensors/README +6 -7
  311. data/vendor/eigen/ci/README.md +56 -0
  312. data/vendor/eigen/demos/mix_eigen_and_c/README +1 -1
  313. data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +213 -158
  314. data/vendor/eigen/unsupported/README.txt +1 -1
  315. data/vendor/tomotopy/README.kr.rst +21 -0
  316. data/vendor/tomotopy/README.rst +20 -0
  317. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +2 -2
  318. data/vendor/tomotopy/src/Labeling/Phraser.hpp +1 -1
  319. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +2 -1
  320. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +2 -1
  321. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +1 -1
  322. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +2 -2
  323. data/vendor/tomotopy/src/TopicModel/HDP.h +1 -0
  324. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +53 -2
  325. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +1 -1
  326. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +1 -0
  327. data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +2 -2
  328. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +16 -5
  329. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +1 -0
  330. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +1 -0
  331. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +1 -0
  332. data/vendor/tomotopy/src/TopicModel/PT.h +3 -1
  333. data/vendor/tomotopy/src/TopicModel/PTModel.hpp +31 -1
  334. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +2 -2
  335. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +7 -5
  336. data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +36 -1
  337. data/vendor/tomotopy/src/Utils/exception.h +6 -0
  338. data/vendor/tomotopy/src/Utils/sample.hpp +14 -12
  339. data/vendor/tomotopy/src/Utils/sse_gamma.h +0 -3
  340. metadata +60 -14
  341. data/vendor/eigen/Eigen/CMakeLists.txt +0 -19
  342. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -674
  343. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
  344. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
  345. data/vendor/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
  346. data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
  347. data/vendor/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
@@ -22,31 +22,38 @@ namespace internal {
22
22
  #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
23
23
  #endif
24
24
 
25
- #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
26
- #define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
27
- #endif
28
-
29
25
  // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
30
26
  #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
31
27
  #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
32
28
  #endif
33
29
 
34
- typedef __vector float Packet4f;
35
- typedef __vector int Packet4i;
36
- typedef __vector unsigned int Packet4ui;
37
- typedef __vector __bool int Packet4bi;
38
- typedef __vector short int Packet8i;
39
- typedef __vector unsigned char Packet16uc;
30
+ typedef __vector float Packet4f;
31
+ typedef __vector int Packet4i;
32
+ typedef __vector unsigned int Packet4ui;
33
+ typedef __vector __bool int Packet4bi;
34
+ typedef __vector short int Packet8s;
35
+ typedef __vector unsigned short int Packet8us;
36
+ typedef __vector signed char Packet16c;
37
+ typedef __vector unsigned char Packet16uc;
38
+ typedef eigen_packet_wrapper<__vector unsigned short int,0> Packet8bf;
40
39
 
41
40
  // We don't want to write the same code all the time, but we need to reuse the constants
42
41
  // and it doesn't really work to declare them global, so we define macros instead
43
-
44
42
  #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
45
- Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
43
+ Packet4f p4f_##NAME = {X, X, X, X}
46
44
 
47
45
  #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
48
46
  Packet4i p4i_##NAME = vec_splat_s32(X)
49
47
 
48
+ #define _EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME,X) \
49
+ Packet4ui p4ui_##NAME = {X, X, X, X}
50
+
51
+ #define _EIGEN_DECLARE_CONST_FAST_Packet8us(NAME,X) \
52
+ Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}
53
+
54
+ #define _EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME,X) \
55
+ Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}
56
+
50
57
  #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
51
58
  Packet4f p4f_##NAME = pset1<Packet4f>(X)
52
59
 
@@ -64,7 +71,7 @@ typedef __vector unsigned char Packet16uc;
64
71
 
65
72
  #define DST_CHAN 1
66
73
  #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
67
-
74
+ #define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type
68
75
 
69
76
  // These constants are endian-agnostic
70
77
  static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
@@ -72,25 +79,36 @@ static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
72
79
  static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
73
80
  static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
74
81
  static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
82
+ static _EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u);
83
+ static _EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
84
+ static _EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1}
85
+ static _EIGEN_DECLARE_CONST_FAST_Packet16uc(ONE,1);
75
86
  static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
76
87
  #ifndef __VSX__
77
88
  static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
78
89
  #endif
79
90
 
80
- static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
81
- static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
91
+ static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
92
+ static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
93
+ static Packet8s p8s_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 };
94
+ static Packet8us p8us_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 };
95
+
96
+ static Packet16c p16c_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7,
97
+ 8, 9, 10, 11, 12, 13, 14, 15};
98
+ static Packet16uc p16uc_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7,
99
+ 8, 9, 10, 11, 12, 13, 14, 15};
82
100
 
83
101
  static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
84
- static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
102
+ static Packet16uc p16uc_REVERSE16 = { 14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1 };
103
+ static Packet16uc p16uc_REVERSE8 = { 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 };
85
104
 
86
- // Mask alignment
87
- #ifdef __PPC64__
88
- #define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
89
- #else
90
- #define _EIGEN_MASK_ALIGNMENT 0xfffffff0
91
- #endif
105
+ static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
106
+ static Packet16uc p16uc_DUPLICATE16_HI = { 0,1,0,1, 2,3,2,3, 4,5,4,5, 6,7,6,7 };
107
+ static Packet16uc p16uc_DUPLICATE8_HI = { 0,0, 1,1, 2,2, 3,3, 4,4, 5,5, 6,6, 7,7 };
108
+ static const Packet16uc p16uc_DUPLICATE16_EVEN= { 0,1 ,0,1, 4,5, 4,5, 8,9, 8,9, 12,13, 12,13 };
109
+ static const Packet16uc p16uc_DUPLICATE16_ODD = { 2,3 ,2,3, 6,7, 6,7, 10,11, 10,11, 14,15, 14,15 };
92
110
 
93
- #define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
111
+ static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3 };
94
112
 
95
113
  // Handle endianness properly while loading constants
96
114
  // Define global static constants:
@@ -129,27 +147,27 @@ static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_L
129
147
  #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
130
148
  #endif
131
149
 
132
- template<> struct packet_traits<float> : default_packet_traits
133
- {
150
+ template <>
151
+ struct packet_traits<float> : default_packet_traits {
134
152
  typedef Packet4f type;
135
153
  typedef Packet4f half;
136
154
  enum {
137
155
  Vectorizable = 1,
138
156
  AlignedOnScalar = 1,
139
- size=4,
157
+ size = 4,
140
158
  HasHalfPacket = 1,
141
159
 
142
- HasAdd = 1,
143
- HasSub = 1,
144
- HasMul = 1,
145
- HasDiv = 1,
146
- HasMin = 1,
147
- HasMax = 1,
148
- HasAbs = 1,
149
- HasSin = 0,
150
- HasCos = 0,
151
- HasLog = 0,
152
- HasExp = 1,
160
+ HasAdd = 1,
161
+ HasSub = 1,
162
+ HasMul = 1,
163
+ HasDiv = 1,
164
+ HasMin = 1,
165
+ HasMax = 1,
166
+ HasAbs = 1,
167
+ HasSin = EIGEN_FAST_MATH,
168
+ HasCos = EIGEN_FAST_MATH,
169
+ HasLog = 1,
170
+ HasExp = 1,
153
171
  #ifdef __VSX__
154
172
  HasSqrt = 1,
155
173
  #if !EIGEN_COMP_CLANG
@@ -160,16 +178,62 @@ template<> struct packet_traits<float> : default_packet_traits
160
178
  #else
161
179
  HasSqrt = 0,
162
180
  HasRsqrt = 0,
181
+ HasTanh = EIGEN_FAST_MATH,
182
+ HasErf = EIGEN_FAST_MATH,
163
183
  #endif
164
184
  HasRound = 1,
165
185
  HasFloor = 1,
166
186
  HasCeil = 1,
187
+ HasRint = 1,
167
188
  HasNegate = 1,
168
189
  HasBlend = 1
169
190
  };
170
191
  };
171
- template<> struct packet_traits<int> : default_packet_traits
172
- {
192
+ template <>
193
+ struct packet_traits<bfloat16> : default_packet_traits {
194
+ typedef Packet8bf type;
195
+ typedef Packet8bf half;
196
+ enum {
197
+ Vectorizable = 1,
198
+ AlignedOnScalar = 1,
199
+ size = 8,
200
+ HasHalfPacket = 0,
201
+
202
+ HasAdd = 1,
203
+ HasSub = 1,
204
+ HasMul = 1,
205
+ HasDiv = 1,
206
+ HasMin = 1,
207
+ HasMax = 1,
208
+ HasAbs = 1,
209
+ HasSin = EIGEN_FAST_MATH,
210
+ HasCos = EIGEN_FAST_MATH,
211
+ HasLog = 1,
212
+ HasExp = 1,
213
+ #ifdef __VSX__
214
+ HasSqrt = 1,
215
+ #if !EIGEN_COMP_CLANG
216
+ HasRsqrt = 1,
217
+ #else
218
+ HasRsqrt = 0,
219
+ #endif
220
+ #else
221
+ HasSqrt = 0,
222
+ HasRsqrt = 0,
223
+ HasTanh = EIGEN_FAST_MATH,
224
+ HasErf = EIGEN_FAST_MATH,
225
+ #endif
226
+ HasRound = 1,
227
+ HasFloor = 1,
228
+ HasCeil = 1,
229
+ HasRint = 1,
230
+ HasNegate = 1,
231
+ HasBlend = 1
232
+ };
233
+ };
234
+
235
+ template <>
236
+ struct packet_traits<int> : default_packet_traits {
173
237
  typedef Packet4i type;
174
238
  typedef Packet4i half;
175
239
  enum {
@@ -178,6 +242,79 @@ template<> struct packet_traits<int> : default_packet_traits
178
242
  size = 4,
179
243
  HasHalfPacket = 0,
180
244
 
245
+ HasAdd = 1,
246
+ HasSub = 1,
247
+ HasShift = 1,
248
+ HasMul = 1,
249
+ HasDiv = 0,
250
+ HasBlend = 1
251
+ };
252
+ };
253
+
254
+ template <>
255
+ struct packet_traits<short int> : default_packet_traits {
256
+ typedef Packet8s type;
257
+ typedef Packet8s half;
258
+ enum {
259
+ Vectorizable = 1,
260
+ AlignedOnScalar = 1,
261
+ size = 8,
262
+ HasHalfPacket = 0,
263
+
264
+ HasAdd = 1,
265
+ HasSub = 1,
266
+ HasMul = 1,
267
+ HasDiv = 0,
268
+ HasBlend = 1
269
+ };
270
+ };
271
+
272
+ template <>
273
+ struct packet_traits<unsigned short int> : default_packet_traits {
274
+ typedef Packet8us type;
275
+ typedef Packet8us half;
276
+ enum {
277
+ Vectorizable = 1,
278
+ AlignedOnScalar = 1,
279
+ size = 8,
280
+ HasHalfPacket = 0,
281
+
282
+ HasAdd = 1,
283
+ HasSub = 1,
284
+ HasMul = 1,
285
+ HasDiv = 0,
286
+ HasBlend = 1
287
+ };
288
+ };
289
+
290
+ template <>
291
+ struct packet_traits<signed char> : default_packet_traits {
292
+ typedef Packet16c type;
293
+ typedef Packet16c half;
294
+ enum {
295
+ Vectorizable = 1,
296
+ AlignedOnScalar = 1,
297
+ size = 16,
298
+ HasHalfPacket = 0,
299
+
300
+ HasAdd = 1,
301
+ HasSub = 1,
302
+ HasMul = 1,
303
+ HasDiv = 0,
304
+ HasBlend = 1
305
+ };
306
+ };
307
+
308
+ template <>
309
+ struct packet_traits<unsigned char> : default_packet_traits {
310
+ typedef Packet16uc type;
311
+ typedef Packet16uc half;
312
+ enum {
313
+ Vectorizable = 1,
314
+ AlignedOnScalar = 1,
315
+ size = 16,
316
+ HasHalfPacket = 0,
317
+
181
318
  HasAdd = 1,
182
319
  HasSub = 1,
183
320
  HasMul = 1,
@@ -186,9 +323,62 @@ template<> struct packet_traits<int> : default_packet_traits
186
323
  };
187
324
  };
188
325
 
326
+ template<> struct unpacket_traits<Packet4f>
327
+ {
328
+ typedef float type;
329
+ typedef Packet4f half;
330
+ typedef Packet4i integer_packet;
331
+ enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
332
+ };
333
+ template<> struct unpacket_traits<Packet4i>
334
+ {
335
+ typedef int type;
336
+ typedef Packet4i half;
337
+ enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
338
+ };
339
+ template<> struct unpacket_traits<Packet8s>
340
+ {
341
+ typedef short int type;
342
+ typedef Packet8s half;
343
+ enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
344
+ };
345
+ template<> struct unpacket_traits<Packet8us>
346
+ {
347
+ typedef unsigned short int type;
348
+ typedef Packet8us half;
349
+ enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
350
+ };
351
+
352
+ template<> struct unpacket_traits<Packet16c>
353
+ {
354
+ typedef signed char type;
355
+ typedef Packet16c half;
356
+ enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
357
+ };
358
+ template<> struct unpacket_traits<Packet16uc>
359
+ {
360
+ typedef unsigned char type;
361
+ typedef Packet16uc half;
362
+ enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
363
+ };
189
364
 
190
- template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
191
- template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
365
+ template<> struct unpacket_traits<Packet8bf>
366
+ {
367
+ typedef bfloat16 type;
368
+ typedef Packet8bf half;
369
+ enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
370
+ };
371
+ inline std::ostream & operator <<(std::ostream & s, const Packet16c & v)
372
+ {
373
+ union {
374
+ Packet16c v;
375
+ signed char n[16];
376
+ } vt;
377
+ vt.v = v;
378
+ for (int i=0; i< 16; i++)
379
+ s << vt.n[i] << ", ";
380
+ return s;
381
+ }
192
382
 
193
383
  inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
194
384
  {
@@ -198,7 +388,7 @@ inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
198
388
  } vt;
199
389
  vt.v = v;
200
390
  for (int i=0; i< 16; i++)
201
- s << (int)vt.n[i] << ", ";
391
+ s << vt.n[i] << ", ";
202
392
  return s;
203
393
  }
204
394
 
@@ -235,148 +425,397 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
235
425
  return s;
236
426
  }
237
427
 
238
- // Need to define them first or we get specialization after instantiation errors
239
- template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
428
+ template <typename Packet>
429
+ EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from)
240
430
  {
431
+ // some versions of GCC throw "unused-but-set-parameter".
432
+ // ignoring these warnings for now.
433
+ EIGEN_UNUSED_VARIABLE(from);
241
434
  EIGEN_DEBUG_ALIGNED_LOAD
242
435
  #ifdef __VSX__
243
- return vec_vsx_ld(0, from);
436
+ return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
244
437
  #else
245
438
  return vec_ld(0, from);
246
439
  #endif
247
440
  }
248
441
 
442
+ // Need to define them first or we get specialization after instantiation errors
443
+ template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
444
+ {
445
+ return pload_common<Packet4f>(from);
446
+ }
447
+
249
448
  template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
250
449
  {
251
- EIGEN_DEBUG_ALIGNED_LOAD
252
- #ifdef __VSX__
253
- return vec_vsx_ld(0, from);
254
- #else
255
- return vec_ld(0, from);
256
- #endif
450
+ return pload_common<Packet4i>(from);
257
451
  }
258
452
 
259
- template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
453
+ template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from)
454
+ {
455
+ return pload_common<Packet8s>(from);
456
+ }
457
+
458
+ template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from)
459
+ {
460
+ return pload_common<Packet8us>(from);
461
+ }
462
+
463
+ template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const signed char* from)
464
+ {
465
+ return pload_common<Packet16c>(from);
466
+ }
467
+
468
+ template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const unsigned char* from)
469
+ {
470
+ return pload_common<Packet16uc>(from);
471
+ }
472
+
473
+ template<> EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from)
260
474
  {
475
+ return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
476
+ }
477
+
478
+ template <typename Packet>
479
+ EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){
480
+ // some versions of GCC throw "unused-but-set-parameter" (float *to).
481
+ // ignoring these warnings for now.
482
+ EIGEN_UNUSED_VARIABLE(to);
261
483
  EIGEN_DEBUG_ALIGNED_STORE
262
484
  #ifdef __VSX__
263
- vec_vsx_st(from, 0, to);
485
+ vec_xst(from, 0, to);
264
486
  #else
265
487
  vec_st(from, 0, to);
266
488
  #endif
267
489
  }
268
490
 
491
+ template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
492
+ {
493
+ pstore_common<Packet4f>(to, from);
494
+ }
495
+
269
496
  template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from)
270
497
  {
271
- EIGEN_DEBUG_ALIGNED_STORE
272
- #ifdef __VSX__
273
- vec_vsx_st(from, 0, to);
274
- #else
275
- vec_st(from, 0, to);
276
- #endif
498
+ pstore_common<Packet4i>(to, from);
277
499
  }
278
500
 
279
- template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
280
- Packet4f v = {from, from, from, from};
501
+ template<> EIGEN_STRONG_INLINE void pstore<short int>(short int* to, const Packet8s& from)
502
+ {
503
+ pstore_common<Packet8s>(to, from);
504
+ }
505
+
506
+ template<> EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int* to, const Packet8us& from)
507
+ {
508
+ pstore_common<Packet8us>(to, from);
509
+ }
510
+
511
+ template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from)
512
+ {
513
+ pstore_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from);
514
+ }
515
+
516
+ template<> EIGEN_STRONG_INLINE void pstore<signed char>(signed char* to, const Packet16c& from)
517
+ {
518
+ pstore_common<Packet16c>(to, from);
519
+ }
520
+
521
+ template<> EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char* to, const Packet16uc& from)
522
+ {
523
+ pstore_common<Packet16uc>(to, from);
524
+ }
525
+
526
+ template<typename Packet>
527
+ EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from)
528
+ {
529
+ Packet v = {from, from, from, from};
281
530
  return v;
282
531
  }
283
532
 
284
- template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
285
- Packet4i v = {from, from, from, from};
533
+ template<typename Packet>
534
+ EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet)& from)
535
+ {
536
+ Packet v = {from, from, from, from, from, from, from, from};
286
537
  return v;
287
538
  }
288
- template<> EIGEN_STRONG_INLINE void
289
- pbroadcast4<Packet4f>(const float *a,
290
- Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
539
+
540
+ template<typename Packet>
541
+ EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet)& from)
542
+ {
543
+ Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from};
544
+ return v;
545
+ }
546
+
547
+ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
548
+ return pset1_size4<Packet4f>(from);
549
+ }
550
+
551
+ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
552
+ return pset1_size4<Packet4i>(from);
553
+ }
554
+
555
+ template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int& from) {
556
+ return pset1_size8<Packet8s>(from);
557
+ }
558
+
559
+ template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int& from) {
560
+ return pset1_size8<Packet8us>(from);
561
+ }
562
+
563
+ template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const signed char& from) {
564
+ return pset1_size16<Packet16c>(from);
565
+ }
566
+
567
+ template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const unsigned char& from) {
568
+ return pset1_size16<Packet16uc>(from);
569
+ }
570
+
571
+ template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
572
+ return reinterpret_cast<Packet4f>(pset1<Packet4i>(from));
573
+ }
574
+
575
+ template<> EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
576
+ return pset1_size8<Packet8us>(reinterpret_cast<const unsigned short int&>(from));
577
+ }
578
+
579
+ template<typename Packet> EIGEN_STRONG_INLINE void
580
+ pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a,
581
+ Packet& a0, Packet& a1, Packet& a2, Packet& a3)
291
582
  {
292
- a3 = pload<Packet4f>(a);
583
+ a3 = pload<Packet>(a);
293
584
  a0 = vec_splat(a3, 0);
294
585
  a1 = vec_splat(a3, 1);
295
586
  a2 = vec_splat(a3, 2);
296
587
  a3 = vec_splat(a3, 3);
297
588
  }
589
+
590
+ template<> EIGEN_STRONG_INLINE void
591
+ pbroadcast4<Packet4f>(const float *a,
592
+ Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
593
+ {
594
+ pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
595
+ }
298
596
  template<> EIGEN_STRONG_INLINE void
299
597
  pbroadcast4<Packet4i>(const int *a,
300
598
  Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
301
599
  {
302
- a3 = pload<Packet4i>(a);
303
- a0 = vec_splat(a3, 0);
304
- a1 = vec_splat(a3, 1);
305
- a2 = vec_splat(a3, 2);
306
- a3 = vec_splat(a3, 3);
600
+ pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
601
+ }
602
+
603
+ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride)
604
+ {
605
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
606
+ a[0] = from[0*stride];
607
+ a[1] = from[1*stride];
608
+ a[2] = from[2*stride];
609
+ a[3] = from[3*stride];
610
+ return pload<Packet>(a);
307
611
  }
308
612
 
309
613
  template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
310
614
  {
311
- float EIGEN_ALIGN16 af[4];
312
- af[0] = from[0*stride];
313
- af[1] = from[1*stride];
314
- af[2] = from[2*stride];
315
- af[3] = from[3*stride];
316
- return pload<Packet4f>(af);
615
+ return pgather_common<Packet4f>(from, stride);
317
616
  }
617
+
318
618
  template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
319
619
  {
320
- int EIGEN_ALIGN16 ai[4];
321
- ai[0] = from[0*stride];
322
- ai[1] = from[1*stride];
323
- ai[2] = from[2*stride];
324
- ai[3] = from[3*stride];
325
- return pload<Packet4i>(ai);
620
+ return pgather_common<Packet4i>(from, stride);
326
621
  }
327
- template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
622
+
623
+ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size8(const __UNPACK_TYPE__(Packet)* from, Index stride)
328
624
  {
329
- float EIGEN_ALIGN16 af[4];
330
- pstore<float>(af, from);
331
- to[0*stride] = af[0];
332
- to[1*stride] = af[1];
333
- to[2*stride] = af[2];
334
- to[3*stride] = af[3];
625
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8];
626
+ a[0] = from[0*stride];
627
+ a[1] = from[1*stride];
628
+ a[2] = from[2*stride];
629
+ a[3] = from[3*stride];
630
+ a[4] = from[4*stride];
631
+ a[5] = from[5*stride];
632
+ a[6] = from[6*stride];
633
+ a[7] = from[7*stride];
634
+ return pload<Packet>(a);
335
635
  }
336
- template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
636
+
637
+ template<> EIGEN_DEVICE_FUNC inline Packet8s pgather<short int, Packet8s>(const short int* from, Index stride)
337
638
  {
338
- int EIGEN_ALIGN16 ai[4];
339
- pstore<int>((int *)ai, from);
340
- to[0*stride] = ai[0];
341
- to[1*stride] = ai[1];
342
- to[2*stride] = ai[2];
343
- to[3*stride] = ai[3];
639
+ return pgather_size8<Packet8s>(from, stride);
344
640
  }
345
641
 
346
- template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
347
- template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return pset1<Packet4i>(a) + p4i_COUNTDOWN; }
348
-
349
- template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return a + b; }
350
- template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return a + b; }
642
+ template<> EIGEN_DEVICE_FUNC inline Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from, Index stride)
643
+ {
644
+ return pgather_size8<Packet8us>(from, stride);
645
+ }
351
646
 
352
- template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return a - b; }
353
- template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return a - b; }
647
+ template<> EIGEN_DEVICE_FUNC inline Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
648
+ {
649
+ return pgather_size8<Packet8bf>(from, stride);
650
+ }
354
651
 
355
- template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; }
356
- template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; }
652
+ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size16(const __UNPACK_TYPE__(Packet)* from, Index stride)
653
+ {
654
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16];
655
+ a[0] = from[0*stride];
656
+ a[1] = from[1*stride];
657
+ a[2] = from[2*stride];
658
+ a[3] = from[3*stride];
659
+ a[4] = from[4*stride];
660
+ a[5] = from[5*stride];
661
+ a[6] = from[6*stride];
662
+ a[7] = from[7*stride];
663
+ a[8] = from[8*stride];
664
+ a[9] = from[9*stride];
665
+ a[10] = from[10*stride];
666
+ a[11] = from[11*stride];
667
+ a[12] = from[12*stride];
668
+ a[13] = from[13*stride];
669
+ a[14] = from[14*stride];
670
+ a[15] = from[15*stride];
671
+ return pload<Packet>(a);
672
+ }
357
673
 
358
- template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
359
- template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
360
674
 
361
- template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); }
362
- template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return a * b; }
675
+ template<> EIGEN_DEVICE_FUNC inline Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride)
676
+ {
677
+ return pgather_size16<Packet16c>(from, stride);
678
+ }
363
679
 
364
- template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
680
+ template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from, Index stride)
365
681
  {
366
- #ifndef __VSX__ // VSX actually provides a div instruction
367
- Packet4f t, y_0, y_1;
682
+ return pgather_size16<Packet16uc>(from, stride);
683
+ }
368
684
 
369
- // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
370
- y_0 = vec_re(b);
685
+ template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
686
+ {
687
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
688
+ pstore<__UNPACK_TYPE__(Packet)>(a, from);
689
+ to[0*stride] = a[0];
690
+ to[1*stride] = a[1];
691
+ to[2*stride] = a[2];
692
+ to[3*stride] = a[3];
693
+ }
371
694
 
372
- // Do one Newton-Raphson iteration to get the needed accuracy
373
- t = vec_nmsub(y_0, b, p4f_ONE);
374
- y_1 = vec_madd(y_0, t, y_0);
695
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
696
+ {
697
+ pscatter_size4<Packet4f>(to, from, stride);
698
+ }
375
699
 
376
- return vec_madd(a, y_1, p4f_MZERO);
377
- #else
378
- return vec_div(a, b);
379
- #endif
700
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
701
+ {
702
+ pscatter_size4<Packet4i>(to, from, stride);
703
+ }
704
+
705
+ template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size8(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
706
+ {
707
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8];
708
+ pstore<__UNPACK_TYPE__(Packet)>(a, from);
709
+ to[0*stride] = a[0];
710
+ to[1*stride] = a[1];
711
+ to[2*stride] = a[2];
712
+ to[3*stride] = a[3];
713
+ to[4*stride] = a[4];
714
+ to[5*stride] = a[5];
715
+ to[6*stride] = a[6];
716
+ to[7*stride] = a[7];
717
+ }
718
+
719
+
720
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<short int, Packet8s>(short int* to, const Packet8s& from, Index stride)
721
+ {
722
+ pscatter_size8<Packet8s>(to, from, stride);
723
+ }
724
+
725
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride)
726
+ {
727
+ pscatter_size8<Packet8us>(to, from, stride);
728
+ }
729
+
730
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
731
+ {
732
+ pscatter_size8<Packet8bf>(to, from, stride);
733
+ }
734
+
735
+ template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size16(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
736
+ {
737
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16];
738
+ pstore<__UNPACK_TYPE__(Packet)>(a, from);
739
+ to[0*stride] = a[0];
740
+ to[1*stride] = a[1];
741
+ to[2*stride] = a[2];
742
+ to[3*stride] = a[3];
743
+ to[4*stride] = a[4];
744
+ to[5*stride] = a[5];
745
+ to[6*stride] = a[6];
746
+ to[7*stride] = a[7];
747
+ to[8*stride] = a[8];
748
+ to[9*stride] = a[9];
749
+ to[10*stride] = a[10];
750
+ to[11*stride] = a[11];
751
+ to[12*stride] = a[12];
752
+ to[13*stride] = a[13];
753
+ to[14*stride] = a[14];
754
+ to[15*stride] = a[15];
755
+ }
756
+
757
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride)
758
+ {
759
+ pscatter_size16<Packet16c>(to, from, stride);
760
+ }
761
+
762
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride)
763
+ {
764
+ pscatter_size16<Packet16uc>(to, from, stride);
765
+ }
766
+
767
+ template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
768
+ template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return pset1<Packet4i>(a) + p4i_COUNTDOWN; }
769
+ template<> EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const short int& a) { return pset1<Packet8s>(a) + p8s_COUNTDOWN; }
770
+ template<> EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const unsigned short int& a) { return pset1<Packet8us>(a) + p8us_COUNTDOWN; }
771
+ template<> EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const signed char& a) { return pset1<Packet16c>(a) + p16c_COUNTDOWN; }
772
+ template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const unsigned char& a) { return pset1<Packet16uc>(a) + p16uc_COUNTDOWN; }
773
+
774
+ template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f> (const Packet4f& a, const Packet4f& b) { return a + b; }
775
+ template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i> (const Packet4i& a, const Packet4i& b) { return a + b; }
776
+ template<> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui> (const Packet4ui& a, const Packet4ui& b) { return a + b; }
777
+ template<> EIGEN_STRONG_INLINE Packet8s padd<Packet8s> (const Packet8s& a, const Packet8s& b) { return a + b; }
778
+ template<> EIGEN_STRONG_INLINE Packet8us padd<Packet8us> (const Packet8us& a, const Packet8us& b) { return a + b; }
779
+ template<> EIGEN_STRONG_INLINE Packet16c padd<Packet16c> (const Packet16c& a, const Packet16c& b) { return a + b; }
780
+ template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a + b; }
781
+
782
+ template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f> (const Packet4f& a, const Packet4f& b) { return a - b; }
783
+ template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i> (const Packet4i& a, const Packet4i& b) { return a - b; }
784
+ template<> EIGEN_STRONG_INLINE Packet8s psub<Packet8s> (const Packet8s& a, const Packet8s& b) { return a - b; }
785
+ template<> EIGEN_STRONG_INLINE Packet8us psub<Packet8us> (const Packet8us& a, const Packet8us& b) { return a - b; }
786
+ template<> EIGEN_STRONG_INLINE Packet16c psub<Packet16c> (const Packet16c& a, const Packet16c& b) { return a - b; }
787
+ template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a - b; }
788
+
789
+ template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; }
790
+ template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; }
791
+
792
+ template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
793
+ template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
794
+
795
+ template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); }
796
+ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i> (const Packet4i& a, const Packet4i& b) { return a * b; }
797
+ template<> EIGEN_STRONG_INLINE Packet8s pmul<Packet8s> (const Packet8s& a, const Packet8s& b) { return vec_mul(a,b); }
798
+ template<> EIGEN_STRONG_INLINE Packet8us pmul<Packet8us> (const Packet8us& a, const Packet8us& b) { return vec_mul(a,b); }
799
+ template<> EIGEN_STRONG_INLINE Packet16c pmul<Packet16c> (const Packet16c& a, const Packet16c& b) { return vec_mul(a,b); }
800
+ template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_mul(a,b); }
801
+
802
+
803
+ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
804
+ {
805
+ #ifndef __VSX__ // VSX actually provides a div instruction
806
+ Packet4f t, y_0, y_1;
807
+
808
+ // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
809
+ y_0 = vec_re(b);
810
+
811
+ // Do one Newton-Raphson iteration to get the needed accuracy
812
+ t = vec_nmsub(y_0, b, p4f_ONE);
813
+ y_1 = vec_madd(y_0, t, y_0);
814
+
815
+ return vec_madd(a, y_1, p4f_MZERO);
816
+ #else
817
+ return vec_div(a, b);
818
+ #endif
380
819
  }
381
820
 
382
821
  template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
@@ -387,10 +826,13 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
387
826
  // for some weird raisons, it has to be overloaded for packet of integers
388
827
  template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
389
828
  template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }
829
+ template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) { return vec_madd(a,b,c); }
830
+ template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) { return vec_madd(a,b,c); }
390
831
 
391
832
  template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
392
833
  {
393
834
  #ifdef __VSX__
835
+ // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
394
836
  Packet4f ret;
395
837
  __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
396
838
  return ret;
@@ -399,10 +841,16 @@ template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const
399
841
  #endif
400
842
  }
401
843
  template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
844
+ template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_min(a, b); }
845
+ template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_min(a, b); }
846
+ template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_min(a, b); }
847
+ template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_min(a, b); }
848
+
402
849
 
403
850
  template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
404
851
  {
405
852
  #ifdef __VSX__
853
+ // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
406
854
  Packet4f ret;
407
855
  __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
408
856
  return ret;
@@ -411,79 +859,214 @@ template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const
411
859
  #endif
412
860
  }
413
861
  template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
862
+ template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_max(a, b); }
863
+ template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_max(a, b); }
864
+ template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_max(a, b); }
865
+ template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); }
866
+
867
+ template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmple(a,b)); }
868
+ template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmplt(a,b)); }
869
+ template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmpeq(a,b)); }
870
+ template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
871
+ Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a,b));
872
+ return vec_nor(c,c);
873
+ }
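pcmp_lt_or_nan needs an all-ones mask whenever a < b or either operand is NaN, so it is computed as the bitwise complement of a >= b, a comparison that is false for NaN. A scalar sketch of the same predicate (lt_or_nan_mask is an illustrative name, not an Eigen identifier):

    #include <cstdint>

    inline std::uint32_t lt_or_nan_mask(float a, float b) {
      // vec_cmpge(a,b) yields all zeros when a < b or when a or b is NaN;
      // vec_nor(c,c) then inverts it into the desired all-ones mask.
      return (a >= b) ? 0x00000000u : 0xFFFFFFFFu;
    }
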
874
+
875
+ template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmple(a,b)); }
876
+ template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmplt(a,b)); }
877
+ template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmpeq(a,b)); }
878
+ template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmple(a,b)); }
879
+ template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmplt(a,b)); }
880
+ template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmpeq(a,b)); }
881
+ template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmple(a,b)); }
882
+ template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmplt(a,b)); }
883
+ template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmpeq(a,b)); }
884
+ template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmple(a,b)); }
885
+ template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmplt(a,b)); }
886
+ template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmpeq(a,b)); }
887
+ template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmple(a,b)); }
888
+ template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmplt(a,b)); }
889
+ template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmpeq(a,b)); }
414
890
 
415
891
  template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
416
892
  template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
893
+ template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vec_and(a, b); }
894
+ template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_and(a, b); }
895
+ template<> EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
896
+ return pand<Packet8us>(a, b);
897
+ }
898
+
417
899
 
418
900
  template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
419
901
  template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
902
+ template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_or(a, b); }
903
+ template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_or(a, b); }
904
+ template<> EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
905
+ return por<Packet8us>(a, b);
906
+ }
420
907
 
421
908
  template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
422
909
  template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
910
+ template<> EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
911
+ return pxor<Packet8us>(a, b);
912
+ }
423
913
 
424
- template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
425
- template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
914
+ template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_andc(a, b); }
915
+ template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_andc(a, b); }
426
916
 
427
- template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return vec_round(a); }
917
+ template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
918
+ return vec_sel(b, a, reinterpret_cast<Packet4ui>(mask));
919
+ }
920
+
921
+ template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
922
+ {
923
+ Packet4f t = vec_add(reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
924
+ Packet4f res;
925
+
926
+ #ifdef __VSX__
927
+ __asm__("xvrspiz %x0, %x1\n\t"
928
+ : "=&wa" (res)
929
+ : "wa" (t));
930
+ #else
931
+ __asm__("vrfiz %0, %1\n\t"
932
+ : "=v" (res)
933
+ : "v" (t));
934
+ #endif
935
+
936
+ return res;
937
+ }
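pround above biases the input by a value just below one half, carrying the input's sign (p4ui_SIGN selects the sign bit; the name p4ui_PREV0DOT5 suggests the largest float below 0.5), and then truncates toward zero with xvrspiz/vrfiz. A scalar sketch of the same bias-and-truncate idea, with round_away_from_zero and prev_half as illustrative names:

    #include <cmath>

    inline float round_away_from_zero(float a) {
      const float prev_half = std::nextafter(0.5f, 0.0f);       // stands in for p4ui_PREV0DOT5
      const float biased    = a + std::copysign(prev_half, a);  // sign-aware bias, like the vec_and/vec_or + vec_add above
      return std::trunc(biased);                                // xvrspiz / vrfiz: round toward zero
    }

Using a bias slightly below 0.5 avoids inputs such as 0.49999997f being pushed up to 1 by the rounding of the addition itself.
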
428
938
  template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return vec_ceil(a); }
429
939
  template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
940
+ template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
941
+ {
942
+ Packet4f res;
430
943
 
431
- #ifdef _BIG_ENDIAN
432
- template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
944
+ __asm__("xvrspic %x0, %x1\n\t"
945
+ : "=&wa" (res)
946
+ : "wa" (a));
947
+
948
+ return res;
949
+ }
950
+
951
+ template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from)
433
952
  {
434
953
  EIGEN_DEBUG_ALIGNED_LOAD
954
+ #ifdef _BIG_ENDIAN
435
955
  Packet16uc MSQ, LSQ;
436
956
  Packet16uc mask;
437
957
  MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
438
958
  LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
439
959
  mask = vec_lvsl(0, from); // create the permute mask
440
- return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data
960
+ //TODO: Add static_cast here
961
+ return (Packet) vec_perm(MSQ, LSQ, mask); // align the data
962
+ #else
963
+ EIGEN_DEBUG_UNALIGNED_LOAD
964
+ return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
965
+ #endif
966
+ }
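On big-endian targets ploadu_common loads the two aligned quadwords that straddle the source address and uses the vec_lvsl permute mask to pull out the 16 unaligned bytes; on little-endian VSX targets vec_xl performs the unaligned load directly. A scalar model of the big-endian path (unaligned_load16 and the local names are illustrative, not Eigen identifiers):

    #include <cstdint>
    #include <cstring>

    inline void unaligned_load16(const unsigned char* from, unsigned char out[16]) {
      // vec_ld(0, from) and vec_ld(15, from) read the aligned 16-byte blocks
      // containing from and from + 15 (the same block when from is aligned).
      // Like the hardware, this may touch bytes outside [from, from+16), but
      // never outside those two aligned blocks.
      const unsigned char* msq_base = reinterpret_cast<const unsigned char*>(
          reinterpret_cast<std::uintptr_t>(from) & ~std::uintptr_t(15));
      const unsigned char* lsq_base = reinterpret_cast<const unsigned char*>(
          reinterpret_cast<std::uintptr_t>(from + 15) & ~std::uintptr_t(15));
      unsigned char msq[16], lsq[16];
      std::memcpy(msq, msq_base, 16);
      std::memcpy(lsq, lsq_base, 16);
      // vec_lvsl(0, from) encodes the byte offset of from within its block;
      // vec_perm(MSQ, LSQ, mask) selects 16 consecutive bytes of MSQ||LSQ from there.
      const std::size_t offset = static_cast<std::size_t>(from - msq_base);
      for (std::size_t i = 0; i < 16; ++i)
        out[i] = (offset + i < 16) ? msq[offset + i] : lsq[offset + i - 16];
    }
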
441
967
 
968
+ template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
969
+ {
970
+ return ploadu_common<Packet4f>(from);
442
971
  }
443
972
  template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
444
973
  {
445
- EIGEN_DEBUG_ALIGNED_LOAD
446
- // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
447
- Packet16uc MSQ, LSQ;
448
- Packet16uc mask;
449
- MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
450
- LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
451
- mask = vec_lvsl(0, from); // create the permute mask
452
- return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data
974
+ return ploadu_common<Packet4i>(from);
453
975
  }
454
- #else
455
- // We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
456
- template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
976
+ template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from)
457
977
  {
458
- EIGEN_DEBUG_UNALIGNED_LOAD
459
- return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from));
978
+ return ploadu_common<Packet8s>(from);
460
979
  }
461
- template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
980
+ template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from)
462
981
  {
463
- EIGEN_DEBUG_UNALIGNED_LOAD
464
- return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));
982
+ return ploadu_common<Packet8us>(from);
983
+ }
984
+ template<> EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from)
985
+ {
986
+ return ploadu_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
987
+ }
988
+ template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const signed char* from)
989
+ {
990
+ return ploadu_common<Packet16c>(from);
991
+ }
992
+ template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from)
993
+ {
994
+ return ploadu_common<Packet16uc>(from);
465
995
  }
466
- #endif
467
996
 
468
- template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
997
+ template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from)
469
998
  {
470
- Packet4f p;
471
- if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4f>(from);
472
- else p = ploadu<Packet4f>(from);
999
+ Packet p;
1000
+ if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet>(from);
1001
+ else p = ploadu<Packet>(from);
473
1002
  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
474
1003
  }
1004
+ template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
1005
+ {
1006
+ return ploaddup_common<Packet4f>(from);
1007
+ }
475
1008
  template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
476
1009
  {
477
- Packet4i p;
478
- if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4i>(from);
479
- else p = ploadu<Packet4i>(from);
480
- return vec_perm(p, p, p16uc_DUPLICATE32_HI);
1010
+ return ploaddup_common<Packet4i>(from);
481
1011
  }
482
1012
 
483
- #ifdef _BIG_ENDIAN
484
- template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
1013
+ template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int* from)
1014
+ {
1015
+ Packet8s p;
1016
+ if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8s>(from);
1017
+ else p = ploadu<Packet8s>(from);
1018
+ return vec_perm(p, p, p16uc_DUPLICATE16_HI);
1019
+ }
1020
+
1021
+ template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int* from)
1022
+ {
1023
+ Packet8us p;
1024
+ if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8us>(from);
1025
+ else p = ploadu<Packet8us>(from);
1026
+ return vec_perm(p, p, p16uc_DUPLICATE16_HI);
1027
+ }
1028
+
1029
+ template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int* from)
1030
+ {
1031
+ Packet8s p;
1032
+ if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8s>(from);
1033
+ else p = ploadu<Packet8s>(from);
1034
+ return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
1035
+ }
1036
+
1037
+ template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int* from)
1038
+ {
1039
+ Packet8us p;
1040
+ if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8us>(from);
1041
+ else p = ploadu<Packet8us>(from);
1042
+ return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
1043
+ }
1044
+
1045
+ template<> EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from)
1046
+ {
1047
+ return ploadquad<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
1048
+ }
1049
+
1050
+ template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char* from)
1051
+ {
1052
+ Packet16c p;
1053
+ if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet16c>(from);
1054
+ else p = ploadu<Packet16c>(from);
1055
+ return vec_perm(p, p, p16uc_DUPLICATE8_HI);
1056
+ }
1057
+
1058
+ template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char* from)
1059
+ {
1060
+ Packet16uc p;
1061
+ if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet16uc>(from);
1062
+ else p = ploadu<Packet16uc>(from);
1063
+ return vec_perm(p, p, p16uc_DUPLICATE8_HI);
1064
+ }
1065
+
1066
+ template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)* to, const Packet& from)
485
1067
  {
486
1068
  EIGEN_DEBUG_UNALIGNED_STORE
1069
+ #ifdef _BIG_ENDIAN
487
1070
  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
488
1071
  // Warning: not thread safe!
489
1072
  Packet16uc MSQ, LSQ, edges;
@@ -497,45 +1080,69 @@ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& f
497
1080
  MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ)
498
1081
  LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ)
499
1082
  vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
500
- vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
1083
+ vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part second
1084
+ #else
1085
+ vec_xst(from, 0, to);
1086
+ #endif
1087
+ }
1088
+ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
1089
+ {
1090
+ pstoreu_common<Packet4f>(to, from);
501
1091
  }
502
1092
  template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
503
1093
  {
504
- EIGEN_DEBUG_UNALIGNED_STORE
505
- // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
506
- // Warning: not thread safe!
507
- Packet16uc MSQ, LSQ, edges;
508
- Packet16uc edgeAlign, align;
509
-
510
- MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword
511
- LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword
512
- edgeAlign = vec_lvsl(0, to); // permute map to extract edges
513
- edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges
514
- align = vec_lvsr( 0, to ); // permute map to misalign data
515
- MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ)
516
- LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ)
517
- vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
518
- vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
1094
+ pstoreu_common<Packet4i>(to, from);
519
1095
  }
520
- #else
521
- // We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
522
- template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
1096
+ template<> EIGEN_STRONG_INLINE void pstoreu<short int>(short int* to, const Packet8s& from)
523
1097
  {
524
- EIGEN_DEBUG_ALIGNED_STORE
525
- vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to));
1098
+ pstoreu_common<Packet8s>(to, from);
526
1099
  }
527
- template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
1100
+ template<> EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int* to, const Packet8us& from)
528
1101
  {
529
- EIGEN_DEBUG_ALIGNED_STORE
530
- vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
1102
+ pstoreu_common<Packet8us>(to, from);
1103
+ }
1104
+ template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from)
1105
+ {
1106
+ pstoreu_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from);
1107
+ }
1108
+ template<> EIGEN_STRONG_INLINE void pstoreu<signed char>(signed char* to, const Packet16c& from)
1109
+ {
1110
+ pstoreu_common<Packet16c>(to, from);
1111
+ }
1112
+ template<> EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* to, const Packet16uc& from)
1113
+ {
1114
+ pstoreu_common<Packet16uc>(to, from);
531
1115
  }
532
- #endif
533
1116
 
534
1117
  template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); }
535
1118
  template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); }
536
1119
 
537
- template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
538
- template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
1120
+ template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; }
1121
+ template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int x; vec_ste(a, 0, &x); return x; }
1122
+
1123
+ template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) {
1124
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x;
1125
+ vec_ste(a, 0, &x);
1126
+ return x;
1127
+ }
1128
+
1129
+ template<> EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) {
1130
+ return pfirst_common<Packet8s>(a);
1131
+ }
1132
+
1133
+ template<> EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) {
1134
+ return pfirst_common<Packet8us>(a);
1135
+ }
1136
+
1137
+ template<> EIGEN_STRONG_INLINE signed char pfirst<Packet16c>(const Packet16c& a)
1138
+ {
1139
+ return pfirst_common<Packet16c>(a);
1140
+ }
1141
+
1142
+ template<> EIGEN_STRONG_INLINE unsigned char pfirst<Packet16uc>(const Packet16uc& a)
1143
+ {
1144
+ return pfirst_common<Packet16uc>(a);
1145
+ }
539
1146
 
540
1147
  template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
541
1148
  {
@@ -543,10 +1150,296 @@ template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
543
1150
  }
544
1151
  template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
545
1152
  {
546
- return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32)); }
1153
+ return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
1154
+ }
1155
+ template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a)
1156
+ {
1157
+ return reinterpret_cast<Packet8s>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
1158
+ }
1159
+ template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a)
1160
+ {
1161
+ return reinterpret_cast<Packet8us>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
1162
+ }
1163
+ template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a)
1164
+ {
1165
+ return vec_perm(a, a, p16uc_REVERSE8);
1166
+ }
1167
+ template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a)
1168
+ {
1169
+ return vec_perm(a, a, p16uc_REVERSE8);
1170
+ }
1171
+ template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a)
1172
+ {
1173
+ return preverse<Packet8us>(a);
1174
+ }
547
1175
 
548
1176
  template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
549
1177
  template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
1178
+ template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vec_abs(a); }
1179
+ template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; }
1180
+ template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vec_abs(a); }
1181
+ template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; }
1182
+ template<> EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
1183
+ _EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask,0x7FFF);
1184
+ return pand<Packet8us>(p8us_abs_mask, a);
1185
+ }
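bfloat16 keeps the IEEE sign in bit 15 of each 16-bit lane, so pabs<Packet8bf> only has to clear that bit (the unsigned packets above are already their own absolute values). A one-line scalar equivalent, with bf16_abs_bits as an illustrative name:

    #include <cstdint>

    inline std::uint16_t bf16_abs_bits(std::uint16_t bits) { return bits & 0x7FFFu; }
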
1186
+
1187
+ template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a)
1188
+ { return vec_sra(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
1189
+ template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a)
1190
+ { return vec_sr(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
1191
+ template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a)
1192
+ { return vec_sl(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
1193
+ template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a)
1194
+ {
1195
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1196
+ Packet4ui r = vec_sl(reinterpret_cast<Packet4ui>(a), p4ui_mask);
1197
+ return reinterpret_cast<Packet4f>(r);
1198
+ }
1199
+
1200
+ template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a)
1201
+ {
1202
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1203
+ Packet4ui r = vec_sr(reinterpret_cast<Packet4ui>(a), p4ui_mask);
1204
+ return reinterpret_cast<Packet4f>(r);
1205
+ }
1206
+
1207
+ template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a)
1208
+ {
1209
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1210
+ return vec_sr(a, p4ui_mask);
1211
+ }
1212
+
1213
+ template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a)
1214
+ {
1215
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1216
+ return vec_sl(a, p4ui_mask);
1217
+ }
1218
+
1219
+ template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a)
1220
+ {
1221
+ const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
1222
+ return vec_sl(a, p8us_mask);
1223
+ }
1224
+ template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a)
1225
+ {
1226
+ const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
1227
+ return vec_sr(a, p8us_mask);
1228
+ }
1229
+
1230
+ EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf){
1231
+ return plogical_shift_left<16>(reinterpret_cast<Packet4f>(bf.m_val));
1232
+ }
1233
+
1234
+ EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf){
1235
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
1236
+ return pand<Packet4f>(
1237
+ reinterpret_cast<Packet4f>(bf.m_val),
1238
+ reinterpret_cast<Packet4f>(p4ui_high_mask)
1239
+ );
1240
+ }
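Both helpers rely on a bfloat16 value being exactly the top 16 bits of the corresponding float: an even lane sits in the low half of its 32-bit slot and is shifted up, while an odd lane already occupies the high half and only needs its low 16 bits cleared. A scalar sketch of the widening (bf16_bits_to_float is an illustrative name, not an Eigen identifier):

    #include <cstdint>
    #include <cstring>

    inline float bf16_bits_to_float(std::uint16_t bits) {
      const std::uint32_t widened = static_cast<std::uint32_t>(bits) << 16;  // bf16 bits become the float's top half
      float f;
      std::memcpy(&f, &widened, sizeof(f));
      return f;
    }
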
1241
+
1242
+ // Simple interleaving of bool masks; prevents true values from being
1243
+ // converted to NaNs.
1244
+ EIGEN_STRONG_INLINE Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd) {
1245
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
1246
+ Packet4f bf_odd, bf_even;
1247
+ bf_odd = pand(reinterpret_cast<Packet4f>(p4ui_high_mask), odd);
1248
+ bf_even = plogical_shift_right<16>(even);
1249
+ return reinterpret_cast<Packet8us>(por<Packet4f>(bf_even, bf_odd));
1250
+ }
1251
+
1252
+ EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f){
1253
+ Packet4ui input = reinterpret_cast<Packet4ui>(p4f);
1254
+ Packet4ui lsb = plogical_shift_right<16>(input);
1255
+ lsb = pand<Packet4ui>(lsb, reinterpret_cast<Packet4ui>(p4i_ONE));
1256
+
1257
+ _EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS,0x7FFFu);
1258
+ Packet4ui rounding_bias = padd<Packet4ui>(lsb, p4ui_BIAS);
1259
+ input = padd<Packet4ui>(input, rounding_bias);
1260
+
1261
+ //Test NaN and Subnormal - Begin
1262
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000);
1263
+ Packet4ui exp = pand<Packet4ui>(p4ui_exp_mask, reinterpret_cast<Packet4ui>(p4f));
1264
+
1265
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF);
1266
+ Packet4ui mantissa = pand<Packet4ui>(p4ui_mantissa_mask, reinterpret_cast<Packet4ui>(p4f));
1267
+
1268
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(max_exp, 0x7F800000);
1269
+ Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_max_exp);
1270
+ Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet4ui>(p4i_ZERO));
1271
+
1272
+ Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet4ui>(p4i_ZERO));
1273
+ Packet4ui nan_selector = pandnot<Packet4ui>(
1274
+ reinterpret_cast<Packet4ui>(is_max_exp),
1275
+ reinterpret_cast<Packet4ui>(is_mant_zero)
1276
+ );
1277
+
1278
+ Packet4ui subnormal_selector = pandnot<Packet4ui>(
1279
+ reinterpret_cast<Packet4ui>(is_zero_exp),
1280
+ reinterpret_cast<Packet4ui>(is_mant_zero)
1281
+ );
1282
+
1283
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000);
1284
+ input = vec_sel(input, p4ui_nan, nan_selector);
1285
+ input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
1286
+ //Test NaN and Subnormal - End
1287
+
1288
+ input = plogical_shift_right<16>(input);
1289
+ return reinterpret_cast<Packet8us>(input);
1290
+ }
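F32ToBf16 is the usual round-to-nearest-even float to bfloat16 conversion: add a bias of 0x7FFF plus the current low bit of the would-be bf16 mantissa, replace NaNs with a quiet-NaN pattern, pass subnormals through unrounded, and keep the upper 16 bits. A scalar sketch of the same steps (float_to_bf16_bits is an illustrative name, not an Eigen identifier):

    #include <cstdint>
    #include <cstring>

    inline std::uint16_t float_to_bf16_bits(float f) {
      std::uint32_t input;
      std::memcpy(&input, &f, sizeof(input));
      const std::uint32_t lsb      = (input >> 16) & 1u;       // low bit of the bf16 result
      std::uint32_t       rounded  = input + (lsb + 0x7FFFu);  // round to nearest, ties to even
      const std::uint32_t exp      = input & 0x7F800000u;
      const std::uint32_t mantissa = input & 0x007FFFFFu;
      if (exp == 0x7F800000u && mantissa != 0u) rounded = 0x7FC00000u;  // NaN -> quiet NaN
      if (exp == 0u          && mantissa != 0u) rounded = input;        // subnormal: keep as-is
      return static_cast<std::uint16_t>(rounded >> 16);
    }
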
1291
+
1292
+ EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd){
1293
+ Packet4f bf_odd, bf_even;
1294
+ bf_odd = reinterpret_cast<Packet4f>(F32ToBf16(odd).m_val);
1295
+ bf_odd = plogical_shift_left<16>(bf_odd);
1296
+ bf_even = reinterpret_cast<Packet4f>(F32ToBf16(even).m_val);
1297
+ return reinterpret_cast<Packet8us>(por<Packet4f>(bf_even, bf_odd));
1298
+ }
1299
+ #define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A) \
1300
+ Packet4f a_even = Bf16ToF32Even(A);\
1301
+ Packet4f a_odd = Bf16ToF32Odd(A);\
1302
+ Packet4f op_even = OP(a_even);\
1303
+ Packet4f op_odd = OP(a_odd);\
1304
+ return F32ToBf16(op_even, op_odd);\
1305
+
1306
+ #define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \
1307
+ Packet4f a_even = Bf16ToF32Even(A);\
1308
+ Packet4f a_odd = Bf16ToF32Odd(A);\
1309
+ Packet4f b_even = Bf16ToF32Even(B);\
1310
+ Packet4f b_odd = Bf16ToF32Odd(B);\
1311
+ Packet4f op_even = OP(a_even, b_even);\
1312
+ Packet4f op_odd = OP(a_odd, b_odd);\
1313
+ return F32ToBf16(op_even, op_odd);\
1314
+
1315
+ #define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \
1316
+ Packet4f a_even = Bf16ToF32Even(A);\
1317
+ Packet4f a_odd = Bf16ToF32Odd(A);\
1318
+ Packet4f b_even = Bf16ToF32Even(B);\
1319
+ Packet4f b_odd = Bf16ToF32Odd(B);\
1320
+ Packet4f op_even = OP(a_even, b_even);\
1321
+ Packet4f op_odd = OP(a_odd, b_odd);\
1322
+ return F32ToBf16Bool(op_even, op_odd);\
1323
+
1324
+ template<> EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1325
+ BF16_TO_F32_BINARY_OP_WRAPPER(padd<Packet4f>, a, b);
1326
+ }
1327
+
1328
+ template<> EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1329
+ BF16_TO_F32_BINARY_OP_WRAPPER(pmul<Packet4f>, a, b);
1330
+ }
1331
+
1332
+ template<> EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1333
+ BF16_TO_F32_BINARY_OP_WRAPPER(pdiv<Packet4f>, a, b);
1334
+ }
1335
+
1336
+ template<> EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(const Packet8bf& a) {
1337
+ BF16_TO_F32_UNARY_OP_WRAPPER(pnegate<Packet4f>, a);
1338
+ }
1339
+
1340
+ template<> EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1341
+ BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b);
1342
+ }
1343
+
1344
+ template<> EIGEN_STRONG_INLINE Packet8bf psqrt<Packet8bf> (const Packet8bf& a){
1345
+ BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a);
1346
+ }
1347
+ template<> EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf> (const Packet8bf& a){
1348
+ BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt<Packet4f>, a);
1349
+ }
1350
+ template<> EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf> (const Packet8bf& a){
1351
+ BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
1352
+ }
1353
+
1354
+ template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
1355
+ return pldexp_generic(a,exponent);
1356
+ }
1357
+ template<> EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf> (const Packet8bf& a, const Packet8bf& exponent){
1358
+ BF16_TO_F32_BINARY_OP_WRAPPER(pldexp<Packet4f>, a, exponent);
1359
+ }
1360
+
1361
+ template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
1362
+ return pfrexp_generic(a,exponent);
1363
+ }
1364
+ template<> EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf> (const Packet8bf& a, Packet8bf& e){
1365
+ Packet4f a_even = Bf16ToF32Even(a);
1366
+ Packet4f a_odd = Bf16ToF32Odd(a);
1367
+ Packet4f e_even;
1368
+ Packet4f e_odd;
1369
+ Packet4f op_even = pfrexp<Packet4f>(a_even, e_even);
1370
+ Packet4f op_odd = pfrexp<Packet4f>(a_odd, e_odd);
1371
+ e = F32ToBf16(e_even, e_odd);
1372
+ return F32ToBf16(op_even, op_odd);
1373
+ }
1374
+
1375
+ template<> EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf> (const Packet8bf& a){
1376
+ BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a);
1377
+ }
1378
+ template<> EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf> (const Packet8bf& a){
1379
+ BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a);
1380
+ }
1381
+ template<> EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf> (const Packet8bf& a){
1382
+ BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a);
1383
+ }
1384
+ template<> EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf> (const Packet8bf& a){
1385
+ BF16_TO_F32_UNARY_OP_WRAPPER(pfloor<Packet4f>, a);
1386
+ }
1387
+ template<> EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf> (const Packet8bf& a){
1388
+ BF16_TO_F32_UNARY_OP_WRAPPER(pceil<Packet4f>, a);
1389
+ }
1390
+ template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf> (const Packet8bf& a){
1391
+ BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
1392
+ }
1393
+ template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf> (const Packet8bf& a){
1394
+ BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
1395
+ }
1396
+ template<> EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
1397
+ Packet4f a_even = Bf16ToF32Even(a);
1398
+ Packet4f a_odd = Bf16ToF32Odd(a);
1399
+ Packet4f b_even = Bf16ToF32Even(b);
1400
+ Packet4f b_odd = Bf16ToF32Odd(b);
1401
+ Packet4f c_even = Bf16ToF32Even(c);
1402
+ Packet4f c_odd = Bf16ToF32Odd(c);
1403
+ Packet4f pmadd_even = pmadd<Packet4f>(a_even, b_even, c_even);
1404
+ Packet4f pmadd_odd = pmadd<Packet4f>(a_odd, b_odd, c_odd);
1405
+ return F32ToBf16(pmadd_even, pmadd_odd);
1406
+ }
1407
+
1408
+ template<> EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1409
+ BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b);
1410
+ }
1411
+
1412
+ template<> EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1413
+ BF16_TO_F32_BINARY_OP_WRAPPER(pmax<Packet4f>, a, b);
1414
+ }
1415
+
1416
+ template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
1417
+ BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt<Packet4f>, a, b);
1418
+ }
1419
+ template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
1420
+ BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan<Packet4f>, a, b);
1421
+ }
1422
+ template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
1423
+ BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le<Packet4f>, a, b);
1424
+ }
1425
+ template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
1426
+ BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_eq<Packet4f>, a, b);
1427
+ }
1428
+
1429
+ template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) {
1430
+ return Eigen::bfloat16_impl::raw_uint16_to_bfloat16((pfirst<Packet8us>(a)));
1431
+ }
1432
+
1433
+ template<> EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from)
1434
+ {
1435
+ return ploaddup<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
1436
+ }
1437
+
1438
+ template<> EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
1439
+ bfloat16 countdown[8] = { bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3),
1440
+ bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7) };
1441
+ return padd<Packet8bf>(pset1<Packet8bf>(a), pload<Packet8bf>(countdown));
1442
+ }
550
1443
 
551
1444
  template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
552
1445
  {
@@ -558,34 +1451,6 @@ template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
558
1451
  return pfirst(sum);
559
1452
  }
560
1453
 
561
- template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
562
- {
563
- Packet4f v[4], sum[4];
564
-
565
- // It's easier and faster to transpose then add as columns
566
- // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
567
- // Do the transpose, first set of moves
568
- v[0] = vec_mergeh(vecs[0], vecs[2]);
569
- v[1] = vec_mergel(vecs[0], vecs[2]);
570
- v[2] = vec_mergeh(vecs[1], vecs[3]);
571
- v[3] = vec_mergel(vecs[1], vecs[3]);
572
- // Get the resulting vectors
573
- sum[0] = vec_mergeh(v[0], v[2]);
574
- sum[1] = vec_mergel(v[0], v[2]);
575
- sum[2] = vec_mergeh(v[1], v[3]);
576
- sum[3] = vec_mergel(v[1], v[3]);
577
-
578
- // Now do the summation:
579
- // Lines 0+1
580
- sum[0] = sum[0] + sum[1];
581
- // Lines 2+3
582
- sum[1] = sum[2] + sum[3];
583
- // Add the results
584
- sum[0] = sum[0] + sum[1];
585
-
586
- return sum[0];
587
- }
588
-
589
1454
  template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
590
1455
  {
591
1456
  Packet4i sum;
@@ -598,141 +1463,377 @@ template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
598
1463
  return pfirst(sum);
599
1464
  }
600
1465
 
601
- template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
1466
+ template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a)
1467
+ {
1468
+ float redux_even = predux<Packet4f>(Bf16ToF32Even(a));
1469
+ float redux_odd = predux<Packet4f>(Bf16ToF32Odd(a));
1470
+ float f32_result = redux_even + redux_odd;
1471
+ return bfloat16(f32_result);
1472
+ }
1473
+ template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a)
1474
+ {
1475
+ union{
1476
+ Packet v;
1477
+ __UNPACK_TYPE__(Packet) n[8];
1478
+ } vt;
1479
+ vt.v = a;
1480
+
1481
+ EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
1482
+ EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
1483
+ Packet4i first_half = pload<Packet4i>(first_loader);
1484
+ Packet4i second_half = pload<Packet4i>(second_loader);
1485
+
1486
+ return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_half) + predux(second_half));
1487
+ }
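predux_size8 spills the eight 16-bit lanes through a union, widens them to int, and reuses the Packet4i reduction on each half before adding the two partial sums. A scalar model of the same split (reduce_add8 is an illustrative name, not an Eigen identifier):

    template <typename T>
    inline T reduce_add8(const T (&v)[8]) {
      int first_half = 0, second_half = 0;          // the two pload<Packet4i> halves
      for (int i = 0; i < 4; ++i) {
        first_half  += static_cast<int>(v[i]);      // predux(first_half)
        second_half += static_cast<int>(v[i + 4]);  // predux(second_half)
      }
      return static_cast<T>(first_half + second_half);
    }
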
1488
+
1489
+ template<> EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a)
1490
+ {
1491
+ return predux_size8<Packet8s>(a);
1492
+ }
1493
+
1494
+ template<> EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a)
1495
+ {
1496
+ return predux_size8<Packet8us>(a);
1497
+ }
1498
+
1499
+ template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a)
1500
+ {
1501
+ union{
1502
+ Packet v;
1503
+ __UNPACK_TYPE__(Packet) n[16];
1504
+ } vt;
1505
+ vt.v = a;
1506
+
1507
+ EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
1508
+ EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
1509
+ EIGEN_ALIGN16 int third_loader[4] = { vt.n[8], vt.n[9], vt.n[10], vt.n[11] };
1510
+ EIGEN_ALIGN16 int fourth_loader[4] = { vt.n[12], vt.n[13], vt.n[14], vt.n[15] };
1511
+
1512
+ Packet4i first_quarter = pload<Packet4i>(first_loader);
1513
+ Packet4i second_quarter = pload<Packet4i>(second_loader);
1514
+ Packet4i third_quarter = pload<Packet4i>(third_loader);
1515
+ Packet4i fourth_quarter = pload<Packet4i>(fourth_loader);
1516
+
1517
+ return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter)
1518
+ + predux(third_quarter) + predux(fourth_quarter));
1519
+ }
1520
+
1521
+ template<> EIGEN_STRONG_INLINE signed char predux<Packet16c>(const Packet16c& a)
1522
+ {
1523
+ return predux_size16<Packet16c>(a);
1524
+ }
1525
+
1526
+ template<> EIGEN_STRONG_INLINE unsigned char predux<Packet16uc>(const Packet16uc& a)
1527
+ {
1528
+ return predux_size16<Packet16uc>(a);
1529
+ }
1530
+
1531
+ // Other reduction functions:
1532
+ // mul
1533
+ template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
1534
+ {
1535
+ Packet4f prod;
1536
+ prod = pmul(a, vec_sld(a, a, 8));
1537
+ return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
1538
+ }
1539
+
1540
+ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
1541
+ {
1542
+ EIGEN_ALIGN16 int aux[4];
1543
+ pstore(aux, a);
1544
+ return aux[0] * aux[1] * aux[2] * aux[3];
1545
+ }
1546
+
1547
+ template<> EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a)
1548
+ {
1549
+ Packet8s pair, quad, octo;
1550
+
1551
+ pair = vec_mul(a, vec_sld(a, a, 8));
1552
+ quad = vec_mul(pair, vec_sld(pair, pair, 4));
1553
+ octo = vec_mul(quad, vec_sld(quad, quad, 2));
1554
+
1555
+ return pfirst(octo);
1556
+ }
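Each vec_sld in these 8-lane reductions rotates the vector by 8, 4, and finally 2 bytes, so after three element-wise multiplies lane 0 holds the product of all eight lanes and pfirst extracts it. A scalar model of that rotate-and-combine pattern (reduce_mul8 is an illustrative name; overflow handling here is only a sketch):

    inline short reduce_mul8(const short (&v)[8]) {
      short pair[8], quad[8];
      for (int i = 0; i < 8; ++i)                    // vec_sld(a, a, 8): rotate by 4 lanes
        pair[i] = static_cast<short>(v[i] * v[(i + 4) % 8]);
      for (int i = 0; i < 8; ++i)                    // vec_sld(pair, pair, 4): rotate by 2 lanes
        quad[i] = static_cast<short>(pair[i] * pair[(i + 2) % 8]);
      return static_cast<short>(quad[0] * quad[1]);  // vec_sld(quad, quad, 2), then pfirst
    }
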
1557
+
1558
+ template<> EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a)
1559
+ {
1560
+ Packet8us pair, quad, octo;
1561
+
1562
+ pair = vec_mul(a, vec_sld(a, a, 8));
1563
+ quad = vec_mul(pair, vec_sld(pair, pair, 4));
1564
+ octo = vec_mul(quad, vec_sld(quad, quad, 2));
1565
+
1566
+ return pfirst(octo);
1567
+ }
1568
+
1569
+ template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a)
1570
+ {
1571
+ float redux_even = predux_mul<Packet4f>(Bf16ToF32Even(a));
1572
+ float redux_odd = predux_mul<Packet4f>(Bf16ToF32Odd(a));
1573
+ float f32_result = redux_even * redux_odd;
1574
+ return bfloat16(f32_result);
1575
+ }
1576
+
1577
+
1578
+ template<> EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c& a)
1579
+ {
1580
+ Packet16c pair, quad, octo, result;
1581
+
1582
+ pair = vec_mul(a, vec_sld(a, a, 8));
1583
+ quad = vec_mul(pair, vec_sld(pair, pair, 4));
1584
+ octo = vec_mul(quad, vec_sld(quad, quad, 2));
1585
+ result = vec_mul(octo, vec_sld(octo, octo, 1));
1586
+
1587
+ return pfirst(result);
1588
+ }
1589
+
1590
+ template<> EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet16uc& a)
1591
+ {
1592
+ Packet16uc pair, quad, octo, result;
1593
+
1594
+ pair = vec_mul(a, vec_sld(a, a, 8));
1595
+ quad = vec_mul(pair, vec_sld(pair, pair, 4));
1596
+ octo = vec_mul(quad, vec_sld(quad, quad, 2));
1597
+ result = vec_mul(octo, vec_sld(octo, octo, 1));
1598
+
1599
+ return pfirst(result);
1600
+ }
1601
+
1602
+ // min
1603
+ template<typename Packet> EIGEN_STRONG_INLINE
1604
+ __UNPACK_TYPE__(Packet) predux_min4(const Packet& a)
1605
+ {
1606
+ Packet b, res;
1607
+ b = vec_min(a, vec_sld(a, a, 8));
1608
+ res = vec_min(b, vec_sld(b, b, 4));
1609
+ return pfirst(res);
1610
+ }
1611
+
1612
+
1613
+ template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
1614
+ {
1615
+ return predux_min4<Packet4f>(a);
1616
+ }
1617
+
1618
+ template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
1619
+ {
1620
+ return predux_min4<Packet4i>(a);
1621
+ }
1622
+
1623
+ template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a)
1624
+ {
1625
+ float redux_even = predux_min<Packet4f>(Bf16ToF32Even(a));
1626
+ float redux_odd = predux_min<Packet4f>(Bf16ToF32Odd(a));
1627
+ float f32_result = (std::min)(redux_even, redux_odd);
1628
+ return bfloat16(f32_result);
1629
+ }
1630
+
1631
+ template<> EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a)
1632
+ {
1633
+ Packet8s pair, quad, octo;
1634
+
1635
+ //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
1636
+ pair = vec_min(a, vec_sld(a, a, 8));
1637
+
1638
+ //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
1639
+ quad = vec_min(pair, vec_sld(pair, pair, 4));
1640
+
1641
+ //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
1642
+ octo = vec_min(quad, vec_sld(quad, quad, 2));
1643
+ return pfirst(octo);
1644
+ }
1645
+
1646
+ template<> EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a)
1647
+ {
1648
+ Packet8us pair, quad, octo;
1649
+
1650
+ //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
1651
+ pair = vec_min(a, vec_sld(a, a, 8));
1652
+
1653
+ //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
1654
+ quad = vec_min(pair, vec_sld(pair, pair, 4));
1655
+
1656
+ //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
1657
+ octo = vec_min(quad, vec_sld(quad, quad, 2));
1658
+ return pfirst(octo);
1659
+ }
1660
+
1661
+ template<> EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c& a)
1662
+ {
1663
+ Packet16c pair, quad, octo, result;
1664
+
1665
+ pair = vec_min(a, vec_sld(a, a, 8));
1666
+ quad = vec_min(pair, vec_sld(pair, pair, 4));
1667
+ octo = vec_min(quad, vec_sld(quad, quad, 2));
1668
+ result = vec_min(octo, vec_sld(octo, octo, 1));
1669
+
1670
+ return pfirst(result);
1671
+ }
1672
+
1673
+ template<> EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet16uc& a)
1674
+ {
1675
+ Packet16uc pair, quad, octo, result;
1676
+
1677
+ pair = vec_min(a, vec_sld(a, a, 8));
1678
+ quad = vec_min(pair, vec_sld(pair, pair, 4));
1679
+ octo = vec_min(quad, vec_sld(quad, quad, 2));
1680
+ result = vec_min(octo, vec_sld(octo, octo, 1));
1681
+
1682
+ return pfirst(result);
1683
+ }
1684
+ // max
1685
+ template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a)
1686
+ {
1687
+ Packet b, res;
1688
+ b = vec_max(a, vec_sld(a, a, 8));
1689
+ res = vec_max(b, vec_sld(b, b, 4));
1690
+ return pfirst(res);
1691
+ }
1692
+
1693
+ template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
1694
+ {
1695
+ return predux_max4<Packet4f>(a);
1696
+ }
1697
+
1698
+ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
1699
+ {
1700
+ return predux_max4<Packet4i>(a);
1701
+ }
1702
+
1703
+ template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a)
1704
+ {
1705
+ float redux_even = predux_max<Packet4f>(Bf16ToF32Even(a));
1706
+ float redux_odd = predux_max<Packet4f>(Bf16ToF32Odd(a));
1707
+ float f32_result = (std::max)(redux_even, redux_odd);
1708
+ return bfloat16(f32_result);
1709
+ }
1710
+
1711
+ template<> EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a)
1712
+ {
1713
+ Packet8s pair, quad, octo;
1714
+
1715
+ //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
1716
+ pair = vec_max(a, vec_sld(a, a, 8));
1717
+
1718
+ //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
1719
+ quad = vec_max(pair, vec_sld(pair, pair, 4));
1720
+
1721
+ //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
1722
+ octo = vec_max(quad, vec_sld(quad, quad, 2));
1723
+ return pfirst(octo);
1724
+ }
1725
+
1726
+ template<> EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a)
602
1727
  {
603
- Packet4i v[4], sum[4];
1728
+ Packet8us pair, quad, octo;
1729
+
1730
+ //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
1731
+ pair = vec_max(a, vec_sld(a, a, 8));
604
1732
 
605
- // It's easier and faster to transpose then add as columns
606
- // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
607
- // Do the transpose, first set of moves
608
- v[0] = vec_mergeh(vecs[0], vecs[2]);
609
- v[1] = vec_mergel(vecs[0], vecs[2]);
610
- v[2] = vec_mergeh(vecs[1], vecs[3]);
611
- v[3] = vec_mergel(vecs[1], vecs[3]);
612
- // Get the resulting vectors
613
- sum[0] = vec_mergeh(v[0], v[2]);
614
- sum[1] = vec_mergel(v[0], v[2]);
615
- sum[2] = vec_mergeh(v[1], v[3]);
616
- sum[3] = vec_mergel(v[1], v[3]);
1733
+ //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
1734
+ quad = vec_max(pair, vec_sld(pair, pair, 4));
617
1735
 
618
- // Now do the summation:
619
- // Lines 0+1
620
- sum[0] = sum[0] + sum[1];
621
- // Lines 2+3
622
- sum[1] = sum[2] + sum[3];
623
- // Add the results
624
- sum[0] = sum[0] + sum[1];
625
-
626
- return sum[0];
1736
+ //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
1737
+ octo = vec_max(quad, vec_sld(quad, quad, 2));
1738
+ return pfirst(octo);
627
1739
  }
628
1740
 
629
- // Other reduction functions:
630
- // mul
631
- template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
1741
+ template<> EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c& a)
632
1742
  {
633
- Packet4f prod;
634
- prod = pmul(a, vec_sld(a, a, 8));
635
- return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
1743
+ Packet16c pair, quad, octo, result;
1744
+
1745
+ pair = vec_max(a, vec_sld(a, a, 8));
1746
+ quad = vec_max(pair, vec_sld(pair, pair, 4));
1747
+ octo = vec_max(quad, vec_sld(quad, quad, 2));
1748
+ result = vec_max(octo, vec_sld(octo, octo, 1));
1749
+
1750
+ return pfirst(result);
636
1751
  }
637
1752
 
638
- template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
1753
+ template<> EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet16uc& a)
639
1754
  {
640
- EIGEN_ALIGN16 int aux[4];
641
- pstore(aux, a);
642
- return aux[0] * aux[1] * aux[2] * aux[3];
1755
+ Packet16uc pair, quad, octo, result;
1756
+
1757
+ pair = vec_max(a, vec_sld(a, a, 8));
1758
+ quad = vec_max(pair, vec_sld(pair, pair, 4));
1759
+ octo = vec_max(quad, vec_sld(quad, quad, 2));
1760
+ result = vec_max(octo, vec_sld(octo, octo, 1));
1761
+
1762
+ return pfirst(result);
643
1763
  }
644
1764
 
645
- // min
646
- template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
1765
+ template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
647
1766
  {
648
- Packet4f b, res;
649
- b = vec_min(a, vec_sld(a, a, 8));
650
- res = vec_min(b, vec_sld(b, b, 4));
651
- return pfirst(res);
1767
+ return vec_any_ne(x, pzero(x));
652
1768
  }
653
1769
 
654
- template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
655
- {
656
- Packet4i b, res;
657
- b = vec_min(a, vec_sld(a, a, 8));
658
- res = vec_min(b, vec_sld(b, b, 4));
659
- return pfirst(res);
1770
+ template <typename T> EIGEN_DEVICE_FUNC inline void
1771
+ ptranpose_common(PacketBlock<T,4>& kernel){
1772
+ T t0, t1, t2, t3;
1773
+ t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1774
+ t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1775
+ t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1776
+ t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1777
+ kernel.packet[0] = vec_mergeh(t0, t2);
1778
+ kernel.packet[1] = vec_mergel(t0, t2);
1779
+ kernel.packet[2] = vec_mergeh(t1, t3);
1780
+ kernel.packet[3] = vec_mergel(t1, t3);
660
1781
  }
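The merge network above is a 4x4 transpose: the first round of vec_mergeh/vec_mergel interleaves rows 0/2 and 1/3, and the second round interleaves those intermediates so that packet[j] ends up holding element j of every input row. A plain scalar transpose has the same effect (transpose4x4 is an illustrative name, not an Eigen identifier):

    inline void transpose4x4(float m[4][4]) {
      for (int r = 0; r < 4; ++r)
        for (int c = r + 1; c < 4; ++c) {
          const float t = m[r][c];  // after the swap, m[j][i] holds the old m[i][j],
          m[r][c] = m[c][r];        // which is exactly what the two merge rounds
          m[c][r] = t;              // compute for the four packets in the kernel
        }
    }
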
661
1782
 
662
- // max
663
- template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
664
- {
665
- Packet4f b, res;
666
- b = vec_max(a, vec_sld(a, a, 8));
667
- res = vec_max(b, vec_sld(b, b, 4));
668
- return pfirst(res);
1783
+ EIGEN_DEVICE_FUNC inline void
1784
+ ptranspose(PacketBlock<Packet4f,4>& kernel) {
1785
+ ptranpose_common<Packet4f>(kernel);
669
1786
  }
670
1787
 
671
- template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
672
- {
673
- Packet4i b, res;
674
- b = vec_max(a, vec_sld(a, a, 8));
675
- res = vec_max(b, vec_sld(b, b, 4));
676
- return pfirst(res);
1788
+ EIGEN_DEVICE_FUNC inline void
1789
+ ptranspose(PacketBlock<Packet4i,4>& kernel) {
1790
+ ptranpose_common<Packet4i>(kernel);
677
1791
  }
678
1792
 
679
- template<int Offset>
680
- struct palign_impl<Offset,Packet4f>
681
- {
682
- static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
683
- {
684
- #ifdef _BIG_ENDIAN
685
- switch (Offset % 4) {
686
- case 1:
687
- first = vec_sld(first, second, 4); break;
688
- case 2:
689
- first = vec_sld(first, second, 8); break;
690
- case 3:
691
- first = vec_sld(first, second, 12); break;
692
- }
693
- #else
694
- switch (Offset % 4) {
695
- case 1:
696
- first = vec_sld(second, first, 12); break;
697
- case 2:
698
- first = vec_sld(second, first, 8); break;
699
- case 3:
700
- first = vec_sld(second, first, 4); break;
701
- }
702
- #endif
703
- }
704
- };
1793
+ EIGEN_DEVICE_FUNC inline void
1794
+ ptranspose(PacketBlock<Packet8s,4>& kernel) {
1795
+ Packet8s t0, t1, t2, t3;
1796
+ t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1797
+ t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1798
+ t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1799
+ t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1800
+ kernel.packet[0] = vec_mergeh(t0, t2);
1801
+ kernel.packet[1] = vec_mergel(t0, t2);
1802
+ kernel.packet[2] = vec_mergeh(t1, t3);
1803
+ kernel.packet[3] = vec_mergel(t1, t3);
1804
+ }
1805
+
1806
+ EIGEN_DEVICE_FUNC inline void
1807
+ ptranspose(PacketBlock<Packet8us,4>& kernel) {
1808
+ Packet8us t0, t1, t2, t3;
1809
+ t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1810
+ t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1811
+ t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1812
+ t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1813
+ kernel.packet[0] = vec_mergeh(t0, t2);
1814
+ kernel.packet[1] = vec_mergel(t0, t2);
1815
+ kernel.packet[2] = vec_mergeh(t1, t3);
1816
+ kernel.packet[3] = vec_mergel(t1, t3);
1817
+ }
705
1818
 
706
- template<int Offset>
707
- struct palign_impl<Offset,Packet4i>
708
- {
709
- static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
710
- {
711
- #ifdef _BIG_ENDIAN
712
- switch (Offset % 4) {
713
- case 1:
714
- first = vec_sld(first, second, 4); break;
715
- case 2:
716
- first = vec_sld(first, second, 8); break;
717
- case 3:
718
- first = vec_sld(first, second, 12); break;
719
- }
720
- #else
721
- switch (Offset % 4) {
722
- case 1:
723
- first = vec_sld(second, first, 12); break;
724
- case 2:
725
- first = vec_sld(second, first, 8); break;
726
- case 3:
727
- first = vec_sld(second, first, 4); break;
728
- }
729
- #endif
730
- }
731
- };
732
1819
 
733
1820
  EIGEN_DEVICE_FUNC inline void
734
- ptranspose(PacketBlock<Packet4f,4>& kernel) {
735
- Packet4f t0, t1, t2, t3;
1821
+ ptranspose(PacketBlock<Packet8bf,4>& kernel) {
1822
+ Packet8us t0, t1, t2, t3;
1823
+
1824
+ t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val);
1825
+ t1 = vec_mergel(kernel.packet[0].m_val, kernel.packet[2].m_val);
1826
+ t2 = vec_mergeh(kernel.packet[1].m_val, kernel.packet[3].m_val);
1827
+ t3 = vec_mergel(kernel.packet[1].m_val, kernel.packet[3].m_val);
1828
+ kernel.packet[0] = vec_mergeh(t0, t2);
1829
+ kernel.packet[1] = vec_mergel(t0, t2);
1830
+ kernel.packet[2] = vec_mergeh(t1, t3);
1831
+ kernel.packet[3] = vec_mergel(t1, t3);
1832
+ }
1833
+
1834
+ EIGEN_DEVICE_FUNC inline void
1835
+ ptranspose(PacketBlock<Packet16c,4>& kernel) {
1836
+ Packet16c t0, t1, t2, t3;
736
1837
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
737
1838
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
738
1839
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
@@ -743,9 +1844,10 @@ ptranspose(PacketBlock<Packet4f,4>& kernel) {
743
1844
  kernel.packet[3] = vec_mergel(t1, t3);
744
1845
  }
745
1846
 
1847
+
746
1848
  EIGEN_DEVICE_FUNC inline void
747
- ptranspose(PacketBlock<Packet4i,4>& kernel) {
748
- Packet4i t0, t1, t2, t3;
1849
+ ptranspose(PacketBlock<Packet16uc,4>& kernel) {
1850
+ Packet16uc t0, t1, t2, t3;
749
1851
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
750
1852
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
751
1853
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
@@ -756,18 +1858,398 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
756
1858
  kernel.packet[3] = vec_mergel(t1, t3);
757
1859
  }
758
1860
 
759
- template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+ EIGEN_DEVICE_FUNC inline void
+ ptranspose(PacketBlock<Packet8s,8>& kernel) {
+ Packet8s v[8], sum[8];
+
+ v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
+ v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
+ v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
+ v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
+ v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
+ v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
+ v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
+ v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
+ sum[0] = vec_mergeh(v[0], v[4]);
+ sum[1] = vec_mergel(v[0], v[4]);
+ sum[2] = vec_mergeh(v[1], v[5]);
+ sum[3] = vec_mergel(v[1], v[5]);
+ sum[4] = vec_mergeh(v[2], v[6]);
+ sum[5] = vec_mergel(v[2], v[6]);
+ sum[6] = vec_mergeh(v[3], v[7]);
+ sum[7] = vec_mergel(v[3], v[7]);
+
+ kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
+ kernel.packet[1] = vec_mergel(sum[0], sum[4]);
+ kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
+ kernel.packet[3] = vec_mergel(sum[1], sum[5]);
+ kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
+ kernel.packet[5] = vec_mergel(sum[2], sum[6]);
+ kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
+ kernel.packet[7] = vec_mergel(sum[3], sum[7]);
+ }
+
+ EIGEN_DEVICE_FUNC inline void
+ ptranspose(PacketBlock<Packet8us,8>& kernel) {
+ Packet8us v[8], sum[8];
+
+ v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
+ v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
+ v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
+ v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
+ v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
+ v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
+ v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
+ v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
+ sum[0] = vec_mergeh(v[0], v[4]);
+ sum[1] = vec_mergel(v[0], v[4]);
+ sum[2] = vec_mergeh(v[1], v[5]);
+ sum[3] = vec_mergel(v[1], v[5]);
+ sum[4] = vec_mergeh(v[2], v[6]);
+ sum[5] = vec_mergel(v[2], v[6]);
+ sum[6] = vec_mergeh(v[3], v[7]);
+ sum[7] = vec_mergel(v[3], v[7]);
+
+ kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
+ kernel.packet[1] = vec_mergel(sum[0], sum[4]);
+ kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
+ kernel.packet[3] = vec_mergel(sum[1], sum[5]);
+ kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
+ kernel.packet[5] = vec_mergel(sum[2], sum[6]);
+ kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
+ kernel.packet[7] = vec_mergel(sum[3], sum[7]);
+ }
+
+ EIGEN_DEVICE_FUNC inline void
+ ptranspose(PacketBlock<Packet8bf,8>& kernel) {
+ Packet8bf v[8], sum[8];
+
+ v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val);
+ v[1] = vec_mergel(kernel.packet[0].m_val, kernel.packet[4].m_val);
+ v[2] = vec_mergeh(kernel.packet[1].m_val, kernel.packet[5].m_val);
+ v[3] = vec_mergel(kernel.packet[1].m_val, kernel.packet[5].m_val);
+ v[4] = vec_mergeh(kernel.packet[2].m_val, kernel.packet[6].m_val);
+ v[5] = vec_mergel(kernel.packet[2].m_val, kernel.packet[6].m_val);
+ v[6] = vec_mergeh(kernel.packet[3].m_val, kernel.packet[7].m_val);
+ v[7] = vec_mergel(kernel.packet[3].m_val, kernel.packet[7].m_val);
+ sum[0] = vec_mergeh(v[0].m_val, v[4].m_val);
+ sum[1] = vec_mergel(v[0].m_val, v[4].m_val);
+ sum[2] = vec_mergeh(v[1].m_val, v[5].m_val);
+ sum[3] = vec_mergel(v[1].m_val, v[5].m_val);
+ sum[4] = vec_mergeh(v[2].m_val, v[6].m_val);
+ sum[5] = vec_mergel(v[2].m_val, v[6].m_val);
+ sum[6] = vec_mergeh(v[3].m_val, v[7].m_val);
+ sum[7] = vec_mergel(v[3].m_val, v[7].m_val);
+
+ kernel.packet[0] = vec_mergeh(sum[0].m_val, sum[4].m_val);
+ kernel.packet[1] = vec_mergel(sum[0].m_val, sum[4].m_val);
+ kernel.packet[2] = vec_mergeh(sum[1].m_val, sum[5].m_val);
+ kernel.packet[3] = vec_mergel(sum[1].m_val, sum[5].m_val);
+ kernel.packet[4] = vec_mergeh(sum[2].m_val, sum[6].m_val);
+ kernel.packet[5] = vec_mergel(sum[2].m_val, sum[6].m_val);
+ kernel.packet[6] = vec_mergeh(sum[3].m_val, sum[7].m_val);
+ kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val);
+ }
+
+ EIGEN_DEVICE_FUNC inline void
+ ptranspose(PacketBlock<Packet16c,16>& kernel) {
+ Packet16c step1[16], step2[16], step3[16];
+
+ step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
+ step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
+ step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
+ step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
+ step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
+ step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
+ step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
+ step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
+ step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
+ step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
+ step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
+ step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
+ step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
+ step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
+ step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
+ step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
+
+ step2[0] = vec_mergeh(step1[0], step1[8]);
+ step2[1] = vec_mergel(step1[0], step1[8]);
+ step2[2] = vec_mergeh(step1[1], step1[9]);
+ step2[3] = vec_mergel(step1[1], step1[9]);
+ step2[4] = vec_mergeh(step1[2], step1[10]);
+ step2[5] = vec_mergel(step1[2], step1[10]);
+ step2[6] = vec_mergeh(step1[3], step1[11]);
+ step2[7] = vec_mergel(step1[3], step1[11]);
+ step2[8] = vec_mergeh(step1[4], step1[12]);
+ step2[9] = vec_mergel(step1[4], step1[12]);
+ step2[10] = vec_mergeh(step1[5], step1[13]);
+ step2[11] = vec_mergel(step1[5], step1[13]);
+ step2[12] = vec_mergeh(step1[6], step1[14]);
+ step2[13] = vec_mergel(step1[6], step1[14]);
+ step2[14] = vec_mergeh(step1[7], step1[15]);
+ step2[15] = vec_mergel(step1[7], step1[15]);
+
+ step3[0] = vec_mergeh(step2[0], step2[8]);
+ step3[1] = vec_mergel(step2[0], step2[8]);
+ step3[2] = vec_mergeh(step2[1], step2[9]);
+ step3[3] = vec_mergel(step2[1], step2[9]);
+ step3[4] = vec_mergeh(step2[2], step2[10]);
+ step3[5] = vec_mergel(step2[2], step2[10]);
+ step3[6] = vec_mergeh(step2[3], step2[11]);
+ step3[7] = vec_mergel(step2[3], step2[11]);
+ step3[8] = vec_mergeh(step2[4], step2[12]);
+ step3[9] = vec_mergel(step2[4], step2[12]);
+ step3[10] = vec_mergeh(step2[5], step2[13]);
+ step3[11] = vec_mergel(step2[5], step2[13]);
+ step3[12] = vec_mergeh(step2[6], step2[14]);
+ step3[13] = vec_mergel(step2[6], step2[14]);
+ step3[14] = vec_mergeh(step2[7], step2[15]);
+ step3[15] = vec_mergel(step2[7], step2[15]);
+
+ kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
+ kernel.packet[1] = vec_mergel(step3[0], step3[8]);
+ kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
+ kernel.packet[3] = vec_mergel(step3[1], step3[9]);
+ kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
+ kernel.packet[5] = vec_mergel(step3[2], step3[10]);
+ kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
+ kernel.packet[7] = vec_mergel(step3[3], step3[11]);
+ kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
+ kernel.packet[9] = vec_mergel(step3[4], step3[12]);
+ kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
+ kernel.packet[11] = vec_mergel(step3[5], step3[13]);
+ kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
+ kernel.packet[13] = vec_mergel(step3[6], step3[14]);
+ kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
+ kernel.packet[15] = vec_mergel(step3[7], step3[15]);
+ }
+
+ EIGEN_DEVICE_FUNC inline void
+ ptranspose(PacketBlock<Packet16uc,16>& kernel) {
+ Packet16uc step1[16], step2[16], step3[16];
+
+ step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
+ step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
+ step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
+ step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
+ step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
+ step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
+ step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
+ step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
+ step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
+ step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
+ step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
+ step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
+ step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
+ step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
+ step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
+ step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
+
+ step2[0] = vec_mergeh(step1[0], step1[8]);
+ step2[1] = vec_mergel(step1[0], step1[8]);
+ step2[2] = vec_mergeh(step1[1], step1[9]);
+ step2[3] = vec_mergel(step1[1], step1[9]);
+ step2[4] = vec_mergeh(step1[2], step1[10]);
+ step2[5] = vec_mergel(step1[2], step1[10]);
+ step2[6] = vec_mergeh(step1[3], step1[11]);
+ step2[7] = vec_mergel(step1[3], step1[11]);
+ step2[8] = vec_mergeh(step1[4], step1[12]);
+ step2[9] = vec_mergel(step1[4], step1[12]);
+ step2[10] = vec_mergeh(step1[5], step1[13]);
+ step2[11] = vec_mergel(step1[5], step1[13]);
+ step2[12] = vec_mergeh(step1[6], step1[14]);
+ step2[13] = vec_mergel(step1[6], step1[14]);
+ step2[14] = vec_mergeh(step1[7], step1[15]);
+ step2[15] = vec_mergel(step1[7], step1[15]);
+
+ step3[0] = vec_mergeh(step2[0], step2[8]);
+ step3[1] = vec_mergel(step2[0], step2[8]);
+ step3[2] = vec_mergeh(step2[1], step2[9]);
+ step3[3] = vec_mergel(step2[1], step2[9]);
+ step3[4] = vec_mergeh(step2[2], step2[10]);
+ step3[5] = vec_mergel(step2[2], step2[10]);
+ step3[6] = vec_mergeh(step2[3], step2[11]);
+ step3[7] = vec_mergel(step2[3], step2[11]);
+ step3[8] = vec_mergeh(step2[4], step2[12]);
+ step3[9] = vec_mergel(step2[4], step2[12]);
+ step3[10] = vec_mergeh(step2[5], step2[13]);
+ step3[11] = vec_mergel(step2[5], step2[13]);
+ step3[12] = vec_mergeh(step2[6], step2[14]);
+ step3[13] = vec_mergel(step2[6], step2[14]);
+ step3[14] = vec_mergeh(step2[7], step2[15]);
+ step3[15] = vec_mergel(step2[7], step2[15]);
+
+ kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
+ kernel.packet[1] = vec_mergel(step3[0], step3[8]);
+ kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
+ kernel.packet[3] = vec_mergel(step3[1], step3[9]);
+ kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
+ kernel.packet[5] = vec_mergel(step3[2], step3[10]);
+ kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
+ kernel.packet[7] = vec_mergel(step3[3], step3[11]);
+ kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
+ kernel.packet[9] = vec_mergel(step3[4], step3[12]);
+ kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
+ kernel.packet[11] = vec_mergel(step3[5], step3[13]);
+ kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
+ kernel.packet[13] = vec_mergel(step3[6], step3[14]);
+ kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
+ kernel.packet[15] = vec_mergel(step3[7], step3[15]);
+ }
+
+ template<typename Packet> EIGEN_STRONG_INLINE
+ Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
  Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
  return vec_sel(elsePacket, thenPacket, mask);
  }

+ template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+ return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket);
+ }
+
  template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
- Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
- Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
+ return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket);
+ }
+
+ template<> EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket, const Packet8s& elsePacket) {
+ Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+ ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
+ Packet8us mask = reinterpret_cast<Packet8us>(vec_cmpeq(select, p8us_ONE));
+ Packet8s result = vec_sel(elsePacket, thenPacket, mask);
+ return result;
+ }
+
+ template<> EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket, const Packet8us& elsePacket) {
+ Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+ ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
+ Packet8us mask = reinterpret_cast<Packet8us>(vec_cmpeq(reinterpret_cast<Packet8us>(select), p8us_ONE));
+ return vec_sel(elsePacket, thenPacket, mask);
+ }
+
+ template<> EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket, const Packet8bf& elsePacket) {
+ return pblend<Packet8us>(ifPacket, thenPacket, elsePacket);
+ }
+
+ template<> EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket, const Packet16c& elsePacket) {
+ Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+ ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
+ ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
+ ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
+
+ Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE));
+ return vec_sel(elsePacket, thenPacket, mask);
+ }
+
+ template<> EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket, const Packet16uc& elsePacket) {
+ Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+ ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
+ ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
+ ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
+
+ Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE));
  return vec_sel(elsePacket, thenPacket, mask);
  }

+ template <>
+ struct type_casting_traits<float, int> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+ };
+
+ template <>
+ struct type_casting_traits<int, float> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+ };
+
+ template <>
+ struct type_casting_traits<bfloat16, unsigned short int> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+ };
+
+ template <>
+ struct type_casting_traits<unsigned short int, bfloat16> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+ };
+
+ template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+ return vec_cts(a,0);
+ }
+
+ template<> EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f, Packet4ui>(const Packet4f& a) {
+ return vec_ctu(a,0);
+ }
+
+ template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+ return vec_ctf(a,0);
+ }
+
+ template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui, Packet4f>(const Packet4ui& a) {
+ return vec_ctf(a,0);
+ }
+
+ template<> EIGEN_STRONG_INLINE Packet8us pcast<Packet8bf, Packet8us>(const Packet8bf& a) {
+ Packet4f float_even = Bf16ToF32Even(a);
+ Packet4f float_odd = Bf16ToF32Odd(a);
+ Packet4ui int_even = pcast<Packet4f, Packet4ui>(float_even);
+ Packet4ui int_odd = pcast<Packet4f, Packet4ui>(float_odd);
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF);
+ Packet4ui low_even = pand<Packet4ui>(int_even, p4ui_low_mask);
+ Packet4ui low_odd = pand<Packet4ui>(int_odd, p4ui_low_mask);
+
+ //Check values that are bigger than USHRT_MAX (0xFFFF)
+ Packet4bi overflow_selector;
+ if(vec_any_gt(int_even, p4ui_low_mask)){
+ overflow_selector = vec_cmpgt(int_even, p4ui_low_mask);
+ low_even = vec_sel(low_even, p4ui_low_mask, overflow_selector);
+ }
+ if(vec_any_gt(int_odd, p4ui_low_mask)){
+ overflow_selector = vec_cmpgt(int_odd, p4ui_low_mask);
+ low_odd = vec_sel(low_even, p4ui_low_mask, overflow_selector);
+ }
+
+ low_odd = plogical_shift_left<16>(low_odd);
+
+ Packet4ui int_final = por<Packet4ui>(low_even, low_odd);
+ return reinterpret_cast<Packet8us>(int_final);
+ }
+
+ template<> EIGEN_STRONG_INLINE Packet8bf pcast<Packet8us, Packet8bf>(const Packet8us& a) {
+ //short -> int -> float -> bfloat16
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF);
+ Packet4ui int_cast = reinterpret_cast<Packet4ui>(a);
+ Packet4ui int_even = pand<Packet4ui>(int_cast, p4ui_low_mask);
+ Packet4ui int_odd = plogical_shift_right<16>(int_cast);
+ Packet4f float_even = pcast<Packet4ui, Packet4f>(int_even);
+ Packet4f float_odd = pcast<Packet4ui, Packet4f>(int_odd);
+ return F32ToBf16(float_even, float_odd);
+ }
+
+
+ template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
+ return reinterpret_cast<Packet4i>(a);
+ }
+
+ template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
+ return reinterpret_cast<Packet4f>(a);
+ }
+
+

  //---------- double ----------
  #ifdef __VSX__
@@ -782,9 +2264,12 @@ typedef __vector __bool long Packet2bl;

  static Packet2l p2l_ONE = { 1, 1 };
  static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
+ static Packet2ul p2ul_SIGN = { 0x8000000000000000ull, 0x8000000000000000ull };
+ static Packet2ul p2ul_PREV0DOT5 = { 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull };
  static Packet2d p2d_ONE = { 1.0, 1.0 };
  static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
- static Packet2d p2d_MZERO = { -0.0, -0.0 };
+ static Packet2d p2d_MZERO = { numext::bit_cast<double>(0x8000000000000000ull),
+ numext::bit_cast<double>(0x8000000000000000ull) };

  #ifdef _BIG_ENDIAN
  static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
@@ -792,16 +2277,9 @@ static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_c
  static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
  #endif

- template<int index> Packet2d vec_splat_dbl(Packet2d& a);
-
- template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a)
- {
- return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_HI));
- }
-
- template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a)
+ template<int index> Packet2d vec_splat_dbl(Packet2d& a)
  {
- return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_LO));
+ return vec_splat(a, index);
  }

  template<> struct packet_traits<double> : default_packet_traits
@@ -830,12 +2308,13 @@ template<> struct packet_traits<double> : default_packet_traits
  HasRound = 1,
  HasFloor = 1,
  HasCeil = 1,
+ HasRint = 1,
  HasNegate = 1,
  HasBlend = 1
  };
  };

- template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
+ template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; };

  inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
  {
@@ -863,21 +2342,13 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
  template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
  {
  EIGEN_DEBUG_ALIGNED_LOAD
- #ifdef __VSX__
- return vec_vsx_ld(0, from);
- #else
- return vec_ld(0, from);
- #endif
+ return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang
  }

  template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
  {
  EIGEN_DEBUG_ALIGNED_STORE
- #ifdef __VSX__
- vec_vsx_st(from, 0, to);
- #else
- vec_st(from, 0, to);
- #endif
+ vec_xst(from, 0, to);
  }

  template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
@@ -885,28 +2356,32 @@ template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
  return v;
  }

+ template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
+ Packet2l v = {static_cast<long long>(from), static_cast<long long>(from)};
+ return reinterpret_cast<Packet2d>(v);
+ }
+
  template<> EIGEN_STRONG_INLINE void
  pbroadcast4<Packet2d>(const double *a,
  Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
  {
- a1 = pload<Packet2d>(a);
- a0 = vec_splat_dbl<0>(a1);
- a1 = vec_splat_dbl<1>(a1);
- a3 = pload<Packet2d>(a+2);
- a2 = vec_splat_dbl<0>(a3);
- a3 = vec_splat_dbl<1>(a3);
+ //This way is faster than vec_splat (at least for doubles in Power 9)
+ a0 = pset1<Packet2d>(a[0]);
+ a1 = pset1<Packet2d>(a[1]);
+ a2 = pset1<Packet2d>(a[2]);
+ a3 = pset1<Packet2d>(a[3]);
  }

  template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
  {
- double EIGEN_ALIGN16 af[2];
+ EIGEN_ALIGN16 double af[2];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
  return pload<Packet2d>(af);
  }
  template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
  {
- double EIGEN_ALIGN16 af[2];
+ EIGEN_ALIGN16 double af[2];
  pstore<double>(af, from);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
@@ -930,6 +2405,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d&

  template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
  {
+ // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
  Packet2d ret;
  __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
  return ret;
@@ -937,11 +2413,20 @@ template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const

  template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
  {
+ // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
  Packet2d ret;
  __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
  return ret;
  }

+ template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmple(a,b)); }
+ template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmplt(a,b)); }
+ template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmpeq(a,b)); }
+ template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
+ Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a,b));
+ return vec_nor(c,c);
+ }
+
  template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }

  template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
@@ -950,14 +2435,34 @@ template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const

  template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }

- template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
+ template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
+ {
+ Packet2d t = vec_add(reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
+ Packet2d res;
+
+ __asm__("xvrdpiz %x0, %x1\n\t"
+ : "=&wa" (res)
+ : "wa" (t));
+
+ return res;
+ }
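For reference, the branch-free rounding added above amounts to the following scalar computation. This is only an illustrative sketch (round_half_away is a made-up name), not code from the patch: p2ul_PREV0DOT5 is the bit pattern of the largest double below 0.5, xvrdpiz truncates toward zero, and together they give round-half-away-from-zero without the double-rounding error that adding a plain 0.5 would cause for inputs just below 0.5.

#include <cmath>
#include <cstdio>

// Scalar sketch of the trick: add prev(0.5) with the sign of a,
// then truncate toward zero (the job xvrdpiz does on the vector side).
double round_half_away(double a) {
  const double prev_half = 0.49999999999999994;  // bits 0x3FDFFFFFFFFFFFFF, i.e. nextafter(0.5, 0.0)
  return std::trunc(a + std::copysign(prev_half, a));
}

int main() {
  std::printf("%g %g %g\n",
              round_half_away(2.5),                    // 3
              round_half_away(-2.5),                   // -3
              round_half_away(0.49999999999999994));   // 0 (a plain +0.5 would round this up to 1)
}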
  template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); }
  template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
+ template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a)
+ {
+ Packet2d res;
+
+ __asm__("xvrdpic %x0, %x1\n\t"
+ : "=&wa" (res)
+ : "wa" (a));
+
+ return res;
+ }

  template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
  {
- EIGEN_DEBUG_ALIGNED_LOAD
- return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from));
+ EIGEN_DEBUG_UNALIGNED_LOAD
+ return vec_xl(0, const_cast<double*>(from));
  }

  template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
@@ -970,13 +2475,13 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)

  template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
  {
- EIGEN_DEBUG_ALIGNED_STORE
- vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
+ EIGEN_DEBUG_UNALIGNED_STORE
+ vec_xst(from, 0, to);
  }

  template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }

- template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; }
+ template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }

  template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
  {
@@ -984,6 +2489,177 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
  }
  template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }

+ // VSX support varies between different compilers and even different
+ // versions of the same compiler. For gcc version >= 4.9.3, we can use
+ // vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use
+ // a slow version that works with older compilers.
+ // Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
+ // are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
+ template<>
+ inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x) {
+ #if EIGEN_GNUC_AT_LEAST(5, 4) || \
+ (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
+ return vec_cts(x, 0); // TODO: check clang version.
+ #else
+ double tmp[2];
+ memcpy(tmp, &x, sizeof(tmp));
+ Packet2l l = { static_cast<long long>(tmp[0]),
+ static_cast<long long>(tmp[1]) };
+ return l;
+ #endif
+ }
+
+ template<>
+ inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x) {
+ unsigned long long tmp[2];
+ memcpy(tmp, &x, sizeof(tmp));
+ Packet2d d = { static_cast<double>(tmp[0]),
+ static_cast<double>(tmp[1]) };
+ return d;
+ }
+
+
+ // Packet2l shifts.
+ // For POWER8 we simply use vec_sr/l.
+ //
+ // Things are more complicated for POWER7. There is actually a
+ // vec_xxsxdi intrinsic but it is not supported by some gcc versions.
+ // So we need to shift by N % 32 and rearrange bytes.
+ #ifdef __POWER8_VECTOR__
+
+ template<int N>
+ EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
+ const Packet2ul shift = { N, N };
+ return vec_sl(a, shift);
+ }
+
+ template<int N>
+ EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
+ const Packet2ul shift = { N, N };
+ return vec_sr(a, shift);
+ }
+
+ #else
+
+ // Shifts [A, B, C, D] to [B, 0, D, 0].
+ // Used to implement left shifts for Packet2l.
+ EIGEN_ALWAYS_INLINE Packet4i shift_even_left(const Packet4i& a) {
+ static const Packet16uc perm = {
+ 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
+ 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
+ #ifdef _BIG_ENDIAN
+ return vec_perm(p4i_ZERO, a, perm);
+ #else
+ return vec_perm(a, p4i_ZERO, perm);
+ #endif
+ }
+
+ // Shifts [A, B, C, D] to [0, A, 0, C].
+ // Used to implement right shifts for Packet2l.
+ EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(const Packet4i& a) {
+ static const Packet16uc perm = {
+ 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+ 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b };
+ #ifdef _BIG_ENDIAN
+ return vec_perm(p4i_ZERO, a, perm);
+ #else
+ return vec_perm(a, p4i_ZERO, perm);
+ #endif
+ }
+
+ template<int N, typename EnableIf = void>
+ struct plogical_shift_left_impl;
+
+ template<int N>
+ struct plogical_shift_left_impl<N, typename enable_if<(N < 32) && (N >= 0)>::type> {
+ static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+ static const unsigned n = static_cast<unsigned>(N);
+ const Packet4ui shift = {n, n, n, n};
+ const Packet4i ai = reinterpret_cast<Packet4i>(a);
+ static const unsigned m = static_cast<unsigned>(32 - N);
+ const Packet4ui shift_right = {m, m, m, m};
+ const Packet4i out_hi = vec_sl(ai, shift);
+ const Packet4i out_lo = shift_even_left(vec_sr(ai, shift_right));
+ return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
+ }
+ };
+
+ template<int N>
+ struct plogical_shift_left_impl<N, typename enable_if<(N >= 32)>::type> {
+ static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+ static const unsigned m = static_cast<unsigned>(N - 32);
+ const Packet4ui shift = {m, m, m, m};
+ const Packet4i ai = reinterpret_cast<Packet4i>(a);
+ return reinterpret_cast<Packet2l>(shift_even_left(vec_sl(ai, shift)));
+ }
+ };
+
+ template<int N>
+ EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
+ return plogical_shift_left_impl<N>::run(a);
+ }
+
+ template<int N, typename EnableIf = void>
+ struct plogical_shift_right_impl;
+
+ template<int N>
+ struct plogical_shift_right_impl<N, typename enable_if<(N < 32) && (N >= 0)>::type> {
+ static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+ static const unsigned n = static_cast<unsigned>(N);
+ const Packet4ui shift = {n, n, n, n};
+ const Packet4i ai = reinterpret_cast<Packet4i>(a);
+ static const unsigned m = static_cast<unsigned>(32 - N);
+ const Packet4ui shift_left = {m, m, m, m};
+ const Packet4i out_lo = vec_sr(ai, shift);
+ const Packet4i out_hi = shift_odd_right(vec_sl(ai, shift_left));
+ return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
+ }
+ };
+
+ template<int N>
+ struct plogical_shift_right_impl<N, typename enable_if<(N >= 32)>::type> {
+ static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+ static const unsigned m = static_cast<unsigned>(N - 32);
+ const Packet4ui shift = {m, m, m, m};
+ const Packet4i ai = reinterpret_cast<Packet4i>(a);
+ return reinterpret_cast<Packet2l>(shift_odd_right(vec_sr(ai, shift)));
+ }
+ };
+
+ template<int N>
+ EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
+ return plogical_shift_right_impl<N>::run(a);
+ }
+ #endif
+
+ template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+ // Clamp exponent to [-2099, 2099]
+ const Packet2d max_exponent = pset1<Packet2d>(2099.0);
+ const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+
+ // Split 2^e into four factors and multiply:
+ const Packet2l bias = { 1023, 1023 };
+ Packet2l b = plogical_shift_right<2>(e); // floor(e/4)
+ Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
+ Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
+ b = psub(psub(psub(e, b), b), b); // e - 3b
+ c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); // 2^(e - 3b)
+ out = pmul(out, c); // a * 2^e
+ return out;
+ }
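The comments in pldexp above describe the idea; as a purely illustrative scalar sketch (exp2i and ldexp_split are hypothetical helpers, not part of the patch), the factor-splitting looks like this: because each power of two is built directly in the exponent field, every individual factor must stay in the normal range, so the clamped exponent is split into four pieces that are multiplied in one at a time.

#include <cstdint>
#include <cstring>
#include <cstdio>

// 2^n written straight into the biased-exponent field; valid for n in [-1022, 1023].
double exp2i(long long n) {
  const uint64_t bits = static_cast<uint64_t>(n + 1023) << 52;
  double d;
  std::memcpy(&d, &bits, sizeof d);
  return d;
}

// Scalar analogue of the splitting: with e clamped to [-2099, 2099], pick b ~ e/4
// and compute a * 2^b * 2^b * 2^b * 2^(e - 3b); every factor is representable
// even when 2^e itself is not.
double ldexp_split(double a, long long e) {
  const long long b = e / 4;
  const double out = a * exp2i(b) * exp2i(b) * exp2i(b);
  return out * exp2i(e - 3 * b);
}

int main() {
  // Scale 2^-1000 up by 2^1500: the result 2^500 is fine, although 2^1500 alone would overflow.
  std::printf("%g\n", ldexp_split(exp2i(-1000), 1500));  // ~3.27339e+150
}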
+
+
+ // Extract exponent without existence of Packet2l.
+ template<>
+ EIGEN_STRONG_INLINE
+ Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
+ return pcast<Packet2l, Packet2d>(plogical_shift_right<52>(reinterpret_cast<Packet2l>(pabs(a))));
+ }
+
+ template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d> (const Packet2d& a, Packet2d& exponent) {
+ return pfrexp_generic(a, exponent);
+ }
+
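In the same spirit, the biased-exponent extraction above has a simple scalar counterpart (biased_exponent is an illustrative name, not from the patch): the top 11 bits of |a| are the IEEE-754 biased exponent, and returning them as a double lets the generic pfrexp code keep working on Packet2d values.

#include <cstdint>
#include <cstring>
#include <cmath>
#include <cstdio>

// Scalar sketch of the biased-exponent extraction for one double:
// drop the sign, then shift the 52 mantissa bits away.
double biased_exponent(double a) {
  const double abs_a = std::fabs(a);
  uint64_t bits;
  std::memcpy(&bits, &abs_a, sizeof bits);
  return static_cast<double>(bits >> 52);
}

int main() {
  std::printf("%g %g\n", biased_exponent(1.0), biased_exponent(8.0));  // 1023 1026
}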
  template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
  {
  Packet2d b, sum;
@@ -992,20 +2668,6 @@ template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
  return pfirst<Packet2d>(sum);
  }

- template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
- {
- Packet2d v[2], sum;
- v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8));
- v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8));
-
- #ifdef _BIG_ENDIAN
- sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8));
- #else
- sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[1]), reinterpret_cast<Packet4f>(v[0]), 8));
- #endif
-
- return sum;
- }
  // Other reduction functions:
  // mul
  template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
@@ -1025,20 +2687,6 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
  return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
  }

- template<int Offset>
- struct palign_impl<Offset,Packet2d>
- {
- static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
- {
- if (Offset == 1)
- #ifdef _BIG_ENDIAN
- first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(first), reinterpret_cast<Packet4ui>(second), 8));
- #else
- first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(second), reinterpret_cast<Packet4ui>(first), 8));
- #endif
- }
- };
-
  EIGEN_DEVICE_FUNC inline void
  ptranspose(PacketBlock<Packet2d,2>& kernel) {
  Packet2d t0, t1;
@@ -1053,6 +2701,8 @@ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, cons
  Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
  return vec_sel(elsePacket, thenPacket, mask);
  }
+
+
  #endif // __VSX__
  } // end namespace internal