umappp 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (395) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +25 -0
  3. data/README.md +110 -0
  4. data/ext/umappp/extconf.rb +25 -0
  5. data/ext/umappp/numo.hpp +867 -0
  6. data/ext/umappp/umappp.cpp +225 -0
  7. data/lib/umappp/version.rb +5 -0
  8. data/lib/umappp.rb +41 -0
  9. data/vendor/Eigen/Cholesky +45 -0
  10. data/vendor/Eigen/CholmodSupport +48 -0
  11. data/vendor/Eigen/Core +384 -0
  12. data/vendor/Eigen/Dense +7 -0
  13. data/vendor/Eigen/Eigen +2 -0
  14. data/vendor/Eigen/Eigenvalues +60 -0
  15. data/vendor/Eigen/Geometry +59 -0
  16. data/vendor/Eigen/Householder +29 -0
  17. data/vendor/Eigen/IterativeLinearSolvers +48 -0
  18. data/vendor/Eigen/Jacobi +32 -0
  19. data/vendor/Eigen/KLUSupport +41 -0
  20. data/vendor/Eigen/LU +47 -0
  21. data/vendor/Eigen/MetisSupport +35 -0
  22. data/vendor/Eigen/OrderingMethods +70 -0
  23. data/vendor/Eigen/PaStiXSupport +49 -0
  24. data/vendor/Eigen/PardisoSupport +35 -0
  25. data/vendor/Eigen/QR +50 -0
  26. data/vendor/Eigen/QtAlignedMalloc +39 -0
  27. data/vendor/Eigen/SPQRSupport +34 -0
  28. data/vendor/Eigen/SVD +50 -0
  29. data/vendor/Eigen/Sparse +34 -0
  30. data/vendor/Eigen/SparseCholesky +37 -0
  31. data/vendor/Eigen/SparseCore +69 -0
  32. data/vendor/Eigen/SparseLU +50 -0
  33. data/vendor/Eigen/SparseQR +36 -0
  34. data/vendor/Eigen/StdDeque +27 -0
  35. data/vendor/Eigen/StdList +26 -0
  36. data/vendor/Eigen/StdVector +27 -0
  37. data/vendor/Eigen/SuperLUSupport +64 -0
  38. data/vendor/Eigen/UmfPackSupport +40 -0
  39. data/vendor/Eigen/src/Cholesky/LDLT.h +688 -0
  40. data/vendor/Eigen/src/Cholesky/LLT.h +558 -0
  41. data/vendor/Eigen/src/Cholesky/LLT_LAPACKE.h +99 -0
  42. data/vendor/Eigen/src/CholmodSupport/CholmodSupport.h +682 -0
  43. data/vendor/Eigen/src/Core/ArithmeticSequence.h +413 -0
  44. data/vendor/Eigen/src/Core/Array.h +417 -0
  45. data/vendor/Eigen/src/Core/ArrayBase.h +226 -0
  46. data/vendor/Eigen/src/Core/ArrayWrapper.h +209 -0
  47. data/vendor/Eigen/src/Core/Assign.h +90 -0
  48. data/vendor/Eigen/src/Core/AssignEvaluator.h +1010 -0
  49. data/vendor/Eigen/src/Core/Assign_MKL.h +178 -0
  50. data/vendor/Eigen/src/Core/BandMatrix.h +353 -0
  51. data/vendor/Eigen/src/Core/Block.h +448 -0
  52. data/vendor/Eigen/src/Core/BooleanRedux.h +162 -0
  53. data/vendor/Eigen/src/Core/CommaInitializer.h +164 -0
  54. data/vendor/Eigen/src/Core/ConditionEstimator.h +175 -0
  55. data/vendor/Eigen/src/Core/CoreEvaluators.h +1741 -0
  56. data/vendor/Eigen/src/Core/CoreIterators.h +132 -0
  57. data/vendor/Eigen/src/Core/CwiseBinaryOp.h +183 -0
  58. data/vendor/Eigen/src/Core/CwiseNullaryOp.h +1001 -0
  59. data/vendor/Eigen/src/Core/CwiseTernaryOp.h +197 -0
  60. data/vendor/Eigen/src/Core/CwiseUnaryOp.h +103 -0
  61. data/vendor/Eigen/src/Core/CwiseUnaryView.h +132 -0
  62. data/vendor/Eigen/src/Core/DenseBase.h +701 -0
  63. data/vendor/Eigen/src/Core/DenseCoeffsBase.h +685 -0
  64. data/vendor/Eigen/src/Core/DenseStorage.h +652 -0
  65. data/vendor/Eigen/src/Core/Diagonal.h +258 -0
  66. data/vendor/Eigen/src/Core/DiagonalMatrix.h +391 -0
  67. data/vendor/Eigen/src/Core/DiagonalProduct.h +28 -0
  68. data/vendor/Eigen/src/Core/Dot.h +318 -0
  69. data/vendor/Eigen/src/Core/EigenBase.h +160 -0
  70. data/vendor/Eigen/src/Core/ForceAlignedAccess.h +150 -0
  71. data/vendor/Eigen/src/Core/Fuzzy.h +155 -0
  72. data/vendor/Eigen/src/Core/GeneralProduct.h +465 -0
  73. data/vendor/Eigen/src/Core/GenericPacketMath.h +1040 -0
  74. data/vendor/Eigen/src/Core/GlobalFunctions.h +194 -0
  75. data/vendor/Eigen/src/Core/IO.h +258 -0
  76. data/vendor/Eigen/src/Core/IndexedView.h +237 -0
  77. data/vendor/Eigen/src/Core/Inverse.h +117 -0
  78. data/vendor/Eigen/src/Core/Map.h +171 -0
  79. data/vendor/Eigen/src/Core/MapBase.h +310 -0
  80. data/vendor/Eigen/src/Core/MathFunctions.h +2057 -0
  81. data/vendor/Eigen/src/Core/MathFunctionsImpl.h +200 -0
  82. data/vendor/Eigen/src/Core/Matrix.h +565 -0
  83. data/vendor/Eigen/src/Core/MatrixBase.h +547 -0
  84. data/vendor/Eigen/src/Core/NestByValue.h +85 -0
  85. data/vendor/Eigen/src/Core/NoAlias.h +109 -0
  86. data/vendor/Eigen/src/Core/NumTraits.h +335 -0
  87. data/vendor/Eigen/src/Core/PartialReduxEvaluator.h +232 -0
  88. data/vendor/Eigen/src/Core/PermutationMatrix.h +605 -0
  89. data/vendor/Eigen/src/Core/PlainObjectBase.h +1128 -0
  90. data/vendor/Eigen/src/Core/Product.h +191 -0
  91. data/vendor/Eigen/src/Core/ProductEvaluators.h +1179 -0
  92. data/vendor/Eigen/src/Core/Random.h +218 -0
  93. data/vendor/Eigen/src/Core/Redux.h +515 -0
  94. data/vendor/Eigen/src/Core/Ref.h +381 -0
  95. data/vendor/Eigen/src/Core/Replicate.h +142 -0
  96. data/vendor/Eigen/src/Core/Reshaped.h +454 -0
  97. data/vendor/Eigen/src/Core/ReturnByValue.h +119 -0
  98. data/vendor/Eigen/src/Core/Reverse.h +217 -0
  99. data/vendor/Eigen/src/Core/Select.h +164 -0
  100. data/vendor/Eigen/src/Core/SelfAdjointView.h +365 -0
  101. data/vendor/Eigen/src/Core/SelfCwiseBinaryOp.h +47 -0
  102. data/vendor/Eigen/src/Core/Solve.h +188 -0
  103. data/vendor/Eigen/src/Core/SolveTriangular.h +235 -0
  104. data/vendor/Eigen/src/Core/SolverBase.h +168 -0
  105. data/vendor/Eigen/src/Core/StableNorm.h +251 -0
  106. data/vendor/Eigen/src/Core/StlIterators.h +463 -0
  107. data/vendor/Eigen/src/Core/Stride.h +116 -0
  108. data/vendor/Eigen/src/Core/Swap.h +68 -0
  109. data/vendor/Eigen/src/Core/Transpose.h +464 -0
  110. data/vendor/Eigen/src/Core/Transpositions.h +386 -0
  111. data/vendor/Eigen/src/Core/TriangularMatrix.h +1001 -0
  112. data/vendor/Eigen/src/Core/VectorBlock.h +96 -0
  113. data/vendor/Eigen/src/Core/VectorwiseOp.h +784 -0
  114. data/vendor/Eigen/src/Core/Visitor.h +381 -0
  115. data/vendor/Eigen/src/Core/arch/AVX/Complex.h +372 -0
  116. data/vendor/Eigen/src/Core/arch/AVX/MathFunctions.h +228 -0
  117. data/vendor/Eigen/src/Core/arch/AVX/PacketMath.h +1574 -0
  118. data/vendor/Eigen/src/Core/arch/AVX/TypeCasting.h +115 -0
  119. data/vendor/Eigen/src/Core/arch/AVX512/Complex.h +422 -0
  120. data/vendor/Eigen/src/Core/arch/AVX512/MathFunctions.h +362 -0
  121. data/vendor/Eigen/src/Core/arch/AVX512/PacketMath.h +2303 -0
  122. data/vendor/Eigen/src/Core/arch/AVX512/TypeCasting.h +89 -0
  123. data/vendor/Eigen/src/Core/arch/AltiVec/Complex.h +417 -0
  124. data/vendor/Eigen/src/Core/arch/AltiVec/MathFunctions.h +90 -0
  125. data/vendor/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2937 -0
  126. data/vendor/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +221 -0
  127. data/vendor/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +629 -0
  128. data/vendor/Eigen/src/Core/arch/AltiVec/PacketMath.h +2711 -0
  129. data/vendor/Eigen/src/Core/arch/CUDA/Complex.h +258 -0
  130. data/vendor/Eigen/src/Core/arch/Default/BFloat16.h +700 -0
  131. data/vendor/Eigen/src/Core/arch/Default/ConjHelper.h +117 -0
  132. data/vendor/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1649 -0
  133. data/vendor/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +110 -0
  134. data/vendor/Eigen/src/Core/arch/Default/Half.h +942 -0
  135. data/vendor/Eigen/src/Core/arch/Default/Settings.h +49 -0
  136. data/vendor/Eigen/src/Core/arch/Default/TypeCasting.h +120 -0
  137. data/vendor/Eigen/src/Core/arch/GPU/MathFunctions.h +103 -0
  138. data/vendor/Eigen/src/Core/arch/GPU/PacketMath.h +1685 -0
  139. data/vendor/Eigen/src/Core/arch/GPU/TypeCasting.h +80 -0
  140. data/vendor/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  141. data/vendor/Eigen/src/Core/arch/MSA/Complex.h +648 -0
  142. data/vendor/Eigen/src/Core/arch/MSA/MathFunctions.h +387 -0
  143. data/vendor/Eigen/src/Core/arch/MSA/PacketMath.h +1233 -0
  144. data/vendor/Eigen/src/Core/arch/NEON/Complex.h +584 -0
  145. data/vendor/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +183 -0
  146. data/vendor/Eigen/src/Core/arch/NEON/MathFunctions.h +75 -0
  147. data/vendor/Eigen/src/Core/arch/NEON/PacketMath.h +4587 -0
  148. data/vendor/Eigen/src/Core/arch/NEON/TypeCasting.h +1419 -0
  149. data/vendor/Eigen/src/Core/arch/SSE/Complex.h +351 -0
  150. data/vendor/Eigen/src/Core/arch/SSE/MathFunctions.h +199 -0
  151. data/vendor/Eigen/src/Core/arch/SSE/PacketMath.h +1505 -0
  152. data/vendor/Eigen/src/Core/arch/SSE/TypeCasting.h +142 -0
  153. data/vendor/Eigen/src/Core/arch/SVE/MathFunctions.h +44 -0
  154. data/vendor/Eigen/src/Core/arch/SVE/PacketMath.h +752 -0
  155. data/vendor/Eigen/src/Core/arch/SVE/TypeCasting.h +49 -0
  156. data/vendor/Eigen/src/Core/arch/SYCL/InteropHeaders.h +232 -0
  157. data/vendor/Eigen/src/Core/arch/SYCL/MathFunctions.h +301 -0
  158. data/vendor/Eigen/src/Core/arch/SYCL/PacketMath.h +670 -0
  159. data/vendor/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +694 -0
  160. data/vendor/Eigen/src/Core/arch/SYCL/TypeCasting.h +85 -0
  161. data/vendor/Eigen/src/Core/arch/ZVector/Complex.h +426 -0
  162. data/vendor/Eigen/src/Core/arch/ZVector/MathFunctions.h +233 -0
  163. data/vendor/Eigen/src/Core/arch/ZVector/PacketMath.h +1060 -0
  164. data/vendor/Eigen/src/Core/functors/AssignmentFunctors.h +177 -0
  165. data/vendor/Eigen/src/Core/functors/BinaryFunctors.h +541 -0
  166. data/vendor/Eigen/src/Core/functors/NullaryFunctors.h +189 -0
  167. data/vendor/Eigen/src/Core/functors/StlFunctors.h +166 -0
  168. data/vendor/Eigen/src/Core/functors/TernaryFunctors.h +25 -0
  169. data/vendor/Eigen/src/Core/functors/UnaryFunctors.h +1131 -0
  170. data/vendor/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2645 -0
  171. data/vendor/Eigen/src/Core/products/GeneralMatrixMatrix.h +517 -0
  172. data/vendor/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +317 -0
  173. data/vendor/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +145 -0
  174. data/vendor/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +124 -0
  175. data/vendor/Eigen/src/Core/products/GeneralMatrixVector.h +518 -0
  176. data/vendor/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +136 -0
  177. data/vendor/Eigen/src/Core/products/Parallelizer.h +180 -0
  178. data/vendor/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +544 -0
  179. data/vendor/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +295 -0
  180. data/vendor/Eigen/src/Core/products/SelfadjointMatrixVector.h +262 -0
  181. data/vendor/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +118 -0
  182. data/vendor/Eigen/src/Core/products/SelfadjointProduct.h +133 -0
  183. data/vendor/Eigen/src/Core/products/SelfadjointRank2Update.h +94 -0
  184. data/vendor/Eigen/src/Core/products/TriangularMatrixMatrix.h +472 -0
  185. data/vendor/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +317 -0
  186. data/vendor/Eigen/src/Core/products/TriangularMatrixVector.h +350 -0
  187. data/vendor/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +255 -0
  188. data/vendor/Eigen/src/Core/products/TriangularSolverMatrix.h +337 -0
  189. data/vendor/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +167 -0
  190. data/vendor/Eigen/src/Core/products/TriangularSolverVector.h +148 -0
  191. data/vendor/Eigen/src/Core/util/BlasUtil.h +583 -0
  192. data/vendor/Eigen/src/Core/util/ConfigureVectorization.h +512 -0
  193. data/vendor/Eigen/src/Core/util/Constants.h +563 -0
  194. data/vendor/Eigen/src/Core/util/DisableStupidWarnings.h +106 -0
  195. data/vendor/Eigen/src/Core/util/ForwardDeclarations.h +322 -0
  196. data/vendor/Eigen/src/Core/util/IndexedViewHelper.h +186 -0
  197. data/vendor/Eigen/src/Core/util/IntegralConstant.h +272 -0
  198. data/vendor/Eigen/src/Core/util/MKL_support.h +137 -0
  199. data/vendor/Eigen/src/Core/util/Macros.h +1464 -0
  200. data/vendor/Eigen/src/Core/util/Memory.h +1163 -0
  201. data/vendor/Eigen/src/Core/util/Meta.h +812 -0
  202. data/vendor/Eigen/src/Core/util/NonMPL2.h +3 -0
  203. data/vendor/Eigen/src/Core/util/ReenableStupidWarnings.h +31 -0
  204. data/vendor/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  205. data/vendor/Eigen/src/Core/util/StaticAssert.h +221 -0
  206. data/vendor/Eigen/src/Core/util/SymbolicIndex.h +293 -0
  207. data/vendor/Eigen/src/Core/util/XprHelper.h +856 -0
  208. data/vendor/Eigen/src/Eigenvalues/ComplexEigenSolver.h +346 -0
  209. data/vendor/Eigen/src/Eigenvalues/ComplexSchur.h +462 -0
  210. data/vendor/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +91 -0
  211. data/vendor/Eigen/src/Eigenvalues/EigenSolver.h +622 -0
  212. data/vendor/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +418 -0
  213. data/vendor/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +226 -0
  214. data/vendor/Eigen/src/Eigenvalues/HessenbergDecomposition.h +374 -0
  215. data/vendor/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +158 -0
  216. data/vendor/Eigen/src/Eigenvalues/RealQZ.h +657 -0
  217. data/vendor/Eigen/src/Eigenvalues/RealSchur.h +558 -0
  218. data/vendor/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +77 -0
  219. data/vendor/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +904 -0
  220. data/vendor/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +87 -0
  221. data/vendor/Eigen/src/Eigenvalues/Tridiagonalization.h +561 -0
  222. data/vendor/Eigen/src/Geometry/AlignedBox.h +486 -0
  223. data/vendor/Eigen/src/Geometry/AngleAxis.h +247 -0
  224. data/vendor/Eigen/src/Geometry/EulerAngles.h +114 -0
  225. data/vendor/Eigen/src/Geometry/Homogeneous.h +501 -0
  226. data/vendor/Eigen/src/Geometry/Hyperplane.h +282 -0
  227. data/vendor/Eigen/src/Geometry/OrthoMethods.h +235 -0
  228. data/vendor/Eigen/src/Geometry/ParametrizedLine.h +232 -0
  229. data/vendor/Eigen/src/Geometry/Quaternion.h +870 -0
  230. data/vendor/Eigen/src/Geometry/Rotation2D.h +199 -0
  231. data/vendor/Eigen/src/Geometry/RotationBase.h +206 -0
  232. data/vendor/Eigen/src/Geometry/Scaling.h +188 -0
  233. data/vendor/Eigen/src/Geometry/Transform.h +1563 -0
  234. data/vendor/Eigen/src/Geometry/Translation.h +202 -0
  235. data/vendor/Eigen/src/Geometry/Umeyama.h +166 -0
  236. data/vendor/Eigen/src/Geometry/arch/Geometry_SIMD.h +168 -0
  237. data/vendor/Eigen/src/Householder/BlockHouseholder.h +110 -0
  238. data/vendor/Eigen/src/Householder/Householder.h +176 -0
  239. data/vendor/Eigen/src/Householder/HouseholderSequence.h +545 -0
  240. data/vendor/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +226 -0
  241. data/vendor/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +212 -0
  242. data/vendor/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +229 -0
  243. data/vendor/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +394 -0
  244. data/vendor/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +453 -0
  245. data/vendor/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +444 -0
  246. data/vendor/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +198 -0
  247. data/vendor/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +117 -0
  248. data/vendor/Eigen/src/Jacobi/Jacobi.h +483 -0
  249. data/vendor/Eigen/src/KLUSupport/KLUSupport.h +358 -0
  250. data/vendor/Eigen/src/LU/Determinant.h +117 -0
  251. data/vendor/Eigen/src/LU/FullPivLU.h +877 -0
  252. data/vendor/Eigen/src/LU/InverseImpl.h +432 -0
  253. data/vendor/Eigen/src/LU/PartialPivLU.h +624 -0
  254. data/vendor/Eigen/src/LU/PartialPivLU_LAPACKE.h +83 -0
  255. data/vendor/Eigen/src/LU/arch/InverseSize4.h +351 -0
  256. data/vendor/Eigen/src/MetisSupport/MetisSupport.h +137 -0
  257. data/vendor/Eigen/src/OrderingMethods/Amd.h +435 -0
  258. data/vendor/Eigen/src/OrderingMethods/Eigen_Colamd.h +1863 -0
  259. data/vendor/Eigen/src/OrderingMethods/Ordering.h +153 -0
  260. data/vendor/Eigen/src/PaStiXSupport/PaStiXSupport.h +678 -0
  261. data/vendor/Eigen/src/PardisoSupport/PardisoSupport.h +545 -0
  262. data/vendor/Eigen/src/QR/ColPivHouseholderQR.h +674 -0
  263. data/vendor/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +97 -0
  264. data/vendor/Eigen/src/QR/CompleteOrthogonalDecomposition.h +635 -0
  265. data/vendor/Eigen/src/QR/FullPivHouseholderQR.h +713 -0
  266. data/vendor/Eigen/src/QR/HouseholderQR.h +434 -0
  267. data/vendor/Eigen/src/QR/HouseholderQR_LAPACKE.h +68 -0
  268. data/vendor/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +335 -0
  269. data/vendor/Eigen/src/SVD/BDCSVD.h +1366 -0
  270. data/vendor/Eigen/src/SVD/JacobiSVD.h +812 -0
  271. data/vendor/Eigen/src/SVD/JacobiSVD_LAPACKE.h +91 -0
  272. data/vendor/Eigen/src/SVD/SVDBase.h +376 -0
  273. data/vendor/Eigen/src/SVD/UpperBidiagonalization.h +414 -0
  274. data/vendor/Eigen/src/SparseCholesky/SimplicialCholesky.h +697 -0
  275. data/vendor/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +174 -0
  276. data/vendor/Eigen/src/SparseCore/AmbiVector.h +378 -0
  277. data/vendor/Eigen/src/SparseCore/CompressedStorage.h +274 -0
  278. data/vendor/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +352 -0
  279. data/vendor/Eigen/src/SparseCore/MappedSparseMatrix.h +67 -0
  280. data/vendor/Eigen/src/SparseCore/SparseAssign.h +270 -0
  281. data/vendor/Eigen/src/SparseCore/SparseBlock.h +571 -0
  282. data/vendor/Eigen/src/SparseCore/SparseColEtree.h +206 -0
  283. data/vendor/Eigen/src/SparseCore/SparseCompressedBase.h +370 -0
  284. data/vendor/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +722 -0
  285. data/vendor/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +150 -0
  286. data/vendor/Eigen/src/SparseCore/SparseDenseProduct.h +342 -0
  287. data/vendor/Eigen/src/SparseCore/SparseDiagonalProduct.h +138 -0
  288. data/vendor/Eigen/src/SparseCore/SparseDot.h +98 -0
  289. data/vendor/Eigen/src/SparseCore/SparseFuzzy.h +29 -0
  290. data/vendor/Eigen/src/SparseCore/SparseMap.h +305 -0
  291. data/vendor/Eigen/src/SparseCore/SparseMatrix.h +1518 -0
  292. data/vendor/Eigen/src/SparseCore/SparseMatrixBase.h +398 -0
  293. data/vendor/Eigen/src/SparseCore/SparsePermutation.h +178 -0
  294. data/vendor/Eigen/src/SparseCore/SparseProduct.h +181 -0
  295. data/vendor/Eigen/src/SparseCore/SparseRedux.h +49 -0
  296. data/vendor/Eigen/src/SparseCore/SparseRef.h +397 -0
  297. data/vendor/Eigen/src/SparseCore/SparseSelfAdjointView.h +659 -0
  298. data/vendor/Eigen/src/SparseCore/SparseSolverBase.h +124 -0
  299. data/vendor/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +198 -0
  300. data/vendor/Eigen/src/SparseCore/SparseTranspose.h +92 -0
  301. data/vendor/Eigen/src/SparseCore/SparseTriangularView.h +189 -0
  302. data/vendor/Eigen/src/SparseCore/SparseUtil.h +186 -0
  303. data/vendor/Eigen/src/SparseCore/SparseVector.h +478 -0
  304. data/vendor/Eigen/src/SparseCore/SparseView.h +254 -0
  305. data/vendor/Eigen/src/SparseCore/TriangularSolver.h +315 -0
  306. data/vendor/Eigen/src/SparseLU/SparseLU.h +923 -0
  307. data/vendor/Eigen/src/SparseLU/SparseLUImpl.h +66 -0
  308. data/vendor/Eigen/src/SparseLU/SparseLU_Memory.h +226 -0
  309. data/vendor/Eigen/src/SparseLU/SparseLU_Structs.h +110 -0
  310. data/vendor/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +375 -0
  311. data/vendor/Eigen/src/SparseLU/SparseLU_Utils.h +80 -0
  312. data/vendor/Eigen/src/SparseLU/SparseLU_column_bmod.h +181 -0
  313. data/vendor/Eigen/src/SparseLU/SparseLU_column_dfs.h +179 -0
  314. data/vendor/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +107 -0
  315. data/vendor/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +280 -0
  316. data/vendor/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +126 -0
  317. data/vendor/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +130 -0
  318. data/vendor/Eigen/src/SparseLU/SparseLU_panel_bmod.h +223 -0
  319. data/vendor/Eigen/src/SparseLU/SparseLU_panel_dfs.h +258 -0
  320. data/vendor/Eigen/src/SparseLU/SparseLU_pivotL.h +137 -0
  321. data/vendor/Eigen/src/SparseLU/SparseLU_pruneL.h +136 -0
  322. data/vendor/Eigen/src/SparseLU/SparseLU_relax_snode.h +83 -0
  323. data/vendor/Eigen/src/SparseQR/SparseQR.h +758 -0
  324. data/vendor/Eigen/src/StlSupport/StdDeque.h +116 -0
  325. data/vendor/Eigen/src/StlSupport/StdList.h +106 -0
  326. data/vendor/Eigen/src/StlSupport/StdVector.h +131 -0
  327. data/vendor/Eigen/src/StlSupport/details.h +84 -0
  328. data/vendor/Eigen/src/SuperLUSupport/SuperLUSupport.h +1025 -0
  329. data/vendor/Eigen/src/UmfPackSupport/UmfPackSupport.h +642 -0
  330. data/vendor/Eigen/src/misc/Image.h +82 -0
  331. data/vendor/Eigen/src/misc/Kernel.h +79 -0
  332. data/vendor/Eigen/src/misc/RealSvd2x2.h +55 -0
  333. data/vendor/Eigen/src/misc/blas.h +440 -0
  334. data/vendor/Eigen/src/misc/lapack.h +152 -0
  335. data/vendor/Eigen/src/misc/lapacke.h +16292 -0
  336. data/vendor/Eigen/src/misc/lapacke_mangling.h +17 -0
  337. data/vendor/Eigen/src/plugins/ArrayCwiseBinaryOps.h +358 -0
  338. data/vendor/Eigen/src/plugins/ArrayCwiseUnaryOps.h +696 -0
  339. data/vendor/Eigen/src/plugins/BlockMethods.h +1442 -0
  340. data/vendor/Eigen/src/plugins/CommonCwiseBinaryOps.h +115 -0
  341. data/vendor/Eigen/src/plugins/CommonCwiseUnaryOps.h +177 -0
  342. data/vendor/Eigen/src/plugins/IndexedViewMethods.h +262 -0
  343. data/vendor/Eigen/src/plugins/MatrixCwiseBinaryOps.h +152 -0
  344. data/vendor/Eigen/src/plugins/MatrixCwiseUnaryOps.h +95 -0
  345. data/vendor/Eigen/src/plugins/ReshapedMethods.h +149 -0
  346. data/vendor/aarand/aarand.hpp +114 -0
  347. data/vendor/annoy/annoylib.h +1495 -0
  348. data/vendor/annoy/kissrandom.h +120 -0
  349. data/vendor/annoy/mman.h +242 -0
  350. data/vendor/hnswlib/bruteforce.h +152 -0
  351. data/vendor/hnswlib/hnswalg.h +1192 -0
  352. data/vendor/hnswlib/hnswlib.h +108 -0
  353. data/vendor/hnswlib/space_ip.h +282 -0
  354. data/vendor/hnswlib/space_l2.h +281 -0
  355. data/vendor/hnswlib/visited_list_pool.h +79 -0
  356. data/vendor/irlba/irlba.hpp +575 -0
  357. data/vendor/irlba/lanczos.hpp +212 -0
  358. data/vendor/irlba/parallel.hpp +474 -0
  359. data/vendor/irlba/utils.hpp +224 -0
  360. data/vendor/irlba/wrappers.hpp +228 -0
  361. data/vendor/kmeans/Base.hpp +75 -0
  362. data/vendor/kmeans/Details.hpp +79 -0
  363. data/vendor/kmeans/HartiganWong.hpp +492 -0
  364. data/vendor/kmeans/InitializeKmeansPP.hpp +144 -0
  365. data/vendor/kmeans/InitializeNone.hpp +44 -0
  366. data/vendor/kmeans/InitializePCAPartition.hpp +309 -0
  367. data/vendor/kmeans/InitializeRandom.hpp +91 -0
  368. data/vendor/kmeans/Kmeans.hpp +161 -0
  369. data/vendor/kmeans/Lloyd.hpp +134 -0
  370. data/vendor/kmeans/MiniBatch.hpp +269 -0
  371. data/vendor/kmeans/QuickSearch.hpp +179 -0
  372. data/vendor/kmeans/compute_centroids.hpp +32 -0
  373. data/vendor/kmeans/compute_wcss.hpp +27 -0
  374. data/vendor/kmeans/is_edge_case.hpp +42 -0
  375. data/vendor/kmeans/random.hpp +55 -0
  376. data/vendor/knncolle/Annoy/Annoy.hpp +193 -0
  377. data/vendor/knncolle/BruteForce/BruteForce.hpp +120 -0
  378. data/vendor/knncolle/Hnsw/Hnsw.hpp +225 -0
  379. data/vendor/knncolle/Kmknn/Kmknn.hpp +286 -0
  380. data/vendor/knncolle/VpTree/VpTree.hpp +256 -0
  381. data/vendor/knncolle/knncolle.hpp +34 -0
  382. data/vendor/knncolle/utils/Base.hpp +100 -0
  383. data/vendor/knncolle/utils/NeighborQueue.hpp +94 -0
  384. data/vendor/knncolle/utils/distances.hpp +98 -0
  385. data/vendor/knncolle/utils/find_nearest_neighbors.hpp +112 -0
  386. data/vendor/powerit/PowerIterations.hpp +157 -0
  387. data/vendor/umappp/NeighborList.hpp +37 -0
  388. data/vendor/umappp/Umap.hpp +662 -0
  389. data/vendor/umappp/combine_neighbor_sets.hpp +95 -0
  390. data/vendor/umappp/find_ab.hpp +157 -0
  391. data/vendor/umappp/neighbor_similarities.hpp +136 -0
  392. data/vendor/umappp/optimize_layout.hpp +285 -0
  393. data/vendor/umappp/spectral_init.hpp +181 -0
  394. data/vendor/umappp/umappp.hpp +13 -0
  395. metadata +465 -0
@@ -0,0 +1,2711 @@
1
+ // This file is part of Eigen, a lightweight C++ template library
2
+ // for linear algebra.
3
+ //
4
+ // Copyright (C) 2008-2016 Konstantinos Margaritis <markos@freevec.org>
5
+ //
6
+ // This Source Code Form is subject to the terms of the Mozilla
7
+ // Public License v. 2.0. If a copy of the MPL was not distributed
8
+ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
+
10
+ #ifndef EIGEN_PACKET_MATH_ALTIVEC_H
11
+ #define EIGEN_PACKET_MATH_ALTIVEC_H
12
+
13
+ namespace Eigen {
14
+
15
+ namespace internal {
16
+
17
+ #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD // fallback value; any definition supplied earlier is kept
18
+ #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
19
+ #endif
20
+ // Define EIGEN_HAS_SINGLE_INSTRUCTION_MADD as an empty flag macro unless it is already defined.
21
+ #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
22
+ #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
23
+ #endif
24
+
25
+ // NOTE Altivec has 32 registers; the fallback below is 32 unless the macro is predefined. NOTE(review): this note used to say Eigen only accepts a value of 8 or 16, which contradicts the value defined here - confirm 32 is honoured by the code that reads this macro.
26
+ #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
27
+ #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
28
+ #endif
29
+
30
+ typedef __vector float Packet4f; // packet of float (used as packet_traits<float>::type)
31
+ typedef __vector int Packet4i; // packet of signed int
32
+ typedef __vector unsigned int Packet4ui; // packet of unsigned int
33
+ typedef __vector __bool int Packet4bi; // packet of boolean (mask) ints
34
+ typedef __vector short int Packet8s; // packet of signed short
35
+ typedef __vector unsigned short int Packet8us; // packet of unsigned short
36
+ typedef __vector signed char Packet16c; // packet of signed char
37
+ typedef __vector unsigned char Packet16uc; // packet of unsigned char (also the type of the byte-index masks)
38
+ typedef eigen_packet_wrapper<__vector unsigned short int,0> Packet8bf; // unsigned-short vector wrapped with tag 0; the bfloat16 packet type
39
+
40
+ // The same vector constants are needed in many places. Repeating their initialisers everywhere is
41
+ // undesirable and declaring them all as globals does not really work, so macros stamp out the declarations instead.
42
+ #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
43
+ Packet4f p4f_##NAME = {X, X, X, X}
44
+ // _FAST_Packet4i fills every lane through vec_splat_s32(X), so X must be a literal that intrinsic accepts.
45
+ #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
46
+ Packet4i p4i_##NAME = vec_splat_s32(X)
47
+ // The remaining _FAST_ variants brace-initialise every lane with X.
48
+ #define _EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME,X) \
49
+ Packet4ui p4ui_##NAME = {X, X, X, X}
50
+
51
+ #define _EIGEN_DECLARE_CONST_FAST_Packet8us(NAME,X) \
52
+ Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}
53
+
54
+ #define _EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME,X) \
55
+ Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}
56
+ // Non-FAST variants broadcast X with pset1<PacketType>(X). Packet2d / Packet2l are not declared in the visible part of this file.
57
+ #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
58
+ Packet4f p4f_##NAME = pset1<Packet4f>(X)
59
+
60
+ #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
61
+ Packet4i p4i_##NAME = pset1<Packet4i>(X)
62
+
63
+ #define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
64
+ Packet2d p2d_##NAME = pset1<Packet2d>(X)
65
+
66
+ #define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
67
+ Packet2l p2l_##NAME = pset1<Packet2l>(X)
68
+ // Declares a const Packet4f obtained by reinterpret_cast from pset1<Packet4i>(X), i.e. X is given as an integer value.
69
+ #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
70
+ const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
71
+ // DST_CTRL packs size (shifted left 24), count (shifted left 16) and stride into one word; __UNPACK_TYPE__ names unpacket_traits<P>::type.
72
+ #define DST_CHAN 1
73
+ #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
74
+ #define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type
75
+
76
+ // These constants are endian-agnostic (every lane holds the same value)
77
+ static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
78
+ static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0}
79
+ static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
80
+ static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
81
+ static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
82
+ static _EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u); //{ 0x80000000 in every lane } (top bit of each 32-bit lane)
83
+ static _EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu); // one below 0x3F000000, the IEEE-754 single-precision bit pattern of 0.5
84
+ static _EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1}
85
+ static _EIGEN_DECLARE_CONST_FAST_Packet16uc(ONE,1); //{ 1 in each of the 16 lanes }
86
+ static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000} (all-ones lanes shifted left by themselves, reinterpreted as float)
87
+ #ifndef __VSX__ // p4f_ONE is only declared here when VSX is not available
88
+ static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
89
+ #endif // !__VSX__
90
+
91
+ static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; // lane-index ramps: values ascend from 0 despite the COUNTDOWN name
92
+ static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
93
+ static Packet8s p8s_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 };
94
+ static Packet8us p8us_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 };
95
+
96
+ static Packet16c p16c_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7,
97
+ 8, 9, 10, 11, 12, 13, 14, 15};
98
+ static Packet16uc p16uc_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7,
99
+ 8, 9, 10, 11, 12, 13, 14, 15};
100
+ // Byte-index tables (presumably permute control vectors; their uses are outside the visible part). REVERSE<N> lists the N-bit elements in reverse order.
101
+ static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
102
+ static Packet16uc p16uc_REVERSE16 = { 14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1 };
103
+ static Packet16uc p16uc_REVERSE8 = { 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 };
104
+ // DUPLICATE<N>_HI lists each N-bit element of bytes 0..7 twice; DUPLICATE16_EVEN / _ODD list 16-bit elements 0,2,4,6 / 1,3,5,7 twice each.
105
+ static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
106
+ static Packet16uc p16uc_DUPLICATE16_HI = { 0,1,0,1, 2,3,2,3, 4,5,4,5, 6,7,6,7 };
107
+ static Packet16uc p16uc_DUPLICATE8_HI = { 0,0, 1,1, 2,2, 3,3, 4,4, 5,5, 6,6, 7,7 };
108
+ static const Packet16uc p16uc_DUPLICATE16_EVEN= { 0,1 ,0,1, 4,5, 4,5, 8,9, 8,9, 12,13, 12,13 };
109
+ static const Packet16uc p16uc_DUPLICATE16_ODD = { 2,3 ,2,3, 6,7, 6,7, 10,11, 10,11, 14,15, 14,15 };
110
+ // QUADRUPLICATE16_HI lists 16-bit element 0 four times, then 16-bit element 1 four times.
111
+ static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3 };
112
+
113
+ // Handle endianness properly while loading constants
114
+ // Define global static constants (later masks are computed from earlier ones, so the declaration order matters):
115
+ #ifdef _BIG_ENDIAN
116
+ static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); // the byte lists in the comments below assume this yields { 0,1,...,15 }
117
+ #ifdef __VSX__ // REVERSE64 is VSX-only in this branch; the little-endian branch declares it unconditionally
118
+ static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
119
+ #endif
120
+ static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
121
+ static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
122
+ static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
123
+ #else // little-endian: FORWARD is REVERSE32 here. NOTE(review): the byte lists in the trailing comments repeat the big-endian text - confirm them for this branch
124
+ static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
125
+ static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
126
+ static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
127
+ static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
128
+ static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
129
+ #endif // _BIG_ENDIAN
130
+ // Masks derived from the endian-specific ones above; TRANSPOSE64_* add HALF64_0_16 to the matching PSET64_* mask.
131
+ static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
132
+ static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
133
+ static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16; //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
134
+ static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16; //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
135
+
136
+ static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
137
+ // COMPLEX32_REV2 is built from different inputs per endianness; both branches document the same resulting byte list.
138
+ #ifdef _BIG_ENDIAN
139
+ static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
140
+ #else // little-endian
141
+ static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
142
+ #endif // _BIG_ENDIAN
143
+
144
+ #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC // EIGEN_PPC_PREFETCH(ADDR): issue a prefetch for ADDR
145
+ #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR); // note: the expansion already ends with ';'
146
+ #else // no builtin available: emit the dcbt instruction through inline asm
147
+ #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
148
+ #endif // EIGEN_PPC_PREFETCH
149
+
150
+ template <>
151
+ struct packet_traits<float> : default_packet_traits {
152
+ typedef Packet4f type;
153
+ typedef Packet4f half;
154
+ enum {
155
+ Vectorizable = 1,
156
+ AlignedOnScalar = 1,
157
+ size = 4,
158
+ HasHalfPacket = 1,
159
+
160
+ HasAdd = 1,
161
+ HasSub = 1,
162
+ HasMul = 1,
163
+ HasDiv = 1,
164
+ HasMin = 1,
165
+ HasMax = 1,
166
+ HasAbs = 1,
167
+ HasSin = EIGEN_FAST_MATH,
168
+ HasCos = EIGEN_FAST_MATH,
169
+ HasLog = 1,
170
+ HasExp = 1,
171
+ #ifdef __VSX__
172
+ HasSqrt = 1,
173
+ #if !EIGEN_COMP_CLANG
174
+ HasRsqrt = 1,
175
+ #else
176
+ HasRsqrt = 0,
177
+ #endif
178
+ #else
179
+ HasSqrt = 0,
180
+ HasRsqrt = 0,
181
+ HasTanh = EIGEN_FAST_MATH,
182
+ HasErf = EIGEN_FAST_MATH,
183
+ #endif
184
+ HasRound = 1,
185
+ HasFloor = 1,
186
+ HasCeil = 1,
187
+ HasRint = 1,
188
+ HasNegate = 1,
189
+ HasBlend = 1
190
+ };
191
+ };
192
+ template <>
193
+ struct packet_traits<bfloat16> : default_packet_traits {
194
+ typedef Packet8bf type;
195
+ typedef Packet8bf half;
196
+ enum {
197
+ Vectorizable = 1,
198
+ AlignedOnScalar = 1,
199
+ size = 8,
200
+ HasHalfPacket = 0,
201
+
202
+ HasAdd = 1,
203
+ HasSub = 1,
204
+ HasMul = 1,
205
+ HasDiv = 1,
206
+ HasMin = 1,
207
+ HasMax = 1,
208
+ HasAbs = 1,
209
+ HasSin = EIGEN_FAST_MATH,
210
+ HasCos = EIGEN_FAST_MATH,
211
+ HasLog = 1,
212
+ HasExp = 1,
213
+ #ifdef __VSX__
214
+ HasSqrt = 1,
215
+ #if !EIGEN_COMP_CLANG
216
+ HasRsqrt = 1,
217
+ #else
218
+ HasRsqrt = 0,
219
+ #endif
220
+ #else
221
+ HasSqrt = 0,
222
+ HasRsqrt = 0,
223
+ HasTanh = EIGEN_FAST_MATH,
224
+ HasErf = EIGEN_FAST_MATH,
225
+ #endif
226
+ HasRound = 1,
227
+ HasFloor = 1,
228
+ HasCeil = 1,
229
+ HasRint = 1,
230
+ HasNegate = 1,
231
+ HasBlend = 1
232
+ };
233
+ };
234
+
235
+ template <>
236
+ struct packet_traits<int> : default_packet_traits {
237
+ typedef Packet4i type;
238
+ typedef Packet4i half;
239
+ enum {
240
+ Vectorizable = 1,
241
+ AlignedOnScalar = 1,
242
+ size = 4,
243
+ HasHalfPacket = 0,
244
+
245
+ HasAdd = 1,
246
+ HasSub = 1,
247
+ HasShift = 1,
248
+ HasMul = 1,
249
+ HasDiv = 0,
250
+ HasBlend = 1
251
+ };
252
+ };
253
+
254
+ template <>
255
+ struct packet_traits<short int> : default_packet_traits {
256
+ typedef Packet8s type;
257
+ typedef Packet8s half;
258
+ enum {
259
+ Vectorizable = 1,
260
+ AlignedOnScalar = 1,
261
+ size = 8,
262
+ HasHalfPacket = 0,
263
+
264
+ HasAdd = 1,
265
+ HasSub = 1,
266
+ HasMul = 1,
267
+ HasDiv = 0,
268
+ HasBlend = 1
269
+ };
270
+ };
271
+
272
+ template <>
273
+ struct packet_traits<unsigned short int> : default_packet_traits {
274
+ typedef Packet8us type;
275
+ typedef Packet8us half;
276
+ enum {
277
+ Vectorizable = 1,
278
+ AlignedOnScalar = 1,
279
+ size = 8,
280
+ HasHalfPacket = 0,
281
+
282
+ HasAdd = 1,
283
+ HasSub = 1,
284
+ HasMul = 1,
285
+ HasDiv = 0,
286
+ HasBlend = 1
287
+ };
288
+ };
289
+
290
+ template <>
291
+ struct packet_traits<signed char> : default_packet_traits {
292
+ typedef Packet16c type;
293
+ typedef Packet16c half;
294
+ enum {
295
+ Vectorizable = 1,
296
+ AlignedOnScalar = 1,
297
+ size = 16,
298
+ HasHalfPacket = 0,
299
+
300
+ HasAdd = 1,
301
+ HasSub = 1,
302
+ HasMul = 1,
303
+ HasDiv = 0,
304
+ HasBlend = 1
305
+ };
306
+ };
307
+
308
+ template <>
309
+ struct packet_traits<unsigned char> : default_packet_traits {
310
+ typedef Packet16uc type;
311
+ typedef Packet16uc half;
312
+ enum {
313
+ Vectorizable = 1,
314
+ AlignedOnScalar = 1,
315
+ size = 16,
316
+ HasHalfPacket = 0,
317
+
318
+ HasAdd = 1,
319
+ HasSub = 1,
320
+ HasMul = 1,
321
+ HasDiv = 0,
322
+ HasBlend = 1
323
+ };
324
+ };
325
+
326
+ template<> struct unpacket_traits<Packet4f>
327
+ {
328
+ typedef float type;
329
+ typedef Packet4f half;
330
+ typedef Packet4i integer_packet;
331
+ enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
332
+ };
333
+ template<> struct unpacket_traits<Packet4i>
334
+ {
335
+ typedef int type;
336
+ typedef Packet4i half;
337
+ enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
338
+ };
339
+ template<> struct unpacket_traits<Packet8s>
340
+ {
341
+ typedef short int type;
342
+ typedef Packet8s half;
343
+ enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
344
+ };
345
+ template<> struct unpacket_traits<Packet8us>
346
+ {
347
+ typedef unsigned short int type;
348
+ typedef Packet8us half;
349
+ enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
350
+ };
351
+
352
+ template<> struct unpacket_traits<Packet16c>
353
+ {
354
+ typedef signed char type;
355
+ typedef Packet16c half;
356
+ enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
357
+ };
358
+ template<> struct unpacket_traits<Packet16uc>
359
+ {
360
+ typedef unsigned char type;
361
+ typedef Packet16uc half;
362
+ enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
363
+ };
364
+
365
+ template<> struct unpacket_traits<Packet8bf>
366
+ {
367
+ typedef bfloat16 type;
368
+ typedef Packet8bf half;
369
+ enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
370
+ };
371
+ inline std::ostream & operator <<(std::ostream & s, const Packet16c & v)
372
+ {
373
+ union {
374
+ Packet16c v;
375
+ signed char n[16];
376
+ } vt;
377
+ vt.v = v;
378
+ for (int i=0; i< 16; i++)
379
+ s << vt.n[i] << ", ";
380
+ return s;
381
+ }
382
+
383
+ inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
384
+ {
385
+ union {
386
+ Packet16uc v;
387
+ unsigned char n[16];
388
+ } vt;
389
+ vt.v = v;
390
+ for (int i=0; i< 16; i++)
391
+ s << vt.n[i] << ", ";
392
+ return s;
393
+ }
394
+
395
+ inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
396
+ {
397
+ union {
398
+ Packet4f v;
399
+ float n[4];
400
+ } vt;
401
+ vt.v = v;
402
+ s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
403
+ return s;
404
+ }
405
+
406
+ inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
407
+ {
408
+ union {
409
+ Packet4i v;
410
+ int n[4];
411
+ } vt;
412
+ vt.v = v;
413
+ s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
414
+ return s;
415
+ }
416
+
417
+ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
418
+ {
419
+ union {
420
+ Packet4ui v;
421
+ unsigned int n[4];
422
+ } vt;
423
+ vt.v = v;
424
+ s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
425
+ return s;
426
+ }
427
+
428
+ template <typename Packet>
429
+ EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from)
430
+ {
431
+ // some versions of GCC throw "unused-but-set-parameter".
432
+ // ignoring these warnings for now.
433
+ EIGEN_UNUSED_VARIABLE(from);
434
+ EIGEN_DEBUG_ALIGNED_LOAD
435
+ #ifdef __VSX__
436
+ return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
437
+ #else
438
+ return vec_ld(0, from);
439
+ #endif
440
+ }
441
+
442
+ // Need to define them first or we get specialization after instantiation errors
443
+ template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
444
+ {
445
+ return pload_common<Packet4f>(from);
446
+ }
447
+
448
+ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
449
+ {
450
+ return pload_common<Packet4i>(from);
451
+ }
452
+
453
+ template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from)
454
+ {
455
+ return pload_common<Packet8s>(from);
456
+ }
457
+
458
+ template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from)
459
+ {
460
+ return pload_common<Packet8us>(from);
461
+ }
462
+
463
+ template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const signed char* from)
464
+ {
465
+ return pload_common<Packet16c>(from);
466
+ }
467
+
468
+ template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const unsigned char* from)
469
+ {
470
+ return pload_common<Packet16uc>(from);
471
+ }
472
+
473
+ template<> EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from)
474
+ {
475
+ return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
476
+ }
477
+
478
+ template <typename Packet>
479
+ EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){
480
+ // some versions of GCC throw "unused-but-set-parameter" (float *to).
481
+ // ignoring these warnings for now.
482
+ EIGEN_UNUSED_VARIABLE(to);
483
+ EIGEN_DEBUG_ALIGNED_STORE
484
+ #ifdef __VSX__
485
+ vec_xst(from, 0, to);
486
+ #else
487
+ vec_st(from, 0, to);
488
+ #endif
489
+ }
490
+
491
+ template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
492
+ {
493
+ pstore_common<Packet4f>(to, from);
494
+ }
495
+
496
+ template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from)
497
+ {
498
+ pstore_common<Packet4i>(to, from);
499
+ }
500
+
501
+ template<> EIGEN_STRONG_INLINE void pstore<short int>(short int* to, const Packet8s& from)
502
+ {
503
+ pstore_common<Packet8s>(to, from);
504
+ }
505
+
506
+ template<> EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int* to, const Packet8us& from)
507
+ {
508
+ pstore_common<Packet8us>(to, from);
509
+ }
510
+
511
+ template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from)
512
+ {
513
+ pstore_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from);
514
+ }
515
+
516
+ template<> EIGEN_STRONG_INLINE void pstore<signed char>(signed char* to, const Packet16c& from)
517
+ {
518
+ pstore_common<Packet16c>(to, from);
519
+ }
520
+
521
+ template<> EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char* to, const Packet16uc& from)
522
+ {
523
+ pstore_common<Packet16uc>(to, from);
524
+ }
525
+
526
+ template<typename Packet>
527
+ EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from)
528
+ {
529
+ Packet v = {from, from, from, from};
530
+ return v;
531
+ }
532
+
533
+ template<typename Packet>
534
+ EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet)& from)
535
+ {
536
+ Packet v = {from, from, from, from, from, from, from, from};
537
+ return v;
538
+ }
539
+
540
+ template<typename Packet>
541
+ EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet)& from)
542
+ {
543
+ Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from};
544
+ return v;
545
+ }
546
+
547
+ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
548
+ return pset1_size4<Packet4f>(from);
549
+ }
550
+
551
+ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
552
+ return pset1_size4<Packet4i>(from);
553
+ }
554
+
555
+ template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int& from) {
556
+ return pset1_size8<Packet8s>(from);
557
+ }
558
+
559
+ template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int& from) {
560
+ return pset1_size8<Packet8us>(from);
561
+ }
562
+
563
+ template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const signed char& from) {
564
+ return pset1_size16<Packet16c>(from);
565
+ }
566
+
567
+ template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const unsigned char& from) {
568
+ return pset1_size16<Packet16uc>(from);
569
+ }
570
+
571
+ template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
572
+ return reinterpret_cast<Packet4f>(pset1<Packet4i>(from));
573
+ }
574
+
575
+ template<> EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
576
+ return pset1_size8<Packet8us>(reinterpret_cast<const unsigned short int&>(from));
577
+ }
578
+
579
+ template<typename Packet> EIGEN_STRONG_INLINE void
580
+ pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a,
581
+ Packet& a0, Packet& a1, Packet& a2, Packet& a3)
582
+ {
583
+ a3 = pload<Packet>(a);
584
+ a0 = vec_splat(a3, 0);
585
+ a1 = vec_splat(a3, 1);
586
+ a2 = vec_splat(a3, 2);
587
+ a3 = vec_splat(a3, 3);
588
+ }
589
+
590
+ template<> EIGEN_STRONG_INLINE void
591
+ pbroadcast4<Packet4f>(const float *a,
592
+ Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
593
+ {
594
+ pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
595
+ }
596
+ template<> EIGEN_STRONG_INLINE void
597
+ pbroadcast4<Packet4i>(const int *a,
598
+ Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
599
+ {
600
+ pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
601
+ }
602
+
603
+ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride)
604
+ {
605
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
606
+ a[0] = from[0*stride];
607
+ a[1] = from[1*stride];
608
+ a[2] = from[2*stride];
609
+ a[3] = from[3*stride];
610
+ return pload<Packet>(a);
611
+ }
612
+
613
+ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
614
+ {
615
+ return pgather_common<Packet4f>(from, stride);
616
+ }
617
+
618
+ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
619
+ {
620
+ return pgather_common<Packet4i>(from, stride);
621
+ }
622
+
623
+ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size8(const __UNPACK_TYPE__(Packet)* from, Index stride)
624
+ {
625
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8];
626
+ a[0] = from[0*stride];
627
+ a[1] = from[1*stride];
628
+ a[2] = from[2*stride];
629
+ a[3] = from[3*stride];
630
+ a[4] = from[4*stride];
631
+ a[5] = from[5*stride];
632
+ a[6] = from[6*stride];
633
+ a[7] = from[7*stride];
634
+ return pload<Packet>(a);
635
+ }
636
+
637
+ template<> EIGEN_DEVICE_FUNC inline Packet8s pgather<short int, Packet8s>(const short int* from, Index stride)
638
+ {
639
+ return pgather_size8<Packet8s>(from, stride);
640
+ }
641
+
642
+ template<> EIGEN_DEVICE_FUNC inline Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from, Index stride)
643
+ {
644
+ return pgather_size8<Packet8us>(from, stride);
645
+ }
646
+
647
+ template<> EIGEN_DEVICE_FUNC inline Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
648
+ {
649
+ return pgather_size8<Packet8bf>(from, stride);
650
+ }
651
+
652
+ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size16(const __UNPACK_TYPE__(Packet)* from, Index stride)
653
+ {
654
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16];
655
+ a[0] = from[0*stride];
656
+ a[1] = from[1*stride];
657
+ a[2] = from[2*stride];
658
+ a[3] = from[3*stride];
659
+ a[4] = from[4*stride];
660
+ a[5] = from[5*stride];
661
+ a[6] = from[6*stride];
662
+ a[7] = from[7*stride];
663
+ a[8] = from[8*stride];
664
+ a[9] = from[9*stride];
665
+ a[10] = from[10*stride];
666
+ a[11] = from[11*stride];
667
+ a[12] = from[12*stride];
668
+ a[13] = from[13*stride];
669
+ a[14] = from[14*stride];
670
+ a[15] = from[15*stride];
671
+ return pload<Packet>(a);
672
+ }
673
+
674
+
675
+ template<> EIGEN_DEVICE_FUNC inline Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride)
676
+ {
677
+ return pgather_size16<Packet16c>(from, stride);
678
+ }
679
+
680
+ template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from, Index stride)
681
+ {
682
+ return pgather_size16<Packet16uc>(from, stride);
683
+ }
684
+
685
+ template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
686
+ {
687
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
688
+ pstore<__UNPACK_TYPE__(Packet)>(a, from);
689
+ to[0*stride] = a[0];
690
+ to[1*stride] = a[1];
691
+ to[2*stride] = a[2];
692
+ to[3*stride] = a[3];
693
+ }
694
+
695
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
696
+ {
697
+ pscatter_size4<Packet4f>(to, from, stride);
698
+ }
699
+
700
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
701
+ {
702
+ pscatter_size4<Packet4i>(to, from, stride);
703
+ }
704
+
705
+ template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size8(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
706
+ {
707
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8];
708
+ pstore<__UNPACK_TYPE__(Packet)>(a, from);
709
+ to[0*stride] = a[0];
710
+ to[1*stride] = a[1];
711
+ to[2*stride] = a[2];
712
+ to[3*stride] = a[3];
713
+ to[4*stride] = a[4];
714
+ to[5*stride] = a[5];
715
+ to[6*stride] = a[6];
716
+ to[7*stride] = a[7];
717
+ }
718
+
719
+
720
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<short int, Packet8s>(short int* to, const Packet8s& from, Index stride)
721
+ {
722
+ pscatter_size8<Packet8s>(to, from, stride);
723
+ }
724
+
725
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride)
726
+ {
727
+ pscatter_size8<Packet8us>(to, from, stride);
728
+ }
729
+
730
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
731
+ {
732
+ pscatter_size8<Packet8bf>(to, from, stride);
733
+ }
734
+
735
+ template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size16(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
736
+ {
737
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16];
738
+ pstore<__UNPACK_TYPE__(Packet)>(a, from);
739
+ to[0*stride] = a[0];
740
+ to[1*stride] = a[1];
741
+ to[2*stride] = a[2];
742
+ to[3*stride] = a[3];
743
+ to[4*stride] = a[4];
744
+ to[5*stride] = a[5];
745
+ to[6*stride] = a[6];
746
+ to[7*stride] = a[7];
747
+ to[8*stride] = a[8];
748
+ to[9*stride] = a[9];
749
+ to[10*stride] = a[10];
750
+ to[11*stride] = a[11];
751
+ to[12*stride] = a[12];
752
+ to[13*stride] = a[13];
753
+ to[14*stride] = a[14];
754
+ to[15*stride] = a[15];
755
+ }
756
+
757
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride)
758
+ {
759
+ pscatter_size16<Packet16c>(to, from, stride);
760
+ }
761
+
762
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride)
763
+ {
764
+ pscatter_size16<Packet16uc>(to, from, stride);
765
+ }
766
+
767
+ template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
768
+ template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return pset1<Packet4i>(a) + p4i_COUNTDOWN; }
769
+ template<> EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const short int& a) { return pset1<Packet8s>(a) + p8s_COUNTDOWN; }
770
+ template<> EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const unsigned short int& a) { return pset1<Packet8us>(a) + p8us_COUNTDOWN; }
771
+ template<> EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const signed char& a) { return pset1<Packet16c>(a) + p16c_COUNTDOWN; }
772
+ template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const unsigned char& a) { return pset1<Packet16uc>(a) + p16uc_COUNTDOWN; }
773
+
774
+ template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f> (const Packet4f& a, const Packet4f& b) { return a + b; }
775
+ template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i> (const Packet4i& a, const Packet4i& b) { return a + b; }
776
+ template<> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui> (const Packet4ui& a, const Packet4ui& b) { return a + b; }
777
+ template<> EIGEN_STRONG_INLINE Packet8s padd<Packet8s> (const Packet8s& a, const Packet8s& b) { return a + b; }
778
+ template<> EIGEN_STRONG_INLINE Packet8us padd<Packet8us> (const Packet8us& a, const Packet8us& b) { return a + b; }
779
+ template<> EIGEN_STRONG_INLINE Packet16c padd<Packet16c> (const Packet16c& a, const Packet16c& b) { return a + b; }
780
+ template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a + b; }
781
+
782
+ template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f> (const Packet4f& a, const Packet4f& b) { return a - b; }
783
+ template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i> (const Packet4i& a, const Packet4i& b) { return a - b; }
784
+ template<> EIGEN_STRONG_INLINE Packet8s psub<Packet8s> (const Packet8s& a, const Packet8s& b) { return a - b; }
785
+ template<> EIGEN_STRONG_INLINE Packet8us psub<Packet8us> (const Packet8us& a, const Packet8us& b) { return a - b; }
786
+ template<> EIGEN_STRONG_INLINE Packet16c psub<Packet16c> (const Packet16c& a, const Packet16c& b) { return a - b; }
787
+ template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a - b; }
788
+
789
+ template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; }
790
+ template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; }
791
+
792
+ template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
793
+ template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
794
+
795
+ template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); }
796
+ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i> (const Packet4i& a, const Packet4i& b) { return a * b; }
797
+ template<> EIGEN_STRONG_INLINE Packet8s pmul<Packet8s> (const Packet8s& a, const Packet8s& b) { return vec_mul(a,b); }
798
+ template<> EIGEN_STRONG_INLINE Packet8us pmul<Packet8us> (const Packet8us& a, const Packet8us& b) { return vec_mul(a,b); }
799
+ template<> EIGEN_STRONG_INLINE Packet16c pmul<Packet16c> (const Packet16c& a, const Packet16c& b) { return vec_mul(a,b); }
800
+ template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_mul(a,b); }
801
+
802
+
803
+ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
804
+ {
805
+ #ifndef __VSX__ // VSX actually provides a div instruction
806
+ Packet4f t, y_0, y_1;
807
+
808
+ // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
809
+ y_0 = vec_re(b);
810
+
811
+ // Do one Newton-Raphson iteration to get the needed accuracy
812
+ t = vec_nmsub(y_0, b, p4f_ONE);
813
+ y_1 = vec_madd(y_0, t, y_0);
814
+
815
+ return vec_madd(a, y_1, p4f_MZERO);
816
+ #else
817
+ return vec_div(a, b);
818
+ #endif
819
+ }
820
+
821
+ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
822
+ { eigen_assert(false && "packet integer division are not supported by AltiVec");
823
+ return pset1<Packet4i>(0);
824
+ }
825
+
826
+ // for some weird raisons, it has to be overloaded for packet of integers
827
+ template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
828
+ template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }
829
+ template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) { return vec_madd(a,b,c); }
830
+ template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) { return vec_madd(a,b,c); }
831
+
832
+ template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
833
+ {
834
+ #ifdef __VSX__
835
+ // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
836
+ Packet4f ret;
837
+ __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
838
+ return ret;
839
+ #else
840
+ return vec_min(a, b);
841
+ #endif
842
+ }
843
+ template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
844
+ template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_min(a, b); }
845
+ template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_min(a, b); }
846
+ template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_min(a, b); }
847
+ template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_min(a, b); }
848
+
849
+
850
+ template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
851
+ {
852
+ #ifdef __VSX__
853
+ // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
854
+ Packet4f ret;
855
+ __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
856
+ return ret;
857
+ #else
858
+ return vec_max(a, b);
859
+ #endif
860
+ }
861
+ template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
862
+ template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_max(a, b); }
863
+ template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_max(a, b); }
864
+ template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_max(a, b); }
865
+ template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); }
866
+
867
+ template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmple(a,b)); }
868
+ template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmplt(a,b)); }
869
+ template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmpeq(a,b)); }
870
+ template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
871
+ Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a,b));
872
+ return vec_nor(c,c);
873
+ }
874
+
875
+ template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmple(a,b)); }
876
+ template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmplt(a,b)); }
877
+ template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmpeq(a,b)); }
878
+ template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmple(a,b)); }
879
+ template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmplt(a,b)); }
880
+ template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmpeq(a,b)); }
881
+ template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmple(a,b)); }
882
+ template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmplt(a,b)); }
883
+ template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmpeq(a,b)); }
884
+ template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmple(a,b)); }
885
+ template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmplt(a,b)); }
886
+ template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmpeq(a,b)); }
887
+ template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmple(a,b)); }
888
+ template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmplt(a,b)); }
889
+ template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmpeq(a,b)); }
890
+
891
+ template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
892
+ template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
893
+ template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vec_and(a, b); }
894
+ template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_and(a, b); }
895
+ template<> EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
896
+ return pand<Packet8us>(a, b);
897
+ }
898
+
899
+
900
+ template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
901
+ template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
902
+ template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_or(a, b); }
903
+ template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_or(a, b); }
904
+ template<> EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
905
+ return por<Packet8us>(a, b);
906
+ }
907
+
908
+ template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
909
+ template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
910
+ template<> EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
911
+ return pxor<Packet8us>(a, b);
912
+ }
913
+
914
+ template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_andc(a, b); }
915
+ template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_andc(a, b); }
916
+
917
+ template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
918
+ return vec_sel(b, a, reinterpret_cast<Packet4ui>(mask));
919
+ }
920
+
921
+ template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
922
+ {
923
+ Packet4f t = vec_add(reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
924
+ Packet4f res;
925
+
926
+ #ifdef __VSX__
927
+ __asm__("xvrspiz %x0, %x1\n\t"
928
+ : "=&wa" (res)
929
+ : "wa" (t));
930
+ #else
931
+ __asm__("vrfiz %0, %1\n\t"
932
+ : "=v" (res)
933
+ : "v" (t));
934
+ #endif
935
+
936
+ return res;
937
+ }
938
+ template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return vec_ceil(a); }
939
+ template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
940
+ template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
941
+ {
942
+ Packet4f res;
943
+
944
+ __asm__("xvrspic %x0, %x1\n\t"
945
+ : "=&wa" (res)
946
+ : "wa" (a));
947
+
948
+ return res;
949
+ }
950
+
951
+ template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from)
952
+ {
953
+ EIGEN_DEBUG_ALIGNED_LOAD
954
+ #ifdef _BIG_ENDIAN
955
+ Packet16uc MSQ, LSQ;
956
+ Packet16uc mask;
957
+ MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
958
+ LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
959
+ mask = vec_lvsl(0, from); // create the permute mask
960
+ //TODO: Add static_cast here
961
+ return (Packet) vec_perm(MSQ, LSQ, mask); // align the data
962
+ #else
963
+ EIGEN_DEBUG_UNALIGNED_LOAD
964
+ return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
965
+ #endif
966
+ }
967
+
968
+ template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
969
+ {
970
+ return ploadu_common<Packet4f>(from);
971
+ }
972
+ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
973
+ {
974
+ return ploadu_common<Packet4i>(from);
975
+ }
976
+ template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from)
977
+ {
978
+ return ploadu_common<Packet8s>(from);
979
+ }
980
+ template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from)
981
+ {
982
+ return ploadu_common<Packet8us>(from);
983
+ }
984
+ template<> EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from)
985
+ {
986
+ return ploadu_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
987
+ }
988
+ template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const signed char* from)
989
+ {
990
+ return ploadu_common<Packet16c>(from);
991
+ }
992
+ template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from)
993
+ {
994
+ return ploadu_common<Packet16uc>(from);
995
+ }
996
+
997
+ template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from)
998
+ {
999
+ Packet p;
1000
+ if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet>(from);
1001
+ else p = ploadu<Packet>(from);
1002
+ return vec_perm(p, p, p16uc_DUPLICATE32_HI);
1003
+ }
1004
+ template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
1005
+ {
1006
+ return ploaddup_common<Packet4f>(from);
1007
+ }
1008
+ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
1009
+ {
1010
+ return ploaddup_common<Packet4i>(from);
1011
+ }
1012
+
1013
+ template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int* from)
1014
+ {
1015
+ Packet8s p;
1016
+ if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8s>(from);
1017
+ else p = ploadu<Packet8s>(from);
1018
+ return vec_perm(p, p, p16uc_DUPLICATE16_HI);
1019
+ }
1020
+
1021
+ template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int* from)
1022
+ {
1023
+ Packet8us p;
1024
+ if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8us>(from);
1025
+ else p = ploadu<Packet8us>(from);
1026
+ return vec_perm(p, p, p16uc_DUPLICATE16_HI);
1027
+ }
1028
+
1029
+ template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int* from)
1030
+ {
1031
+ Packet8s p;
1032
+ if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8s>(from);
1033
+ else p = ploadu<Packet8s>(from);
1034
+ return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
1035
+ }
1036
+
1037
+ template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int* from)
1038
+ {
1039
+ Packet8us p;
1040
+ if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8us>(from);
1041
+ else p = ploadu<Packet8us>(from);
1042
+ return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
1043
+ }
1044
+
1045
+ template<> EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from)
1046
+ {
1047
+ return ploadquad<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
1048
+ }
1049
+
1050
+ template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char* from)
1051
+ {
1052
+ Packet16c p;
1053
+ if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet16c>(from);
1054
+ else p = ploadu<Packet16c>(from);
1055
+ return vec_perm(p, p, p16uc_DUPLICATE8_HI);
1056
+ }
1057
+
1058
+ template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char* from)
1059
+ {
1060
+ Packet16uc p;
1061
+ if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet16uc>(from);
1062
+ else p = ploadu<Packet16uc>(from);
1063
+ return vec_perm(p, p, p16uc_DUPLICATE8_HI);
1064
+ }
1065
+
1066
+ template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)* to, const Packet& from)
1067
+ {
1068
+ EIGEN_DEBUG_UNALIGNED_STORE
1069
+ #ifdef _BIG_ENDIAN
1070
+ // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
1071
+ // Warning: not thread safe!
1072
+ Packet16uc MSQ, LSQ, edges;
1073
+ Packet16uc edgeAlign, align;
1074
+
1075
+ MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword
1076
+ LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword
1077
+ edgeAlign = vec_lvsl(0, to); // permute map to extract edges
1078
+ edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges
1079
+ align = vec_lvsr( 0, to ); // permute map to misalign data
1080
+ MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ)
1081
+ LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ)
1082
+ vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
1083
+ vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part second
1084
+ #else
1085
+ vec_xst(from, 0, to);
1086
+ #endif
1087
+ }
1088
+ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
1089
+ {
1090
+ pstoreu_common<Packet4f>(to, from);
1091
+ }
1092
+ template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
1093
+ {
1094
+ pstoreu_common<Packet4i>(to, from);
1095
+ }
1096
+ template<> EIGEN_STRONG_INLINE void pstoreu<short int>(short int* to, const Packet8s& from)
1097
+ {
1098
+ pstoreu_common<Packet8s>(to, from);
1099
+ }
1100
+ template<> EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int* to, const Packet8us& from)
1101
+ {
1102
+ pstoreu_common<Packet8us>(to, from);
1103
+ }
1104
+ template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from)
1105
+ {
1106
+ pstoreu_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from);
1107
+ }
1108
+ template<> EIGEN_STRONG_INLINE void pstoreu<signed char>(signed char* to, const Packet16c& from)
1109
+ {
1110
+ pstoreu_common<Packet16c>(to, from);
1111
+ }
1112
+ template<> EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* to, const Packet16uc& from)
1113
+ {
1114
+ pstoreu_common<Packet16uc>(to, from);
1115
+ }
1116
+
1117
+ template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); }
1118
+ template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); }
1119
+
1120
+ template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; }
1121
+ template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int x; vec_ste(a, 0, &x); return x; }
1122
+
1123
+ template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) {
1124
+ EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x;
1125
+ vec_ste(a, 0, &x);
1126
+ return x;
1127
+ }
1128
+
1129
+ template<> EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) {
1130
+ return pfirst_common<Packet8s>(a);
1131
+ }
1132
+
1133
+ template<> EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) {
1134
+ return pfirst_common<Packet8us>(a);
1135
+ }
1136
+
1137
+ template<> EIGEN_STRONG_INLINE signed char pfirst<Packet16c>(const Packet16c& a)
1138
+ {
1139
+ return pfirst_common<Packet16c>(a);
1140
+ }
1141
+
1142
+ template<> EIGEN_STRONG_INLINE unsigned char pfirst<Packet16uc>(const Packet16uc& a)
1143
+ {
1144
+ return pfirst_common<Packet16uc>(a);
1145
+ }
1146
+
1147
+ template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
1148
+ {
1149
+ return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
1150
+ }
1151
+ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
1152
+ {
1153
+ return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
1154
+ }
1155
+ template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a)
1156
+ {
1157
+ return reinterpret_cast<Packet8s>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
1158
+ }
1159
+ template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a)
1160
+ {
1161
+ return reinterpret_cast<Packet8us>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
1162
+ }
1163
+ template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a)
1164
+ {
1165
+ return vec_perm(a, a, p16uc_REVERSE8);
1166
+ }
1167
+ template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a)
1168
+ {
1169
+ return vec_perm(a, a, p16uc_REVERSE8);
1170
+ }
1171
+ template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a)
1172
+ {
1173
+ return preverse<Packet8us>(a);
1174
+ }
1175
+
1176
+ template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
1177
+ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
1178
+ template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vec_abs(a); }
1179
+ template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; }
1180
+ template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vec_abs(a); }
1181
+ template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; }
1182
+ template<> EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
1183
+ _EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask,0x7FFF);
1184
+ return pand<Packet8us>(p8us_abs_mask, a);
1185
+ }
1186
+
1187
+ template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a)
1188
+ { return vec_sra(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
1189
+ template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a)
1190
+ { return vec_sr(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
1191
+ template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a)
1192
+ { return vec_sl(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
1193
+ template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a)
1194
+ {
1195
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1196
+ Packet4ui r = vec_sl(reinterpret_cast<Packet4ui>(a), p4ui_mask);
1197
+ return reinterpret_cast<Packet4f>(r);
1198
+ }
1199
+
1200
+ template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a)
1201
+ {
1202
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1203
+ Packet4ui r = vec_sr(reinterpret_cast<Packet4ui>(a), p4ui_mask);
1204
+ return reinterpret_cast<Packet4f>(r);
1205
+ }
1206
+
1207
+ template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a)
1208
+ {
1209
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1210
+ return vec_sr(a, p4ui_mask);
1211
+ }
1212
+
1213
+ template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a)
1214
+ {
1215
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1216
+ return vec_sl(a, p4ui_mask);
1217
+ }
1218
+
1219
+ template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a)
1220
+ {
1221
+ const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
1222
+ return vec_sl(a, p8us_mask);
1223
+ }
1224
+ template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a)
1225
+ {
1226
+ const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
1227
+ return vec_sr(a, p8us_mask);
1228
+ }
1229
+
1230
+ EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf){
1231
+ return plogical_shift_left<16>(reinterpret_cast<Packet4f>(bf.m_val));
1232
+ }
1233
+
1234
+ EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf){
1235
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
1236
+ return pand<Packet4f>(
1237
+ reinterpret_cast<Packet4f>(bf.m_val),
1238
+ reinterpret_cast<Packet4f>(p4ui_high_mask)
1239
+ );
1240
+ }
1241
+
1242
+ // Simple interleaving of bool masks, prevents true values from being
1243
+ // converted to NaNs.
1244
+ EIGEN_STRONG_INLINE Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd) {
1245
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
1246
+ Packet4f bf_odd, bf_even;
1247
+ bf_odd = pand(reinterpret_cast<Packet4f>(p4ui_high_mask), odd);
1248
+ bf_even = plogical_shift_right<16>(even);
1249
+ return reinterpret_cast<Packet8us>(por<Packet4f>(bf_even, bf_odd));
1250
+ }
1251
+
1252
+ EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f){
1253
+ Packet4ui input = reinterpret_cast<Packet4ui>(p4f);
1254
+ Packet4ui lsb = plogical_shift_right<16>(input);
1255
+ lsb = pand<Packet4ui>(lsb, reinterpret_cast<Packet4ui>(p4i_ONE));
1256
+
1257
+ _EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS,0x7FFFu);
1258
+ Packet4ui rounding_bias = padd<Packet4ui>(lsb, p4ui_BIAS);
1259
+ input = padd<Packet4ui>(input, rounding_bias);
1260
+
1261
+ //Test NaN and Subnormal - Begin
1262
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000);
1263
+ Packet4ui exp = pand<Packet4ui>(p4ui_exp_mask, reinterpret_cast<Packet4ui>(p4f));
1264
+
1265
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF);
1266
+ Packet4ui mantissa = pand<Packet4ui>(p4ui_mantissa_mask, reinterpret_cast<Packet4ui>(p4f));
1267
+
1268
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(max_exp, 0x7F800000);
1269
+ Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_max_exp);
1270
+ Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet4ui>(p4i_ZERO));
1271
+
1272
+ Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet4ui>(p4i_ZERO));
1273
+ Packet4ui nan_selector = pandnot<Packet4ui>(
1274
+ reinterpret_cast<Packet4ui>(is_max_exp),
1275
+ reinterpret_cast<Packet4ui>(is_mant_zero)
1276
+ );
1277
+
1278
+ Packet4ui subnormal_selector = pandnot<Packet4ui>(
1279
+ reinterpret_cast<Packet4ui>(is_zero_exp),
1280
+ reinterpret_cast<Packet4ui>(is_mant_zero)
1281
+ );
1282
+
1283
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000);
1284
+ input = vec_sel(input, p4ui_nan, nan_selector);
1285
+ input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
1286
+ //Test NaN and Subnormal - End
1287
+
1288
+ input = plogical_shift_right<16>(input);
1289
+ return reinterpret_cast<Packet8us>(input);
1290
+ }
1291
+
1292
+ EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd){
1293
+ Packet4f bf_odd, bf_even;
1294
+ bf_odd = reinterpret_cast<Packet4f>(F32ToBf16(odd).m_val);
1295
+ bf_odd = plogical_shift_left<16>(bf_odd);
1296
+ bf_even = reinterpret_cast<Packet4f>(F32ToBf16(even).m_val);
1297
+ return reinterpret_cast<Packet8us>(por<Packet4f>(bf_even, bf_odd));
1298
+ }
1299
/* Function-body helper: widens bfloat16 packet A into its even/odd float
 * halves, applies the unary float operation OP to each half, and returns the
 * result narrowed back to Packet8bf. Expands to statements ending in `return`,
 * so it must be the whole body of the enclosing function. */
#define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A)   \
  Packet4f a_even  = Bf16ToF32Even(A);        \
  Packet4f a_odd   = Bf16ToF32Odd(A);         \
  Packet4f op_even = OP(a_even);              \
  Packet4f op_odd  = OP(a_odd);               \
  return F32ToBf16(op_even, op_odd);
1305
+
1306
/* Function-body helper: widens bfloat16 packets A and B into their even/odd
 * float halves, applies the binary float operation OP half by half, and
 * returns the result narrowed back to Packet8bf. Expands to statements ending
 * in `return`, so it must be the whole body of the enclosing function. */
#define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \
  Packet4f a_even  = Bf16ToF32Even(A);          \
  Packet4f a_odd   = Bf16ToF32Odd(A);           \
  Packet4f b_even  = Bf16ToF32Even(B);          \
  Packet4f b_odd   = Bf16ToF32Odd(B);           \
  Packet4f op_even = OP(a_even, b_even);        \
  Packet4f op_odd  = OP(a_odd, b_odd);          \
  return F32ToBf16(op_even, op_odd);
1314
+
1315
/* Same as BF16_TO_F32_BINARY_OP_WRAPPER, but for operations producing masks:
 * the two float results are packed with F32ToBf16Bool instead of F32ToBf16.
 * Expands to statements ending in `return`, so it must be the whole body of
 * the enclosing function. */
#define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \
  Packet4f a_even  = Bf16ToF32Even(A);               \
  Packet4f a_odd   = Bf16ToF32Odd(A);                \
  Packet4f b_even  = Bf16ToF32Even(B);               \
  Packet4f b_odd   = Bf16ToF32Odd(B);                \
  Packet4f op_even = OP(a_even, b_even);             \
  Packet4f op_odd  = OP(a_odd, b_odd);               \
  return F32ToBf16Bool(op_even, op_odd);
1323
+
1324
+ template<> EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1325
+ BF16_TO_F32_BINARY_OP_WRAPPER(padd<Packet4f>, a, b);
1326
+ }
1327
+
1328
+ template<> EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1329
+ BF16_TO_F32_BINARY_OP_WRAPPER(pmul<Packet4f>, a, b);
1330
+ }
1331
+
1332
+ template<> EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1333
+ BF16_TO_F32_BINARY_OP_WRAPPER(pdiv<Packet4f>, a, b);
1334
+ }
1335
+
1336
+ template<> EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(const Packet8bf& a) {
1337
+ BF16_TO_F32_UNARY_OP_WRAPPER(pnegate<Packet4f>, a);
1338
+ }
1339
+
1340
+ template<> EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1341
+ BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b);
1342
+ }
1343
+
1344
+ template<> EIGEN_STRONG_INLINE Packet8bf psqrt<Packet8bf> (const Packet8bf& a){
1345
+ BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a);
1346
+ }
1347
+ template<> EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf> (const Packet8bf& a){
1348
+ BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt<Packet4f>, a);
1349
+ }
1350
+ template<> EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf> (const Packet8bf& a){
1351
+ BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
1352
+ }
1353
+
1354
+ template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
1355
+ return pldexp_generic(a,exponent);
1356
+ }
1357
+ template<> EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf> (const Packet8bf& a, const Packet8bf& exponent){
1358
+ BF16_TO_F32_BINARY_OP_WRAPPER(pldexp<Packet4f>, a, exponent);
1359
+ }
1360
+
1361
+ template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
1362
+ return pfrexp_generic(a,exponent);
1363
+ }
1364
+ template<> EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf> (const Packet8bf& a, Packet8bf& e){
1365
+ Packet4f a_even = Bf16ToF32Even(a);
1366
+ Packet4f a_odd = Bf16ToF32Odd(a);
1367
+ Packet4f e_even;
1368
+ Packet4f e_odd;
1369
+ Packet4f op_even = pfrexp<Packet4f>(a_even, e_even);
1370
+ Packet4f op_odd = pfrexp<Packet4f>(a_odd, e_odd);
1371
+ e = F32ToBf16(e_even, e_odd);
1372
+ return F32ToBf16(op_even, op_odd);
1373
+ }
1374
+
1375
+ template<> EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf> (const Packet8bf& a){
1376
+ BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a);
1377
+ }
1378
+ template<> EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf> (const Packet8bf& a){
1379
+ BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a);
1380
+ }
1381
+ template<> EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf> (const Packet8bf& a){
1382
+ BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a);
1383
+ }
1384
+ template<> EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf> (const Packet8bf& a){
1385
+ BF16_TO_F32_UNARY_OP_WRAPPER(pfloor<Packet4f>, a);
1386
+ }
1387
+ template<> EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf> (const Packet8bf& a){
1388
+ BF16_TO_F32_UNARY_OP_WRAPPER(pceil<Packet4f>, a);
1389
+ }
1390
+ template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf> (const Packet8bf& a){
1391
+ BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
1392
+ }
1393
+ template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf> (const Packet8bf& a){
1394
+ BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
1395
+ }
1396
+ template<> EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
1397
+ Packet4f a_even = Bf16ToF32Even(a);
1398
+ Packet4f a_odd = Bf16ToF32Odd(a);
1399
+ Packet4f b_even = Bf16ToF32Even(b);
1400
+ Packet4f b_odd = Bf16ToF32Odd(b);
1401
+ Packet4f c_even = Bf16ToF32Even(c);
1402
+ Packet4f c_odd = Bf16ToF32Odd(c);
1403
+ Packet4f pmadd_even = pmadd<Packet4f>(a_even, b_even, c_even);
1404
+ Packet4f pmadd_odd = pmadd<Packet4f>(a_odd, b_odd, c_odd);
1405
+ return F32ToBf16(pmadd_even, pmadd_odd);
1406
+ }
1407
+
1408
+ template<> EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1409
+ BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b);
1410
+ }
1411
+
1412
+ template<> EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1413
+ BF16_TO_F32_BINARY_OP_WRAPPER(pmax<Packet4f>, a, b);
1414
+ }
1415
+
1416
+ template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
1417
+ BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt<Packet4f>, a, b);
1418
+ }
1419
+ template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
1420
+ BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan<Packet4f>, a, b);
1421
+ }
1422
+ template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
1423
+ BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le<Packet4f>, a, b);
1424
+ }
1425
+ template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
1426
+ BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_eq<Packet4f>, a, b);
1427
+ }
1428
+
1429
+ template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) {
1430
+ return Eigen::bfloat16_impl::raw_uint16_to_bfloat16((pfirst<Packet8us>(a)));
1431
+ }
1432
+
1433
+ template<> EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from)
1434
+ {
1435
+ return ploaddup<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
1436
+ }
1437
+
1438
+ template<> EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
1439
+ bfloat16 countdown[8] = { bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3),
1440
+ bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7) };
1441
+ return padd<Packet8bf>(pset1<Packet8bf>(a), pload<Packet8bf>(countdown));
1442
+ }
1443
+
1444
+ template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
1445
+ {
1446
+ Packet4f b, sum;
1447
+ b = vec_sld(a, a, 8);
1448
+ sum = a + b;
1449
+ b = vec_sld(sum, sum, 4);
1450
+ sum += b;
1451
+ return pfirst(sum);
1452
+ }
1453
+
1454
+ template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
1455
+ {
1456
+ Packet4i sum;
1457
+ sum = vec_sums(a, p4i_ZERO);
1458
+ #ifdef _BIG_ENDIAN
1459
+ sum = vec_sld(sum, p4i_ZERO, 12);
1460
+ #else
1461
+ sum = vec_sld(p4i_ZERO, sum, 4);
1462
+ #endif
1463
+ return pfirst(sum);
1464
+ }
1465
+
1466
+ template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a)
1467
+ {
1468
+ float redux_even = predux<Packet4f>(Bf16ToF32Even(a));
1469
+ float redux_odd = predux<Packet4f>(Bf16ToF32Odd(a));
1470
+ float f32_result = redux_even + redux_odd;
1471
+ return bfloat16(f32_result);
1472
+ }
1473
+ template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a)
1474
+ {
1475
+ union{
1476
+ Packet v;
1477
+ __UNPACK_TYPE__(Packet) n[8];
1478
+ } vt;
1479
+ vt.v = a;
1480
+
1481
+ EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
1482
+ EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
1483
+ Packet4i first_half = pload<Packet4i>(first_loader);
1484
+ Packet4i second_half = pload<Packet4i>(second_loader);
1485
+
1486
+ return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_half) + predux(second_half));
1487
+ }
1488
+
1489
+ template<> EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a)
1490
+ {
1491
+ return predux_size8<Packet8s>(a);
1492
+ }
1493
+
1494
+ template<> EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a)
1495
+ {
1496
+ return predux_size8<Packet8us>(a);
1497
+ }
1498
+
1499
+ template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a)
1500
+ {
1501
+ union{
1502
+ Packet v;
1503
+ __UNPACK_TYPE__(Packet) n[16];
1504
+ } vt;
1505
+ vt.v = a;
1506
+
1507
+ EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
1508
+ EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
1509
+ EIGEN_ALIGN16 int third_loader[4] = { vt.n[8], vt.n[9], vt.n[10], vt.n[11] };
1510
+ EIGEN_ALIGN16 int fourth_loader[4] = { vt.n[12], vt.n[13], vt.n[14], vt.n[15] };
1511
+
1512
+ Packet4i first_quarter = pload<Packet4i>(first_loader);
1513
+ Packet4i second_quarter = pload<Packet4i>(second_loader);
1514
+ Packet4i third_quarter = pload<Packet4i>(third_loader);
1515
+ Packet4i fourth_quarter = pload<Packet4i>(fourth_loader);
1516
+
1517
+ return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter)
1518
+ + predux(third_quarter) + predux(fourth_quarter));
1519
+ }
1520
+
1521
+ template<> EIGEN_STRONG_INLINE signed char predux<Packet16c>(const Packet16c& a)
1522
+ {
1523
+ return predux_size16<Packet16c>(a);
1524
+ }
1525
+
1526
+ template<> EIGEN_STRONG_INLINE unsigned char predux<Packet16uc>(const Packet16uc& a)
1527
+ {
1528
+ return predux_size16<Packet16uc>(a);
1529
+ }
1530
+
1531
+ // Other reduction functions:
1532
+ // mul
1533
+ template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
1534
+ {
1535
+ Packet4f prod;
1536
+ prod = pmul(a, vec_sld(a, a, 8));
1537
+ return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
1538
+ }
1539
+
1540
+ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
1541
+ {
1542
+ EIGEN_ALIGN16 int aux[4];
1543
+ pstore(aux, a);
1544
+ return aux[0] * aux[1] * aux[2] * aux[3];
1545
+ }
1546
+
1547
+ template<> EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a)
1548
+ {
1549
+ Packet8s pair, quad, octo;
1550
+
1551
+ pair = vec_mul(a, vec_sld(a, a, 8));
1552
+ quad = vec_mul(pair, vec_sld(pair, pair, 4));
1553
+ octo = vec_mul(quad, vec_sld(quad, quad, 2));
1554
+
1555
+ return pfirst(octo);
1556
+ }
1557
+
1558
+ template<> EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a)
1559
+ {
1560
+ Packet8us pair, quad, octo;
1561
+
1562
+ pair = vec_mul(a, vec_sld(a, a, 8));
1563
+ quad = vec_mul(pair, vec_sld(pair, pair, 4));
1564
+ octo = vec_mul(quad, vec_sld(quad, quad, 2));
1565
+
1566
+ return pfirst(octo);
1567
+ }
1568
+
1569
+ template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a)
1570
+ {
1571
+ float redux_even = predux_mul<Packet4f>(Bf16ToF32Even(a));
1572
+ float redux_odd = predux_mul<Packet4f>(Bf16ToF32Odd(a));
1573
+ float f32_result = redux_even * redux_odd;
1574
+ return bfloat16(f32_result);
1575
+ }
1576
+
1577
+
1578
+ template<> EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c& a)
1579
+ {
1580
+ Packet16c pair, quad, octo, result;
1581
+
1582
+ pair = vec_mul(a, vec_sld(a, a, 8));
1583
+ quad = vec_mul(pair, vec_sld(pair, pair, 4));
1584
+ octo = vec_mul(quad, vec_sld(quad, quad, 2));
1585
+ result = vec_mul(octo, vec_sld(octo, octo, 1));
1586
+
1587
+ return pfirst(result);
1588
+ }
1589
+
1590
+ template<> EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet16uc& a)
1591
+ {
1592
+ Packet16uc pair, quad, octo, result;
1593
+
1594
+ pair = vec_mul(a, vec_sld(a, a, 8));
1595
+ quad = vec_mul(pair, vec_sld(pair, pair, 4));
1596
+ octo = vec_mul(quad, vec_sld(quad, quad, 2));
1597
+ result = vec_mul(octo, vec_sld(octo, octo, 1));
1598
+
1599
+ return pfirst(result);
1600
+ }
1601
+
1602
+ // min
1603
+ template<typename Packet> EIGEN_STRONG_INLINE
1604
+ __UNPACK_TYPE__(Packet) predux_min4(const Packet& a)
1605
+ {
1606
+ Packet b, res;
1607
+ b = vec_min(a, vec_sld(a, a, 8));
1608
+ res = vec_min(b, vec_sld(b, b, 4));
1609
+ return pfirst(res);
1610
+ }
1611
+
1612
+
1613
+ template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
1614
+ {
1615
+ return predux_min4<Packet4f>(a);
1616
+ }
1617
+
1618
+ template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
1619
+ {
1620
+ return predux_min4<Packet4i>(a);
1621
+ }
1622
+
1623
+ template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a)
1624
+ {
1625
+ float redux_even = predux_min<Packet4f>(Bf16ToF32Even(a));
1626
+ float redux_odd = predux_min<Packet4f>(Bf16ToF32Odd(a));
1627
+ float f32_result = (std::min)(redux_even, redux_odd);
1628
+ return bfloat16(f32_result);
1629
+ }
1630
+
1631
+ template<> EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a)
1632
+ {
1633
+ Packet8s pair, quad, octo;
1634
+
1635
+ //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
1636
+ pair = vec_min(a, vec_sld(a, a, 8));
1637
+
1638
+ //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
1639
+ quad = vec_min(pair, vec_sld(pair, pair, 4));
1640
+
1641
+ //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
1642
+ octo = vec_min(quad, vec_sld(quad, quad, 2));
1643
+ return pfirst(octo);
1644
+ }
1645
+
1646
+ template<> EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a)
1647
+ {
1648
+ Packet8us pair, quad, octo;
1649
+
1650
+ //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
1651
+ pair = vec_min(a, vec_sld(a, a, 8));
1652
+
1653
+ //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
1654
+ quad = vec_min(pair, vec_sld(pair, pair, 4));
1655
+
1656
+ //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
1657
+ octo = vec_min(quad, vec_sld(quad, quad, 2));
1658
+ return pfirst(octo);
1659
+ }
1660
+
1661
+ template<> EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c& a)
1662
+ {
1663
+ Packet16c pair, quad, octo, result;
1664
+
1665
+ pair = vec_min(a, vec_sld(a, a, 8));
1666
+ quad = vec_min(pair, vec_sld(pair, pair, 4));
1667
+ octo = vec_min(quad, vec_sld(quad, quad, 2));
1668
+ result = vec_min(octo, vec_sld(octo, octo, 1));
1669
+
1670
+ return pfirst(result);
1671
+ }
1672
+
1673
+ template<> EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet16uc& a)
1674
+ {
1675
+ Packet16uc pair, quad, octo, result;
1676
+
1677
+ pair = vec_min(a, vec_sld(a, a, 8));
1678
+ quad = vec_min(pair, vec_sld(pair, pair, 4));
1679
+ octo = vec_min(quad, vec_sld(quad, quad, 2));
1680
+ result = vec_min(octo, vec_sld(octo, octo, 1));
1681
+
1682
+ return pfirst(result);
1683
+ }
1684
+ // max
1685
+ template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a)
1686
+ {
1687
+ Packet b, res;
1688
+ b = vec_max(a, vec_sld(a, a, 8));
1689
+ res = vec_max(b, vec_sld(b, b, 4));
1690
+ return pfirst(res);
1691
+ }
1692
+
1693
+ template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
1694
+ {
1695
+ return predux_max4<Packet4f>(a);
1696
+ }
1697
+
1698
+ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
1699
+ {
1700
+ return predux_max4<Packet4i>(a);
1701
+ }
1702
+
1703
+ template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a)
1704
+ {
1705
+ float redux_even = predux_max<Packet4f>(Bf16ToF32Even(a));
1706
+ float redux_odd = predux_max<Packet4f>(Bf16ToF32Odd(a));
1707
+ float f32_result = (std::max)(redux_even, redux_odd);
1708
+ return bfloat16(f32_result);
1709
+ }
1710
+
1711
+ template<> EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a)
1712
+ {
1713
+ Packet8s pair, quad, octo;
1714
+
1715
+ //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
1716
+ pair = vec_max(a, vec_sld(a, a, 8));
1717
+
1718
+ //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
1719
+ quad = vec_max(pair, vec_sld(pair, pair, 4));
1720
+
1721
+ //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
1722
+ octo = vec_max(quad, vec_sld(quad, quad, 2));
1723
+ return pfirst(octo);
1724
+ }
1725
+
1726
+ template<> EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a)
1727
+ {
1728
+ Packet8us pair, quad, octo;
1729
+
1730
+ //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
1731
+ pair = vec_max(a, vec_sld(a, a, 8));
1732
+
1733
+ //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
1734
+ quad = vec_max(pair, vec_sld(pair, pair, 4));
1735
+
1736
+ //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
1737
+ octo = vec_max(quad, vec_sld(quad, quad, 2));
1738
+ return pfirst(octo);
1739
+ }
1740
+
1741
+ template<> EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c& a)
1742
+ {
1743
+ Packet16c pair, quad, octo, result;
1744
+
1745
+ pair = vec_max(a, vec_sld(a, a, 8));
1746
+ quad = vec_max(pair, vec_sld(pair, pair, 4));
1747
+ octo = vec_max(quad, vec_sld(quad, quad, 2));
1748
+ result = vec_max(octo, vec_sld(octo, octo, 1));
1749
+
1750
+ return pfirst(result);
1751
+ }
1752
+
1753
+ template<> EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet16uc& a)
1754
+ {
1755
+ Packet16uc pair, quad, octo, result;
1756
+
1757
+ pair = vec_max(a, vec_sld(a, a, 8));
1758
+ quad = vec_max(pair, vec_sld(pair, pair, 4));
1759
+ octo = vec_max(quad, vec_sld(quad, quad, 2));
1760
+ result = vec_max(octo, vec_sld(octo, octo, 1));
1761
+
1762
+ return pfirst(result);
1763
+ }
1764
+
1765
+ template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
1766
+ {
1767
+ return vec_any_ne(x, pzero(x));
1768
+ }
1769
+
1770
+ template <typename T> EIGEN_DEVICE_FUNC inline void
1771
+ ptranpose_common(PacketBlock<T,4>& kernel){
1772
+ T t0, t1, t2, t3;
1773
+ t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1774
+ t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1775
+ t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1776
+ t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1777
+ kernel.packet[0] = vec_mergeh(t0, t2);
1778
+ kernel.packet[1] = vec_mergel(t0, t2);
1779
+ kernel.packet[2] = vec_mergeh(t1, t3);
1780
+ kernel.packet[3] = vec_mergel(t1, t3);
1781
+ }
1782
+
1783
+ EIGEN_DEVICE_FUNC inline void
1784
+ ptranspose(PacketBlock<Packet4f,4>& kernel) {
1785
+ ptranpose_common<Packet4f>(kernel);
1786
+ }
1787
+
1788
+ EIGEN_DEVICE_FUNC inline void
1789
+ ptranspose(PacketBlock<Packet4i,4>& kernel) {
1790
+ ptranpose_common<Packet4i>(kernel);
1791
+ }
1792
+
1793
+ EIGEN_DEVICE_FUNC inline void
1794
+ ptranspose(PacketBlock<Packet8s,4>& kernel) {
1795
+ Packet8s t0, t1, t2, t3;
1796
+ t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1797
+ t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1798
+ t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1799
+ t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1800
+ kernel.packet[0] = vec_mergeh(t0, t2);
1801
+ kernel.packet[1] = vec_mergel(t0, t2);
1802
+ kernel.packet[2] = vec_mergeh(t1, t3);
1803
+ kernel.packet[3] = vec_mergel(t1, t3);
1804
+ }
1805
+
1806
+ EIGEN_DEVICE_FUNC inline void
1807
+ ptranspose(PacketBlock<Packet8us,4>& kernel) {
1808
+ Packet8us t0, t1, t2, t3;
1809
+ t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1810
+ t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1811
+ t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1812
+ t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1813
+ kernel.packet[0] = vec_mergeh(t0, t2);
1814
+ kernel.packet[1] = vec_mergel(t0, t2);
1815
+ kernel.packet[2] = vec_mergeh(t1, t3);
1816
+ kernel.packet[3] = vec_mergel(t1, t3);
1817
+ }
1818
+
1819
+
1820
+ EIGEN_DEVICE_FUNC inline void
1821
+ ptranspose(PacketBlock<Packet8bf,4>& kernel) {
1822
+ Packet8us t0, t1, t2, t3;
1823
+
1824
+ t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val);
1825
+ t1 = vec_mergel(kernel.packet[0].m_val, kernel.packet[2].m_val);
1826
+ t2 = vec_mergeh(kernel.packet[1].m_val, kernel.packet[3].m_val);
1827
+ t3 = vec_mergel(kernel.packet[1].m_val, kernel.packet[3].m_val);
1828
+ kernel.packet[0] = vec_mergeh(t0, t2);
1829
+ kernel.packet[1] = vec_mergel(t0, t2);
1830
+ kernel.packet[2] = vec_mergeh(t1, t3);
1831
+ kernel.packet[3] = vec_mergel(t1, t3);
1832
+ }
1833
+
1834
+ EIGEN_DEVICE_FUNC inline void
1835
+ ptranspose(PacketBlock<Packet16c,4>& kernel) {
1836
+ Packet16c t0, t1, t2, t3;
1837
+ t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1838
+ t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1839
+ t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1840
+ t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1841
+ kernel.packet[0] = vec_mergeh(t0, t2);
1842
+ kernel.packet[1] = vec_mergel(t0, t2);
1843
+ kernel.packet[2] = vec_mergeh(t1, t3);
1844
+ kernel.packet[3] = vec_mergel(t1, t3);
1845
+ }
1846
+
1847
+
1848
+ EIGEN_DEVICE_FUNC inline void
1849
+ ptranspose(PacketBlock<Packet16uc,4>& kernel) {
1850
+ Packet16uc t0, t1, t2, t3;
1851
+ t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1852
+ t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1853
+ t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1854
+ t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1855
+ kernel.packet[0] = vec_mergeh(t0, t2);
1856
+ kernel.packet[1] = vec_mergel(t0, t2);
1857
+ kernel.packet[2] = vec_mergeh(t1, t3);
1858
+ kernel.packet[3] = vec_mergel(t1, t3);
1859
+ }
1860
+
1861
+ EIGEN_DEVICE_FUNC inline void
1862
+ ptranspose(PacketBlock<Packet8s,8>& kernel) {
1863
+ Packet8s v[8], sum[8];
1864
+
1865
+ v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
1866
+ v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
1867
+ v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
1868
+ v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
1869
+ v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
1870
+ v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
1871
+ v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
1872
+ v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
1873
+ sum[0] = vec_mergeh(v[0], v[4]);
1874
+ sum[1] = vec_mergel(v[0], v[4]);
1875
+ sum[2] = vec_mergeh(v[1], v[5]);
1876
+ sum[3] = vec_mergel(v[1], v[5]);
1877
+ sum[4] = vec_mergeh(v[2], v[6]);
1878
+ sum[5] = vec_mergel(v[2], v[6]);
1879
+ sum[6] = vec_mergeh(v[3], v[7]);
1880
+ sum[7] = vec_mergel(v[3], v[7]);
1881
+
1882
+ kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
1883
+ kernel.packet[1] = vec_mergel(sum[0], sum[4]);
1884
+ kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
1885
+ kernel.packet[3] = vec_mergel(sum[1], sum[5]);
1886
+ kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
1887
+ kernel.packet[5] = vec_mergel(sum[2], sum[6]);
1888
+ kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
1889
+ kernel.packet[7] = vec_mergel(sum[3], sum[7]);
1890
+ }
1891
+
1892
+ EIGEN_DEVICE_FUNC inline void
1893
+ ptranspose(PacketBlock<Packet8us,8>& kernel) {
1894
+ Packet8us v[8], sum[8];
1895
+
1896
+ v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
1897
+ v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
1898
+ v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
1899
+ v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
1900
+ v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
1901
+ v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
1902
+ v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
1903
+ v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
1904
+ sum[0] = vec_mergeh(v[0], v[4]);
1905
+ sum[1] = vec_mergel(v[0], v[4]);
1906
+ sum[2] = vec_mergeh(v[1], v[5]);
1907
+ sum[3] = vec_mergel(v[1], v[5]);
1908
+ sum[4] = vec_mergeh(v[2], v[6]);
1909
+ sum[5] = vec_mergel(v[2], v[6]);
1910
+ sum[6] = vec_mergeh(v[3], v[7]);
1911
+ sum[7] = vec_mergel(v[3], v[7]);
1912
+
1913
+ kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
1914
+ kernel.packet[1] = vec_mergel(sum[0], sum[4]);
1915
+ kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
1916
+ kernel.packet[3] = vec_mergel(sum[1], sum[5]);
1917
+ kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
1918
+ kernel.packet[5] = vec_mergel(sum[2], sum[6]);
1919
+ kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
1920
+ kernel.packet[7] = vec_mergel(sum[3], sum[7]);
1921
+ }
1922
+
1923
+ EIGEN_DEVICE_FUNC inline void
1924
+ ptranspose(PacketBlock<Packet8bf,8>& kernel) {
1925
+ Packet8bf v[8], sum[8];
1926
+
1927
+ v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val);
1928
+ v[1] = vec_mergel(kernel.packet[0].m_val, kernel.packet[4].m_val);
1929
+ v[2] = vec_mergeh(kernel.packet[1].m_val, kernel.packet[5].m_val);
1930
+ v[3] = vec_mergel(kernel.packet[1].m_val, kernel.packet[5].m_val);
1931
+ v[4] = vec_mergeh(kernel.packet[2].m_val, kernel.packet[6].m_val);
1932
+ v[5] = vec_mergel(kernel.packet[2].m_val, kernel.packet[6].m_val);
1933
+ v[6] = vec_mergeh(kernel.packet[3].m_val, kernel.packet[7].m_val);
1934
+ v[7] = vec_mergel(kernel.packet[3].m_val, kernel.packet[7].m_val);
1935
+ sum[0] = vec_mergeh(v[0].m_val, v[4].m_val);
1936
+ sum[1] = vec_mergel(v[0].m_val, v[4].m_val);
1937
+ sum[2] = vec_mergeh(v[1].m_val, v[5].m_val);
1938
+ sum[3] = vec_mergel(v[1].m_val, v[5].m_val);
1939
+ sum[4] = vec_mergeh(v[2].m_val, v[6].m_val);
1940
+ sum[5] = vec_mergel(v[2].m_val, v[6].m_val);
1941
+ sum[6] = vec_mergeh(v[3].m_val, v[7].m_val);
1942
+ sum[7] = vec_mergel(v[3].m_val, v[7].m_val);
1943
+
1944
+ kernel.packet[0] = vec_mergeh(sum[0].m_val, sum[4].m_val);
1945
+ kernel.packet[1] = vec_mergel(sum[0].m_val, sum[4].m_val);
1946
+ kernel.packet[2] = vec_mergeh(sum[1].m_val, sum[5].m_val);
1947
+ kernel.packet[3] = vec_mergel(sum[1].m_val, sum[5].m_val);
1948
+ kernel.packet[4] = vec_mergeh(sum[2].m_val, sum[6].m_val);
1949
+ kernel.packet[5] = vec_mergel(sum[2].m_val, sum[6].m_val);
1950
+ kernel.packet[6] = vec_mergeh(sum[3].m_val, sum[7].m_val);
1951
+ kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val);
1952
+ }
1953
+
1954
+ EIGEN_DEVICE_FUNC inline void
1955
+ ptranspose(PacketBlock<Packet16c,16>& kernel) {
1956
+ Packet16c step1[16], step2[16], step3[16];
1957
+
1958
+ step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
1959
+ step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
1960
+ step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
1961
+ step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
1962
+ step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
1963
+ step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
1964
+ step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
1965
+ step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
1966
+ step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
1967
+ step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
1968
+ step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
1969
+ step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
1970
+ step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
1971
+ step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
1972
+ step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
1973
+ step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
1974
+
1975
+ step2[0] = vec_mergeh(step1[0], step1[8]);
1976
+ step2[1] = vec_mergel(step1[0], step1[8]);
1977
+ step2[2] = vec_mergeh(step1[1], step1[9]);
1978
+ step2[3] = vec_mergel(step1[1], step1[9]);
1979
+ step2[4] = vec_mergeh(step1[2], step1[10]);
1980
+ step2[5] = vec_mergel(step1[2], step1[10]);
1981
+ step2[6] = vec_mergeh(step1[3], step1[11]);
1982
+ step2[7] = vec_mergel(step1[3], step1[11]);
1983
+ step2[8] = vec_mergeh(step1[4], step1[12]);
1984
+ step2[9] = vec_mergel(step1[4], step1[12]);
1985
+ step2[10] = vec_mergeh(step1[5], step1[13]);
1986
+ step2[11] = vec_mergel(step1[5], step1[13]);
1987
+ step2[12] = vec_mergeh(step1[6], step1[14]);
1988
+ step2[13] = vec_mergel(step1[6], step1[14]);
1989
+ step2[14] = vec_mergeh(step1[7], step1[15]);
1990
+ step2[15] = vec_mergel(step1[7], step1[15]);
1991
+
1992
+ step3[0] = vec_mergeh(step2[0], step2[8]);
1993
+ step3[1] = vec_mergel(step2[0], step2[8]);
1994
+ step3[2] = vec_mergeh(step2[1], step2[9]);
1995
+ step3[3] = vec_mergel(step2[1], step2[9]);
1996
+ step3[4] = vec_mergeh(step2[2], step2[10]);
1997
+ step3[5] = vec_mergel(step2[2], step2[10]);
1998
+ step3[6] = vec_mergeh(step2[3], step2[11]);
1999
+ step3[7] = vec_mergel(step2[3], step2[11]);
2000
+ step3[8] = vec_mergeh(step2[4], step2[12]);
2001
+ step3[9] = vec_mergel(step2[4], step2[12]);
2002
+ step3[10] = vec_mergeh(step2[5], step2[13]);
2003
+ step3[11] = vec_mergel(step2[5], step2[13]);
2004
+ step3[12] = vec_mergeh(step2[6], step2[14]);
2005
+ step3[13] = vec_mergel(step2[6], step2[14]);
2006
+ step3[14] = vec_mergeh(step2[7], step2[15]);
2007
+ step3[15] = vec_mergel(step2[7], step2[15]);
2008
+
2009
+ kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
2010
+ kernel.packet[1] = vec_mergel(step3[0], step3[8]);
2011
+ kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
2012
+ kernel.packet[3] = vec_mergel(step3[1], step3[9]);
2013
+ kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
2014
+ kernel.packet[5] = vec_mergel(step3[2], step3[10]);
2015
+ kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
2016
+ kernel.packet[7] = vec_mergel(step3[3], step3[11]);
2017
+ kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
2018
+ kernel.packet[9] = vec_mergel(step3[4], step3[12]);
2019
+ kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
2020
+ kernel.packet[11] = vec_mergel(step3[5], step3[13]);
2021
+ kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
2022
+ kernel.packet[13] = vec_mergel(step3[6], step3[14]);
2023
+ kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
2024
+ kernel.packet[15] = vec_mergel(step3[7], step3[15]);
2025
+ }
2026
+
2027
+ EIGEN_DEVICE_FUNC inline void
2028
+ ptranspose(PacketBlock<Packet16uc,16>& kernel) {
2029
+ Packet16uc step1[16], step2[16], step3[16];
2030
+
2031
+ step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
2032
+ step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
2033
+ step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
2034
+ step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
2035
+ step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
2036
+ step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
2037
+ step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
2038
+ step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
2039
+ step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
2040
+ step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
2041
+ step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
2042
+ step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
2043
+ step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
2044
+ step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
2045
+ step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
2046
+ step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
2047
+
2048
+ step2[0] = vec_mergeh(step1[0], step1[8]);
2049
+ step2[1] = vec_mergel(step1[0], step1[8]);
2050
+ step2[2] = vec_mergeh(step1[1], step1[9]);
2051
+ step2[3] = vec_mergel(step1[1], step1[9]);
2052
+ step2[4] = vec_mergeh(step1[2], step1[10]);
2053
+ step2[5] = vec_mergel(step1[2], step1[10]);
2054
+ step2[6] = vec_mergeh(step1[3], step1[11]);
2055
+ step2[7] = vec_mergel(step1[3], step1[11]);
2056
+ step2[8] = vec_mergeh(step1[4], step1[12]);
2057
+ step2[9] = vec_mergel(step1[4], step1[12]);
2058
+ step2[10] = vec_mergeh(step1[5], step1[13]);
2059
+ step2[11] = vec_mergel(step1[5], step1[13]);
2060
+ step2[12] = vec_mergeh(step1[6], step1[14]);
2061
+ step2[13] = vec_mergel(step1[6], step1[14]);
2062
+ step2[14] = vec_mergeh(step1[7], step1[15]);
2063
+ step2[15] = vec_mergel(step1[7], step1[15]);
2064
+
2065
+ step3[0] = vec_mergeh(step2[0], step2[8]);
2066
+ step3[1] = vec_mergel(step2[0], step2[8]);
2067
+ step3[2] = vec_mergeh(step2[1], step2[9]);
2068
+ step3[3] = vec_mergel(step2[1], step2[9]);
2069
+ step3[4] = vec_mergeh(step2[2], step2[10]);
2070
+ step3[5] = vec_mergel(step2[2], step2[10]);
2071
+ step3[6] = vec_mergeh(step2[3], step2[11]);
2072
+ step3[7] = vec_mergel(step2[3], step2[11]);
2073
+ step3[8] = vec_mergeh(step2[4], step2[12]);
2074
+ step3[9] = vec_mergel(step2[4], step2[12]);
2075
+ step3[10] = vec_mergeh(step2[5], step2[13]);
2076
+ step3[11] = vec_mergel(step2[5], step2[13]);
2077
+ step3[12] = vec_mergeh(step2[6], step2[14]);
2078
+ step3[13] = vec_mergel(step2[6], step2[14]);
2079
+ step3[14] = vec_mergeh(step2[7], step2[15]);
2080
+ step3[15] = vec_mergel(step2[7], step2[15]);
2081
+
2082
+ kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
2083
+ kernel.packet[1] = vec_mergel(step3[0], step3[8]);
2084
+ kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
2085
+ kernel.packet[3] = vec_mergel(step3[1], step3[9]);
2086
+ kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
2087
+ kernel.packet[5] = vec_mergel(step3[2], step3[10]);
2088
+ kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
2089
+ kernel.packet[7] = vec_mergel(step3[3], step3[11]);
2090
+ kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
2091
+ kernel.packet[9] = vec_mergel(step3[4], step3[12]);
2092
+ kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
2093
+ kernel.packet[11] = vec_mergel(step3[5], step3[13]);
2094
+ kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
2095
+ kernel.packet[13] = vec_mergel(step3[6], step3[14]);
2096
+ kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
2097
+ kernel.packet[15] = vec_mergel(step3[7], step3[15]);
2098
+ }
2099
+
2100
+ template<typename Packet> EIGEN_STRONG_INLINE
2101
+ Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
2102
+ Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
2103
+ Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
2104
+ return vec_sel(elsePacket, thenPacket, mask);
2105
+ }
2106
+
2107
+ template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
2108
+ return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket);
2109
+ }
2110
+
2111
+ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
2112
+ return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket);
2113
+ }
2114
+
2115
+ template<> EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket, const Packet8s& elsePacket) {
2116
+ Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
2117
+ ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
2118
+ Packet8us mask = reinterpret_cast<Packet8us>(vec_cmpeq(select, p8us_ONE));
2119
+ Packet8s result = vec_sel(elsePacket, thenPacket, mask);
2120
+ return result;
2121
+ }
2122
+
2123
+ template<> EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket, const Packet8us& elsePacket) {
2124
+ Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
2125
+ ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
2126
+ Packet8us mask = reinterpret_cast<Packet8us>(vec_cmpeq(reinterpret_cast<Packet8us>(select), p8us_ONE));
2127
+ return vec_sel(elsePacket, thenPacket, mask);
2128
+ }
2129
+
2130
+ template<> EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket, const Packet8bf& elsePacket) {
2131
+ return pblend<Packet8us>(ifPacket, thenPacket, elsePacket);
2132
+ }
2133
+
2134
+ template<> EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket, const Packet16c& elsePacket) {
2135
+ Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
2136
+ ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
2137
+ ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
2138
+ ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
2139
+
2140
+ Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE));
2141
+ return vec_sel(elsePacket, thenPacket, mask);
2142
+ }
2143
+
2144
+ template<> EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket, const Packet16uc& elsePacket) {
2145
+ Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
2146
+ ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
2147
+ ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
2148
+ ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
2149
+
2150
+ Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE));
2151
+ return vec_sel(elsePacket, thenPacket, mask);
2152
+ }
2153
+
2154
+ template <>
2155
+ struct type_casting_traits<float, int> {
2156
+ enum {
2157
+ VectorizedCast = 1,
2158
+ SrcCoeffRatio = 1,
2159
+ TgtCoeffRatio = 1
2160
+ };
2161
+ };
2162
+
2163
+ template <>
2164
+ struct type_casting_traits<int, float> {
2165
+ enum {
2166
+ VectorizedCast = 1,
2167
+ SrcCoeffRatio = 1,
2168
+ TgtCoeffRatio = 1
2169
+ };
2170
+ };
2171
+
2172
+ template <>
2173
+ struct type_casting_traits<bfloat16, unsigned short int> {
2174
+ enum {
2175
+ VectorizedCast = 1,
2176
+ SrcCoeffRatio = 1,
2177
+ TgtCoeffRatio = 1
2178
+ };
2179
+ };
2180
+
2181
+ template <>
2182
+ struct type_casting_traits<unsigned short int, bfloat16> {
2183
+ enum {
2184
+ VectorizedCast = 1,
2185
+ SrcCoeffRatio = 1,
2186
+ TgtCoeffRatio = 1
2187
+ };
2188
+ };
2189
+
2190
+ template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
2191
+ return vec_cts(a,0);
2192
+ }
2193
+
2194
+ template<> EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f, Packet4ui>(const Packet4f& a) {
2195
+ return vec_ctu(a,0);
2196
+ }
2197
+
2198
+ template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
2199
+ return vec_ctf(a,0);
2200
+ }
2201
+
2202
+ template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui, Packet4f>(const Packet4ui& a) {
2203
+ return vec_ctf(a,0);
2204
+ }
2205
+
2206
+ template<> EIGEN_STRONG_INLINE Packet8us pcast<Packet8bf, Packet8us>(const Packet8bf& a) {
2207
+ Packet4f float_even = Bf16ToF32Even(a);
2208
+ Packet4f float_odd = Bf16ToF32Odd(a);
2209
+ Packet4ui int_even = pcast<Packet4f, Packet4ui>(float_even);
2210
+ Packet4ui int_odd = pcast<Packet4f, Packet4ui>(float_odd);
2211
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF);
2212
+ Packet4ui low_even = pand<Packet4ui>(int_even, p4ui_low_mask);
2213
+ Packet4ui low_odd = pand<Packet4ui>(int_odd, p4ui_low_mask);
2214
+
2215
+ //Check values that are bigger than USHRT_MAX (0xFFFF)
2216
+ Packet4bi overflow_selector;
2217
+ if(vec_any_gt(int_even, p4ui_low_mask)){
2218
+ overflow_selector = vec_cmpgt(int_even, p4ui_low_mask);
2219
+ low_even = vec_sel(low_even, p4ui_low_mask, overflow_selector);
2220
+ }
2221
+ if(vec_any_gt(int_odd, p4ui_low_mask)){
2222
+ overflow_selector = vec_cmpgt(int_odd, p4ui_low_mask);
2223
+ low_odd = vec_sel(low_even, p4ui_low_mask, overflow_selector);
2224
+ }
2225
+
2226
+ low_odd = plogical_shift_left<16>(low_odd);
2227
+
2228
+ Packet4ui int_final = por<Packet4ui>(low_even, low_odd);
2229
+ return reinterpret_cast<Packet8us>(int_final);
2230
+ }
2231
+
2232
+ template<> EIGEN_STRONG_INLINE Packet8bf pcast<Packet8us, Packet8bf>(const Packet8us& a) {
2233
+ //short -> int -> float -> bfloat16
2234
+ const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF);
2235
+ Packet4ui int_cast = reinterpret_cast<Packet4ui>(a);
2236
+ Packet4ui int_even = pand<Packet4ui>(int_cast, p4ui_low_mask);
2237
+ Packet4ui int_odd = plogical_shift_right<16>(int_cast);
2238
+ Packet4f float_even = pcast<Packet4ui, Packet4f>(int_even);
2239
+ Packet4f float_odd = pcast<Packet4ui, Packet4f>(int_odd);
2240
+ return F32ToBf16(float_even, float_odd);
2241
+ }
2242
+
2243
+
2244
+ template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
2245
+ return reinterpret_cast<Packet4i>(a);
2246
+ }
2247
+
2248
+ template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
2249
+ return reinterpret_cast<Packet4f>(a);
2250
+ }
2251
+
2252
+
2253
+
2254
+ //---------- double ----------
2255
+ #ifdef __VSX__
2256
+ typedef __vector double Packet2d;
2257
+ typedef __vector unsigned long long Packet2ul;
2258
+ typedef __vector long long Packet2l;
2259
+ #if EIGEN_COMP_CLANG
2260
+ typedef Packet2ul Packet2bl;
2261
+ #else
2262
+ typedef __vector __bool long Packet2bl;
2263
+ #endif
2264
+
2265
+ static Packet2l p2l_ONE = { 1, 1 }; // integer one in both 64-bit lanes
2266
+ static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO); // bit pattern of p4i_ZERO viewed as two 64-bit lanes
2267
+ static Packet2ul p2ul_SIGN = { 0x8000000000000000ull, 0x8000000000000000ull }; // only the top bit of each 64-bit lane set (used by pround to extract the sign)
2268
+ static Packet2ul p2ul_PREV0DOT5 = { 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull }; // one less than 0x3FE0000000000000, the bit pattern of 0.5 as a double
2269
+ static Packet2d p2d_ONE = { 1.0, 1.0 };
2270
+ static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO); // bit pattern of p4f_ZERO viewed as two doubles
2271
+ static Packet2d p2d_MZERO = { numext::bit_cast<double>(0x8000000000000000ull),
2272
+ numext::bit_cast<double>(0x8000000000000000ull) }; // doubles with only the sign bit set (negative zero)
2273
+
2274
+ #ifdef _BIG_ENDIAN
2275
+ static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8)); // added to a splat value by plset; presumably {0.0, 1.0} — TODO confirm
2276
+ #else
2277
+ static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8)); // operands swapped for the little-endian lane order
2278
+ #endif
2279
+
2280
+ template<int index> Packet2d vec_splat_dbl(Packet2d& a)
2281
+ {
2282
+ return vec_splat(a, index);
2283
+ }
2284
+
2285
+ template<> struct packet_traits<double> : default_packet_traits
2286
+ {
2287
+ typedef Packet2d type;
2288
+ typedef Packet2d half;
2289
+ enum {
2290
+ Vectorizable = 1,
2291
+ AlignedOnScalar = 1,
2292
+ size=2,
2293
+ HasHalfPacket = 1,
2294
+
2295
+ HasAdd = 1,
2296
+ HasSub = 1,
2297
+ HasMul = 1,
2298
+ HasDiv = 1,
2299
+ HasMin = 1,
2300
+ HasMax = 1,
2301
+ HasAbs = 1,
2302
+ HasSin = 0,
2303
+ HasCos = 0,
2304
+ HasLog = 0,
2305
+ HasExp = 1,
2306
+ HasSqrt = 1,
2307
+ HasRsqrt = 1,
2308
+ HasRound = 1,
2309
+ HasFloor = 1,
2310
+ HasCeil = 1,
2311
+ HasRint = 1,
2312
+ HasNegate = 1,
2313
+ HasBlend = 1
2314
+ };
2315
+ };
2316
+
2317
+ template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; };
2318
+
2319
+ inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
2320
+ {
2321
+ union {
2322
+ Packet2l v;
2323
+ int64_t n[2];
2324
+ } vt;
2325
+ vt.v = v;
2326
+ s << vt.n[0] << ", " << vt.n[1];
2327
+ return s;
2328
+ }
2329
+
2330
+ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
2331
+ {
2332
+ union {
2333
+ Packet2d v;
2334
+ double n[2];
2335
+ } vt;
2336
+ vt.v = v;
2337
+ s << vt.n[0] << ", " << vt.n[1];
2338
+ return s;
2339
+ }
2340
+
2341
+ // Need to define them first or we get specialization after instantiation errors
2342
+ template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
2343
+ {
2344
+ EIGEN_DEBUG_ALIGNED_LOAD
2345
+ return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang
2346
+ }
2347
+
2348
+ template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
2349
+ {
2350
+ EIGEN_DEBUG_ALIGNED_STORE
2351
+ vec_xst(from, 0, to);
2352
+ }
2353
+
2354
+ template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
2355
+ Packet2d v = {from, from};
2356
+ return v;
2357
+ }
2358
+
2359
+ template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
2360
+ Packet2l v = {static_cast<long long>(from), static_cast<long long>(from)};
2361
+ return reinterpret_cast<Packet2d>(v);
2362
+ }
2363
+
2364
+ template<> EIGEN_STRONG_INLINE void
2365
+ pbroadcast4<Packet2d>(const double *a,
2366
+ Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
2367
+ {
2368
+ //This way is faster than vec_splat (at least for doubles in Power 9)
2369
+ a0 = pset1<Packet2d>(a[0]);
2370
+ a1 = pset1<Packet2d>(a[1]);
2371
+ a2 = pset1<Packet2d>(a[2]);
2372
+ a3 = pset1<Packet2d>(a[3]);
2373
+ }
2374
+
2375
+ template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
2376
+ {
2377
+ EIGEN_ALIGN16 double af[2];
2378
+ af[0] = from[0*stride];
2379
+ af[1] = from[1*stride];
2380
+ return pload<Packet2d>(af);
2381
+ }
2382
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
2383
+ {
2384
+ EIGEN_ALIGN16 double af[2];
2385
+ pstore<double>(af, from);
2386
+ to[0*stride] = af[0];
2387
+ to[1*stride] = af[1];
2388
+ }
2389
+
2390
+ template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; }
2391
+
2392
+ template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return a + b; }
2393
+
2394
+ template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }
2395
+
2396
+ template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; }
2397
+
2398
+ template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
2399
+
2400
+ template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_MZERO); }
2401
+ template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); }
2402
+
2403
+ // for some weird reason, it has to be overloaded for packets of integers
2404
+ template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
2405
+
2406
+ template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
2407
+ { // Lane-wise minimum of a and b, implemented with two inline VSX instructions.
2408
+ // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
2409
+ Packet2d ret; // bound as early-clobber ("=&wa") so it gets a register distinct from a and b
2410
+ __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); // xvcmpgedp writes the (a >= b) compare result into ret; xxsel then selects between a and b using ret as the mask
2411
+ return ret;
2412
+ }
2413
+
2414
+ template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
2415
+ { // Lane-wise maximum of a and b, implemented with two inline VSX instructions.
2416
+ // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
2417
+ Packet2d ret; // bound as early-clobber ("=&wa") so it gets a register distinct from a and b
2418
+ __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); // xvcmpgtdp writes the (b > a) compare result into ret (note the swapped operands); xxsel then selects between a and b using ret as the mask
2419
+ return ret;
2420
+ }
2421
+
2422
+ template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmple(a,b)); }
2423
+ template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmplt(a,b)); }
2424
+ template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmpeq(a,b)); }
2425
+ template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
2426
+ Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a,b));
2427
+ return vec_nor(c,c);
2428
+ }
2429
+
2430
+ template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
2431
+
2432
+ template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
2433
+
2434
+ template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
2435
+
2436
+ template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
2437
+
2438
+ template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
2439
+ { // Rounds each lane by adding a sign-matched offset to a and then applying xvrdpiz.
2440
+ Packet2d t = vec_add(reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a); // t = a + (sign bit of a OR-ed with the bits of p2ul_PREV0DOT5)
2441
+ Packet2d res;
2442
+
2443
+ __asm__("xvrdpiz %x0, %x1\n\t" // NOTE(review): xvrdpiz should be the VSX round-to-integer-toward-zero instruction — confirm against the Power ISA
2444
+ : "=&wa" (res)
2445
+ : "wa" (t));
2446
+
2447
+ return res;
2448
+ }
2449
+ template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); }
2450
+ template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
2451
+ template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a)
2452
+ {
2453
+ Packet2d res;
2454
+
2455
+ __asm__("xvrdpic %x0, %x1\n\t"
2456
+ : "=&wa" (res)
2457
+ : "wa" (a));
2458
+
2459
+ return res;
2460
+ }
2461
+
2462
+ template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
2463
+ {
2464
+ EIGEN_DEBUG_UNALIGNED_LOAD
2465
+ return vec_xl(0, const_cast<double*>(from));
2466
+ }
2467
+
2468
+ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
2469
+ {
2470
+ Packet2d p;
2471
+ if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet2d>(from);
2472
+ else p = ploadu<Packet2d>(from);
2473
+ return vec_splat_dbl<0>(p);
2474
+ }
2475
+
2476
+ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
2477
+ {
2478
+ EIGEN_DEBUG_UNALIGNED_STORE
2479
+ vec_xst(from, 0, to);
2480
+ }
2481
+
2482
+ template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
2483
+
2484
+ template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }
2485
+
2486
+ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
2487
+ {
2488
+ return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
2489
+ }
2490
+ template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
2491
+
2492
+ // VSX support varies between different compilers and even different
2493
+ // versions of the same compiler. For gcc version >= 4.9.3, we can use
2494
+ // vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use
2495
+ // a slow version that works with older compilers.
2496
+ // Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
2497
+ // are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
2498
+ template<>
2499
+ inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x) {
2500
+ #if EIGEN_GNUC_AT_LEAST(5, 4) || \
2501
+ (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
2502
+ return vec_cts(x, 0); // TODO: check clang version.
2503
+ #else
2504
+ double tmp[2];
2505
+ memcpy(tmp, &x, sizeof(tmp));
2506
+ Packet2l l = { static_cast<long long>(tmp[0]),
2507
+ static_cast<long long>(tmp[1]) };
2508
+ return l;
2509
+ #endif
2510
+ }
2511
+
2512
+ template<>
2513
+ inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x) {
2514
+ unsigned long long tmp[2];
2515
+ memcpy(tmp, &x, sizeof(tmp));
2516
+ Packet2d d = { static_cast<double>(tmp[0]),
2517
+ static_cast<double>(tmp[1]) };
2518
+ return d;
2519
+ }
2520
+
2521
+
2522
+ // Packet2l shifts.
2523
+ // For POWER8 we simply use vec_sr/l.
2524
+ //
2525
+ // Things are more complicated for POWER7. There is actually a
2526
+ // vec_xxsxdi intrinsic but it is not supported by some gcc versions.
2527
+ // So we need to shift by N % 32 and rearrange bytes.
2528
+ #ifdef __POWER8_VECTOR__
2529
+
2530
+ template<int N>
2531
+ EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
2532
+ const Packet2ul shift = { N, N };
2533
+ return vec_sl(a, shift);
2534
+ }
2535
+
2536
+ template<int N>
2537
+ EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
2538
+ const Packet2ul shift = { N, N };
2539
+ return vec_sr(a, shift);
2540
+ }
2541
+
2542
+ #else
2543
+
2544
+ // Shifts [A, B, C, D] to [B, 0, D, 0].
2545
+ // Used to implement left shifts for Packet2l.
2546
+ EIGEN_ALWAYS_INLINE Packet4i shift_even_left(const Packet4i& a) {
2547
+ static const Packet16uc perm = {
2548
+ 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
2549
+ 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
2550
+ #ifdef _BIG_ENDIAN
2551
+ return vec_perm(p4i_ZERO, a, perm);
2552
+ #else
2553
+ return vec_perm(a, p4i_ZERO, perm);
2554
+ #endif
2555
+ }
2556
+
2557
+ // Shifts [A, B, C, D] to [0, A, 0, C].
2558
+ // Used to implement right shifts for Packet2l.
2559
+ EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(const Packet4i& a) {
2560
+ static const Packet16uc perm = {
2561
+ 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
2562
+ 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b };
2563
+ #ifdef _BIG_ENDIAN
2564
+ return vec_perm(p4i_ZERO, a, perm);
2565
+ #else
2566
+ return vec_perm(a, p4i_ZERO, perm);
2567
+ #endif
2568
+ }
2569
+
2570
+ template<int N, typename EnableIf = void>
2571
+ struct plogical_shift_left_impl;
2572
+
2573
+ template<int N>
2574
+ struct plogical_shift_left_impl<N, typename enable_if<(N < 32) && (N >= 0)>::type> {
2575
+ static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
2576
+ static const unsigned n = static_cast<unsigned>(N);
2577
+ const Packet4ui shift = {n, n, n, n};
2578
+ const Packet4i ai = reinterpret_cast<Packet4i>(a);
2579
+ static const unsigned m = static_cast<unsigned>(32 - N);
2580
+ const Packet4ui shift_right = {m, m, m, m};
2581
+ const Packet4i out_hi = vec_sl(ai, shift);
2582
+ const Packet4i out_lo = shift_even_left(vec_sr(ai, shift_right));
2583
+ return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
2584
+ }
2585
+ };
2586
+
2587
+ template<int N>
2588
+ struct plogical_shift_left_impl<N, typename enable_if<(N >= 32)>::type> {
2589
+ static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
2590
+ static const unsigned m = static_cast<unsigned>(N - 32);
2591
+ const Packet4ui shift = {m, m, m, m};
2592
+ const Packet4i ai = reinterpret_cast<Packet4i>(a);
2593
+ return reinterpret_cast<Packet2l>(shift_even_left(vec_sl(ai, shift)));
2594
+ }
2595
+ };
2596
+
2597
+ template<int N>
2598
+ EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
2599
+ return plogical_shift_left_impl<N>::run(a);
2600
+ }
2601
+
2602
+ template<int N, typename EnableIf = void>
2603
+ struct plogical_shift_right_impl;
2604
+
2605
+ template<int N>
2606
+ struct plogical_shift_right_impl<N, typename enable_if<(N < 32) && (N >= 0)>::type> {
2607
+ static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
2608
+ static const unsigned n = static_cast<unsigned>(N);
2609
+ const Packet4ui shift = {n, n, n, n};
2610
+ const Packet4i ai = reinterpret_cast<Packet4i>(a);
2611
+ static const unsigned m = static_cast<unsigned>(32 - N);
2612
+ const Packet4ui shift_left = {m, m, m, m};
2613
+ const Packet4i out_lo = vec_sr(ai, shift);
2614
+ const Packet4i out_hi = shift_odd_right(vec_sl(ai, shift_left));
2615
+ return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
2616
+ }
2617
+ };
2618
+
2619
+ template<int N>
2620
+ struct plogical_shift_right_impl<N, typename enable_if<(N >= 32)>::type> {
2621
+ static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
2622
+ static const unsigned m = static_cast<unsigned>(N - 32);
2623
+ const Packet4ui shift = {m, m, m, m};
2624
+ const Packet4i ai = reinterpret_cast<Packet4i>(a);
2625
+ return reinterpret_cast<Packet2l>(shift_odd_right(vec_sr(ai, shift)));
2626
+ }
2627
+ };
2628
+
2629
+ template<int N>
2630
+ EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
2631
+ return plogical_shift_right_impl<N>::run(a);
2632
+ }
2633
+ #endif
2634
+
2635
+ template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
2636
+ // Clamp exponent to [-2099, 2099]
2637
+ const Packet2d max_exponent = pset1<Packet2d>(2099.0);
2638
+ const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
2639
+
2640
+ // Split 2^e into four factors and multiply:
2641
+ const Packet2l bias = { 1023, 1023 };
2642
+ Packet2l b = plogical_shift_right<2>(e); // floor(e/4)
2643
+ Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
2644
+ Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
2645
+ b = psub(psub(psub(e, b), b), b); // e - 3b
2646
+ c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); // 2^(e - 3b)
2647
+ out = pmul(out, c); // a * 2^e
2648
+ return out;
2649
+ }
2650
+
2651
+
2652
+ // Extract exponent without existence of Packet2l.
2653
+ template<>
2654
+ EIGEN_STRONG_INLINE
2655
+ Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
2656
+ return pcast<Packet2l, Packet2d>(plogical_shift_right<52>(reinterpret_cast<Packet2l>(pabs(a))));
2657
+ }
2658
+
2659
+ template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d> (const Packet2d& a, Packet2d& exponent) {
2660
+ return pfrexp_generic(a, exponent);
2661
+ }
2662
+
2663
+ template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
2664
+ {
2665
+ Packet2d b, sum;
2666
+ b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
2667
+ sum = a + b;
2668
+ return pfirst<Packet2d>(sum);
2669
+ }
2670
+
2671
+ // Other reduction functions:
2672
+ // mul
2673
+ template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
2674
+ {
2675
+ return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
2676
+ }
2677
+
2678
+ // min
2679
+ template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
2680
+ {
2681
+ return pfirst(pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
2682
+ }
2683
+
2684
+ // max
2685
+ template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
2686
+ {
2687
+ return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
2688
+ }
2689
+
2690
+ EIGEN_DEVICE_FUNC inline void
2691
+ ptranspose(PacketBlock<Packet2d,2>& kernel) {
2692
+ Packet2d t0, t1;
2693
+ t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
2694
+ t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
2695
+ kernel.packet[0] = t0;
2696
+ kernel.packet[1] = t1;
2697
+ }
2698
+
2699
+ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
2700
+ Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
2701
+ Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
2702
+ return vec_sel(elsePacket, thenPacket, mask);
2703
+ }
2704
+
2705
+
2706
+ #endif // __VSX__
2707
+ } // end namespace internal
2708
+
2709
+ } // end namespace Eigen
2710
+
2711
+ #endif // EIGEN_PACKET_MATH_ALTIVEC_H