tomoto 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (420) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +123 -0
  5. data/ext/tomoto/ext.cpp +245 -0
  6. data/ext/tomoto/extconf.rb +28 -0
  7. data/lib/tomoto.rb +12 -0
  8. data/lib/tomoto/ct.rb +11 -0
  9. data/lib/tomoto/hdp.rb +11 -0
  10. data/lib/tomoto/lda.rb +67 -0
  11. data/lib/tomoto/version.rb +3 -0
  12. data/vendor/EigenRand/EigenRand/Core.h +1139 -0
  13. data/vendor/EigenRand/EigenRand/Dists/Basic.h +111 -0
  14. data/vendor/EigenRand/EigenRand/Dists/Discrete.h +877 -0
  15. data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +108 -0
  16. data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +626 -0
  17. data/vendor/EigenRand/EigenRand/EigenRand +19 -0
  18. data/vendor/EigenRand/EigenRand/Macro.h +24 -0
  19. data/vendor/EigenRand/EigenRand/MorePacketMath.h +978 -0
  20. data/vendor/EigenRand/EigenRand/PacketFilter.h +286 -0
  21. data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +624 -0
  22. data/vendor/EigenRand/EigenRand/RandUtils.h +413 -0
  23. data/vendor/EigenRand/EigenRand/doc.h +220 -0
  24. data/vendor/EigenRand/LICENSE +21 -0
  25. data/vendor/EigenRand/README.md +288 -0
  26. data/vendor/eigen/COPYING.BSD +26 -0
  27. data/vendor/eigen/COPYING.GPL +674 -0
  28. data/vendor/eigen/COPYING.LGPL +502 -0
  29. data/vendor/eigen/COPYING.MINPACK +52 -0
  30. data/vendor/eigen/COPYING.MPL2 +373 -0
  31. data/vendor/eigen/COPYING.README +18 -0
  32. data/vendor/eigen/Eigen/CMakeLists.txt +19 -0
  33. data/vendor/eigen/Eigen/Cholesky +46 -0
  34. data/vendor/eigen/Eigen/CholmodSupport +48 -0
  35. data/vendor/eigen/Eigen/Core +537 -0
  36. data/vendor/eigen/Eigen/Dense +7 -0
  37. data/vendor/eigen/Eigen/Eigen +2 -0
  38. data/vendor/eigen/Eigen/Eigenvalues +61 -0
  39. data/vendor/eigen/Eigen/Geometry +62 -0
  40. data/vendor/eigen/Eigen/Householder +30 -0
  41. data/vendor/eigen/Eigen/IterativeLinearSolvers +48 -0
  42. data/vendor/eigen/Eigen/Jacobi +33 -0
  43. data/vendor/eigen/Eigen/LU +50 -0
  44. data/vendor/eigen/Eigen/MetisSupport +35 -0
  45. data/vendor/eigen/Eigen/OrderingMethods +73 -0
  46. data/vendor/eigen/Eigen/PaStiXSupport +48 -0
  47. data/vendor/eigen/Eigen/PardisoSupport +35 -0
  48. data/vendor/eigen/Eigen/QR +51 -0
  49. data/vendor/eigen/Eigen/QtAlignedMalloc +40 -0
  50. data/vendor/eigen/Eigen/SPQRSupport +34 -0
  51. data/vendor/eigen/Eigen/SVD +51 -0
  52. data/vendor/eigen/Eigen/Sparse +36 -0
  53. data/vendor/eigen/Eigen/SparseCholesky +45 -0
  54. data/vendor/eigen/Eigen/SparseCore +69 -0
  55. data/vendor/eigen/Eigen/SparseLU +46 -0
  56. data/vendor/eigen/Eigen/SparseQR +37 -0
  57. data/vendor/eigen/Eigen/StdDeque +27 -0
  58. data/vendor/eigen/Eigen/StdList +26 -0
  59. data/vendor/eigen/Eigen/StdVector +27 -0
  60. data/vendor/eigen/Eigen/SuperLUSupport +64 -0
  61. data/vendor/eigen/Eigen/UmfPackSupport +40 -0
  62. data/vendor/eigen/Eigen/src/Cholesky/LDLT.h +673 -0
  63. data/vendor/eigen/Eigen/src/Cholesky/LLT.h +542 -0
  64. data/vendor/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +99 -0
  65. data/vendor/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +639 -0
  66. data/vendor/eigen/Eigen/src/Core/Array.h +329 -0
  67. data/vendor/eigen/Eigen/src/Core/ArrayBase.h +226 -0
  68. data/vendor/eigen/Eigen/src/Core/ArrayWrapper.h +209 -0
  69. data/vendor/eigen/Eigen/src/Core/Assign.h +90 -0
  70. data/vendor/eigen/Eigen/src/Core/AssignEvaluator.h +935 -0
  71. data/vendor/eigen/Eigen/src/Core/Assign_MKL.h +178 -0
  72. data/vendor/eigen/Eigen/src/Core/BandMatrix.h +353 -0
  73. data/vendor/eigen/Eigen/src/Core/Block.h +452 -0
  74. data/vendor/eigen/Eigen/src/Core/BooleanRedux.h +164 -0
  75. data/vendor/eigen/Eigen/src/Core/CommaInitializer.h +160 -0
  76. data/vendor/eigen/Eigen/src/Core/ConditionEstimator.h +175 -0
  77. data/vendor/eigen/Eigen/src/Core/CoreEvaluators.h +1688 -0
  78. data/vendor/eigen/Eigen/src/Core/CoreIterators.h +127 -0
  79. data/vendor/eigen/Eigen/src/Core/CwiseBinaryOp.h +184 -0
  80. data/vendor/eigen/Eigen/src/Core/CwiseNullaryOp.h +866 -0
  81. data/vendor/eigen/Eigen/src/Core/CwiseTernaryOp.h +197 -0
  82. data/vendor/eigen/Eigen/src/Core/CwiseUnaryOp.h +103 -0
  83. data/vendor/eigen/Eigen/src/Core/CwiseUnaryView.h +128 -0
  84. data/vendor/eigen/Eigen/src/Core/DenseBase.h +611 -0
  85. data/vendor/eigen/Eigen/src/Core/DenseCoeffsBase.h +681 -0
  86. data/vendor/eigen/Eigen/src/Core/DenseStorage.h +570 -0
  87. data/vendor/eigen/Eigen/src/Core/Diagonal.h +260 -0
  88. data/vendor/eigen/Eigen/src/Core/DiagonalMatrix.h +343 -0
  89. data/vendor/eigen/Eigen/src/Core/DiagonalProduct.h +28 -0
  90. data/vendor/eigen/Eigen/src/Core/Dot.h +318 -0
  91. data/vendor/eigen/Eigen/src/Core/EigenBase.h +159 -0
  92. data/vendor/eigen/Eigen/src/Core/ForceAlignedAccess.h +146 -0
  93. data/vendor/eigen/Eigen/src/Core/Fuzzy.h +155 -0
  94. data/vendor/eigen/Eigen/src/Core/GeneralProduct.h +455 -0
  95. data/vendor/eigen/Eigen/src/Core/GenericPacketMath.h +593 -0
  96. data/vendor/eigen/Eigen/src/Core/GlobalFunctions.h +187 -0
  97. data/vendor/eigen/Eigen/src/Core/IO.h +225 -0
  98. data/vendor/eigen/Eigen/src/Core/Inverse.h +118 -0
  99. data/vendor/eigen/Eigen/src/Core/Map.h +171 -0
  100. data/vendor/eigen/Eigen/src/Core/MapBase.h +303 -0
  101. data/vendor/eigen/Eigen/src/Core/MathFunctions.h +1415 -0
  102. data/vendor/eigen/Eigen/src/Core/MathFunctionsImpl.h +101 -0
  103. data/vendor/eigen/Eigen/src/Core/Matrix.h +459 -0
  104. data/vendor/eigen/Eigen/src/Core/MatrixBase.h +529 -0
  105. data/vendor/eigen/Eigen/src/Core/NestByValue.h +110 -0
  106. data/vendor/eigen/Eigen/src/Core/NoAlias.h +108 -0
  107. data/vendor/eigen/Eigen/src/Core/NumTraits.h +248 -0
  108. data/vendor/eigen/Eigen/src/Core/PermutationMatrix.h +633 -0
  109. data/vendor/eigen/Eigen/src/Core/PlainObjectBase.h +1035 -0
  110. data/vendor/eigen/Eigen/src/Core/Product.h +186 -0
  111. data/vendor/eigen/Eigen/src/Core/ProductEvaluators.h +1112 -0
  112. data/vendor/eigen/Eigen/src/Core/Random.h +182 -0
  113. data/vendor/eigen/Eigen/src/Core/Redux.h +505 -0
  114. data/vendor/eigen/Eigen/src/Core/Ref.h +283 -0
  115. data/vendor/eigen/Eigen/src/Core/Replicate.h +142 -0
  116. data/vendor/eigen/Eigen/src/Core/ReturnByValue.h +117 -0
  117. data/vendor/eigen/Eigen/src/Core/Reverse.h +211 -0
  118. data/vendor/eigen/Eigen/src/Core/Select.h +162 -0
  119. data/vendor/eigen/Eigen/src/Core/SelfAdjointView.h +352 -0
  120. data/vendor/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +47 -0
  121. data/vendor/eigen/Eigen/src/Core/Solve.h +188 -0
  122. data/vendor/eigen/Eigen/src/Core/SolveTriangular.h +235 -0
  123. data/vendor/eigen/Eigen/src/Core/SolverBase.h +130 -0
  124. data/vendor/eigen/Eigen/src/Core/StableNorm.h +221 -0
  125. data/vendor/eigen/Eigen/src/Core/Stride.h +111 -0
  126. data/vendor/eigen/Eigen/src/Core/Swap.h +67 -0
  127. data/vendor/eigen/Eigen/src/Core/Transpose.h +403 -0
  128. data/vendor/eigen/Eigen/src/Core/Transpositions.h +407 -0
  129. data/vendor/eigen/Eigen/src/Core/TriangularMatrix.h +983 -0
  130. data/vendor/eigen/Eigen/src/Core/VectorBlock.h +96 -0
  131. data/vendor/eigen/Eigen/src/Core/VectorwiseOp.h +695 -0
  132. data/vendor/eigen/Eigen/src/Core/Visitor.h +273 -0
  133. data/vendor/eigen/Eigen/src/Core/arch/AVX/Complex.h +451 -0
  134. data/vendor/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +439 -0
  135. data/vendor/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +637 -0
  136. data/vendor/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +51 -0
  137. data/vendor/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +391 -0
  138. data/vendor/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1316 -0
  139. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +430 -0
  140. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +322 -0
  141. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +1061 -0
  142. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Complex.h +103 -0
  143. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Half.h +674 -0
  144. data/vendor/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +91 -0
  145. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +333 -0
  146. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +1124 -0
  147. data/vendor/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +212 -0
  148. data/vendor/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +29 -0
  149. data/vendor/eigen/Eigen/src/Core/arch/Default/Settings.h +49 -0
  150. data/vendor/eigen/Eigen/src/Core/arch/NEON/Complex.h +490 -0
  151. data/vendor/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +91 -0
  152. data/vendor/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +760 -0
  153. data/vendor/eigen/Eigen/src/Core/arch/SSE/Complex.h +471 -0
  154. data/vendor/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +562 -0
  155. data/vendor/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +895 -0
  156. data/vendor/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +77 -0
  157. data/vendor/eigen/Eigen/src/Core/arch/ZVector/Complex.h +397 -0
  158. data/vendor/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +137 -0
  159. data/vendor/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +945 -0
  160. data/vendor/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +168 -0
  161. data/vendor/eigen/Eigen/src/Core/functors/BinaryFunctors.h +475 -0
  162. data/vendor/eigen/Eigen/src/Core/functors/NullaryFunctors.h +188 -0
  163. data/vendor/eigen/Eigen/src/Core/functors/StlFunctors.h +136 -0
  164. data/vendor/eigen/Eigen/src/Core/functors/TernaryFunctors.h +25 -0
  165. data/vendor/eigen/Eigen/src/Core/functors/UnaryFunctors.h +792 -0
  166. data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2156 -0
  167. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +492 -0
  168. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +311 -0
  169. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +145 -0
  170. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +122 -0
  171. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +619 -0
  172. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +136 -0
  173. data/vendor/eigen/Eigen/src/Core/products/Parallelizer.h +163 -0
  174. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +521 -0
  175. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +287 -0
  176. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +260 -0
  177. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +118 -0
  178. data/vendor/eigen/Eigen/src/Core/products/SelfadjointProduct.h +133 -0
  179. data/vendor/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +93 -0
  180. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +466 -0
  181. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +315 -0
  182. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +350 -0
  183. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +255 -0
  184. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +335 -0
  185. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +163 -0
  186. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverVector.h +145 -0
  187. data/vendor/eigen/Eigen/src/Core/util/BlasUtil.h +398 -0
  188. data/vendor/eigen/Eigen/src/Core/util/Constants.h +547 -0
  189. data/vendor/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +83 -0
  190. data/vendor/eigen/Eigen/src/Core/util/ForwardDeclarations.h +302 -0
  191. data/vendor/eigen/Eigen/src/Core/util/MKL_support.h +130 -0
  192. data/vendor/eigen/Eigen/src/Core/util/Macros.h +1001 -0
  193. data/vendor/eigen/Eigen/src/Core/util/Memory.h +993 -0
  194. data/vendor/eigen/Eigen/src/Core/util/Meta.h +534 -0
  195. data/vendor/eigen/Eigen/src/Core/util/NonMPL2.h +3 -0
  196. data/vendor/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +27 -0
  197. data/vendor/eigen/Eigen/src/Core/util/StaticAssert.h +218 -0
  198. data/vendor/eigen/Eigen/src/Core/util/XprHelper.h +821 -0
  199. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +346 -0
  200. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +459 -0
  201. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +91 -0
  202. data/vendor/eigen/Eigen/src/Eigenvalues/EigenSolver.h +622 -0
  203. data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +418 -0
  204. data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +226 -0
  205. data/vendor/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +374 -0
  206. data/vendor/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +158 -0
  207. data/vendor/eigen/Eigen/src/Eigenvalues/RealQZ.h +654 -0
  208. data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur.h +546 -0
  209. data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +77 -0
  210. data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +870 -0
  211. data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +87 -0
  212. data/vendor/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +556 -0
  213. data/vendor/eigen/Eigen/src/Geometry/AlignedBox.h +392 -0
  214. data/vendor/eigen/Eigen/src/Geometry/AngleAxis.h +247 -0
  215. data/vendor/eigen/Eigen/src/Geometry/EulerAngles.h +114 -0
  216. data/vendor/eigen/Eigen/src/Geometry/Homogeneous.h +497 -0
  217. data/vendor/eigen/Eigen/src/Geometry/Hyperplane.h +282 -0
  218. data/vendor/eigen/Eigen/src/Geometry/OrthoMethods.h +234 -0
  219. data/vendor/eigen/Eigen/src/Geometry/ParametrizedLine.h +195 -0
  220. data/vendor/eigen/Eigen/src/Geometry/Quaternion.h +814 -0
  221. data/vendor/eigen/Eigen/src/Geometry/Rotation2D.h +199 -0
  222. data/vendor/eigen/Eigen/src/Geometry/RotationBase.h +206 -0
  223. data/vendor/eigen/Eigen/src/Geometry/Scaling.h +170 -0
  224. data/vendor/eigen/Eigen/src/Geometry/Transform.h +1542 -0
  225. data/vendor/eigen/Eigen/src/Geometry/Translation.h +208 -0
  226. data/vendor/eigen/Eigen/src/Geometry/Umeyama.h +166 -0
  227. data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +161 -0
  228. data/vendor/eigen/Eigen/src/Householder/BlockHouseholder.h +103 -0
  229. data/vendor/eigen/Eigen/src/Householder/Householder.h +172 -0
  230. data/vendor/eigen/Eigen/src/Householder/HouseholderSequence.h +470 -0
  231. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +226 -0
  232. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +228 -0
  233. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +246 -0
  234. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +400 -0
  235. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +462 -0
  236. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +394 -0
  237. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +216 -0
  238. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +115 -0
  239. data/vendor/eigen/Eigen/src/Jacobi/Jacobi.h +462 -0
  240. data/vendor/eigen/Eigen/src/LU/Determinant.h +101 -0
  241. data/vendor/eigen/Eigen/src/LU/FullPivLU.h +891 -0
  242. data/vendor/eigen/Eigen/src/LU/InverseImpl.h +415 -0
  243. data/vendor/eigen/Eigen/src/LU/PartialPivLU.h +611 -0
  244. data/vendor/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +83 -0
  245. data/vendor/eigen/Eigen/src/LU/arch/Inverse_SSE.h +338 -0
  246. data/vendor/eigen/Eigen/src/MetisSupport/MetisSupport.h +137 -0
  247. data/vendor/eigen/Eigen/src/OrderingMethods/Amd.h +445 -0
  248. data/vendor/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +1843 -0
  249. data/vendor/eigen/Eigen/src/OrderingMethods/Ordering.h +157 -0
  250. data/vendor/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +678 -0
  251. data/vendor/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +543 -0
  252. data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR.h +653 -0
  253. data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +97 -0
  254. data/vendor/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +562 -0
  255. data/vendor/eigen/Eigen/src/QR/FullPivHouseholderQR.h +676 -0
  256. data/vendor/eigen/Eigen/src/QR/HouseholderQR.h +409 -0
  257. data/vendor/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +68 -0
  258. data/vendor/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +313 -0
  259. data/vendor/eigen/Eigen/src/SVD/BDCSVD.h +1246 -0
  260. data/vendor/eigen/Eigen/src/SVD/JacobiSVD.h +804 -0
  261. data/vendor/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +91 -0
  262. data/vendor/eigen/Eigen/src/SVD/SVDBase.h +315 -0
  263. data/vendor/eigen/Eigen/src/SVD/UpperBidiagonalization.h +414 -0
  264. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +689 -0
  265. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +199 -0
  266. data/vendor/eigen/Eigen/src/SparseCore/AmbiVector.h +377 -0
  267. data/vendor/eigen/Eigen/src/SparseCore/CompressedStorage.h +258 -0
  268. data/vendor/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +352 -0
  269. data/vendor/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +67 -0
  270. data/vendor/eigen/Eigen/src/SparseCore/SparseAssign.h +216 -0
  271. data/vendor/eigen/Eigen/src/SparseCore/SparseBlock.h +603 -0
  272. data/vendor/eigen/Eigen/src/SparseCore/SparseColEtree.h +206 -0
  273. data/vendor/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +341 -0
  274. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +726 -0
  275. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +148 -0
  276. data/vendor/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +320 -0
  277. data/vendor/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +138 -0
  278. data/vendor/eigen/Eigen/src/SparseCore/SparseDot.h +98 -0
  279. data/vendor/eigen/Eigen/src/SparseCore/SparseFuzzy.h +29 -0
  280. data/vendor/eigen/Eigen/src/SparseCore/SparseMap.h +305 -0
  281. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrix.h +1403 -0
  282. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +405 -0
  283. data/vendor/eigen/Eigen/src/SparseCore/SparsePermutation.h +178 -0
  284. data/vendor/eigen/Eigen/src/SparseCore/SparseProduct.h +169 -0
  285. data/vendor/eigen/Eigen/src/SparseCore/SparseRedux.h +49 -0
  286. data/vendor/eigen/Eigen/src/SparseCore/SparseRef.h +397 -0
  287. data/vendor/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +656 -0
  288. data/vendor/eigen/Eigen/src/SparseCore/SparseSolverBase.h +124 -0
  289. data/vendor/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +198 -0
  290. data/vendor/eigen/Eigen/src/SparseCore/SparseTranspose.h +92 -0
  291. data/vendor/eigen/Eigen/src/SparseCore/SparseTriangularView.h +189 -0
  292. data/vendor/eigen/Eigen/src/SparseCore/SparseUtil.h +178 -0
  293. data/vendor/eigen/Eigen/src/SparseCore/SparseVector.h +478 -0
  294. data/vendor/eigen/Eigen/src/SparseCore/SparseView.h +253 -0
  295. data/vendor/eigen/Eigen/src/SparseCore/TriangularSolver.h +315 -0
  296. data/vendor/eigen/Eigen/src/SparseLU/SparseLU.h +773 -0
  297. data/vendor/eigen/Eigen/src/SparseLU/SparseLUImpl.h +66 -0
  298. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +226 -0
  299. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +110 -0
  300. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +301 -0
  301. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +80 -0
  302. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +181 -0
  303. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +179 -0
  304. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +107 -0
  305. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +280 -0
  306. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +126 -0
  307. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +130 -0
  308. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +223 -0
  309. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +258 -0
  310. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +137 -0
  311. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +136 -0
  312. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +83 -0
  313. data/vendor/eigen/Eigen/src/SparseQR/SparseQR.h +745 -0
  314. data/vendor/eigen/Eigen/src/StlSupport/StdDeque.h +126 -0
  315. data/vendor/eigen/Eigen/src/StlSupport/StdList.h +106 -0
  316. data/vendor/eigen/Eigen/src/StlSupport/StdVector.h +131 -0
  317. data/vendor/eigen/Eigen/src/StlSupport/details.h +84 -0
  318. data/vendor/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +1027 -0
  319. data/vendor/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +506 -0
  320. data/vendor/eigen/Eigen/src/misc/Image.h +82 -0
  321. data/vendor/eigen/Eigen/src/misc/Kernel.h +79 -0
  322. data/vendor/eigen/Eigen/src/misc/RealSvd2x2.h +55 -0
  323. data/vendor/eigen/Eigen/src/misc/blas.h +440 -0
  324. data/vendor/eigen/Eigen/src/misc/lapack.h +152 -0
  325. data/vendor/eigen/Eigen/src/misc/lapacke.h +16291 -0
  326. data/vendor/eigen/Eigen/src/misc/lapacke_mangling.h +17 -0
  327. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +332 -0
  328. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +552 -0
  329. data/vendor/eigen/Eigen/src/plugins/BlockMethods.h +1058 -0
  330. data/vendor/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +115 -0
  331. data/vendor/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +163 -0
  332. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +152 -0
  333. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +85 -0
  334. data/vendor/eigen/README.md +3 -0
  335. data/vendor/eigen/bench/README.txt +55 -0
  336. data/vendor/eigen/bench/btl/COPYING +340 -0
  337. data/vendor/eigen/bench/btl/README +154 -0
  338. data/vendor/eigen/bench/tensors/README +21 -0
  339. data/vendor/eigen/blas/README.txt +6 -0
  340. data/vendor/eigen/demos/mandelbrot/README +10 -0
  341. data/vendor/eigen/demos/mix_eigen_and_c/README +9 -0
  342. data/vendor/eigen/demos/opengl/README +13 -0
  343. data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +1760 -0
  344. data/vendor/eigen/unsupported/README.txt +50 -0
  345. data/vendor/tomotopy/LICENSE +21 -0
  346. data/vendor/tomotopy/README.kr.rst +375 -0
  347. data/vendor/tomotopy/README.rst +382 -0
  348. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +362 -0
  349. data/vendor/tomotopy/src/Labeling/FoRelevance.h +88 -0
  350. data/vendor/tomotopy/src/Labeling/Labeler.h +50 -0
  351. data/vendor/tomotopy/src/TopicModel/CT.h +37 -0
  352. data/vendor/tomotopy/src/TopicModel/CTModel.cpp +13 -0
  353. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +293 -0
  354. data/vendor/tomotopy/src/TopicModel/DMR.h +51 -0
  355. data/vendor/tomotopy/src/TopicModel/DMRModel.cpp +13 -0
  356. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +374 -0
  357. data/vendor/tomotopy/src/TopicModel/DT.h +65 -0
  358. data/vendor/tomotopy/src/TopicModel/DTM.h +22 -0
  359. data/vendor/tomotopy/src/TopicModel/DTModel.cpp +15 -0
  360. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +572 -0
  361. data/vendor/tomotopy/src/TopicModel/GDMR.h +37 -0
  362. data/vendor/tomotopy/src/TopicModel/GDMRModel.cpp +14 -0
  363. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +485 -0
  364. data/vendor/tomotopy/src/TopicModel/HDP.h +74 -0
  365. data/vendor/tomotopy/src/TopicModel/HDPModel.cpp +13 -0
  366. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +592 -0
  367. data/vendor/tomotopy/src/TopicModel/HLDA.h +40 -0
  368. data/vendor/tomotopy/src/TopicModel/HLDAModel.cpp +13 -0
  369. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +681 -0
  370. data/vendor/tomotopy/src/TopicModel/HPA.h +27 -0
  371. data/vendor/tomotopy/src/TopicModel/HPAModel.cpp +21 -0
  372. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +588 -0
  373. data/vendor/tomotopy/src/TopicModel/LDA.h +144 -0
  374. data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +442 -0
  375. data/vendor/tomotopy/src/TopicModel/LDAModel.cpp +13 -0
  376. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +1058 -0
  377. data/vendor/tomotopy/src/TopicModel/LLDA.h +45 -0
  378. data/vendor/tomotopy/src/TopicModel/LLDAModel.cpp +13 -0
  379. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +203 -0
  380. data/vendor/tomotopy/src/TopicModel/MGLDA.h +63 -0
  381. data/vendor/tomotopy/src/TopicModel/MGLDAModel.cpp +17 -0
  382. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +558 -0
  383. data/vendor/tomotopy/src/TopicModel/PA.h +43 -0
  384. data/vendor/tomotopy/src/TopicModel/PAModel.cpp +13 -0
  385. data/vendor/tomotopy/src/TopicModel/PAModel.hpp +467 -0
  386. data/vendor/tomotopy/src/TopicModel/PLDA.h +17 -0
  387. data/vendor/tomotopy/src/TopicModel/PLDAModel.cpp +13 -0
  388. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +214 -0
  389. data/vendor/tomotopy/src/TopicModel/SLDA.h +54 -0
  390. data/vendor/tomotopy/src/TopicModel/SLDAModel.cpp +17 -0
  391. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +456 -0
  392. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +692 -0
  393. data/vendor/tomotopy/src/Utils/AliasMethod.hpp +169 -0
  394. data/vendor/tomotopy/src/Utils/Dictionary.h +80 -0
  395. data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +181 -0
  396. data/vendor/tomotopy/src/Utils/LBFGS.h +202 -0
  397. data/vendor/tomotopy/src/Utils/LBFGS/LineSearchBacktracking.h +120 -0
  398. data/vendor/tomotopy/src/Utils/LBFGS/LineSearchBracketing.h +122 -0
  399. data/vendor/tomotopy/src/Utils/LBFGS/Param.h +213 -0
  400. data/vendor/tomotopy/src/Utils/LUT.hpp +82 -0
  401. data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +69 -0
  402. data/vendor/tomotopy/src/Utils/PolyaGamma.hpp +200 -0
  403. data/vendor/tomotopy/src/Utils/PolyaGammaHybrid.hpp +672 -0
  404. data/vendor/tomotopy/src/Utils/ThreadPool.hpp +150 -0
  405. data/vendor/tomotopy/src/Utils/Trie.hpp +220 -0
  406. data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +94 -0
  407. data/vendor/tomotopy/src/Utils/Utils.hpp +337 -0
  408. data/vendor/tomotopy/src/Utils/avx_gamma.h +46 -0
  409. data/vendor/tomotopy/src/Utils/avx_mathfun.h +736 -0
  410. data/vendor/tomotopy/src/Utils/exception.h +28 -0
  411. data/vendor/tomotopy/src/Utils/math.h +281 -0
  412. data/vendor/tomotopy/src/Utils/rtnorm.hpp +2690 -0
  413. data/vendor/tomotopy/src/Utils/sample.hpp +192 -0
  414. data/vendor/tomotopy/src/Utils/serializer.hpp +695 -0
  415. data/vendor/tomotopy/src/Utils/slp.hpp +131 -0
  416. data/vendor/tomotopy/src/Utils/sse_gamma.h +48 -0
  417. data/vendor/tomotopy/src/Utils/sse_mathfun.h +710 -0
  418. data/vendor/tomotopy/src/Utils/text.hpp +49 -0
  419. data/vendor/tomotopy/src/Utils/tvector.hpp +543 -0
  420. metadata +531 -0
@@ -0,0 +1,51 @@
1
+ // This file is part of Eigen, a lightweight C++ template library
2
+ // for linear algebra.
3
+ //
4
+ // Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
5
+ //
6
+ // This Source Code Form is subject to the terms of the Mozilla
7
+ // Public License v. 2.0. If a copy of the MPL was not distributed
8
+ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
+
10
+ #ifndef EIGEN_TYPE_CASTING_AVX_H
11
+ #define EIGEN_TYPE_CASTING_AVX_H
12
+
13
+ namespace Eigen {
14
+
15
+ namespace internal {
16
+
17
+ // For now we use SSE to handle integers, so we can't use AVX instructions to cast
18
+ // from int to float
19
+ template <>
20
+ struct type_casting_traits<float, int> {  // cast traits specialised for the (float, int) type pair
21
+ enum {
22
+ VectorizedCast = 0,  // 0: no vectorized cast for this pair (integers are handled with SSE, see the note above)
23
+ SrcCoeffRatio = 1,  // NOTE(review): presumably the number of source packets per cast - confirm against the primary template
24
+ TgtCoeffRatio = 1  // NOTE(review): presumably the number of target packets per cast - confirm against the primary template
25
+ };
26
+ };
27
+
28
+ template <>
29
+ struct type_casting_traits<int, float> {  // cast traits specialised for the (int, float) type pair
30
+ enum {
31
+ VectorizedCast = 0,  // 0: no vectorized cast for this pair (this file notes that integers are handled with SSE)
32
+ SrcCoeffRatio = 1,  // NOTE(review): presumably the number of source packets per cast - confirm against the primary template
33
+ TgtCoeffRatio = 1  // NOTE(review): presumably the number of target packets per cast - confirm against the primary template
34
+ };
35
+ };
36
+
37
+
38
+ // Packet8f -> Packet8i with the truncating intrinsic, so each lane rounds toward zero like a scalar float-to-int cast.
39
+ template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
40
+ return _mm256_cvttps_epi32(a);  // was _mm256_cvtps_epi32, which rounds per MXCSR (nearest-even by default)
41
+ }
42
+ // Packet8i -> Packet8f: forwards the packet straight to the _mm256_cvtepi32_ps intrinsic and returns its result.
43
+ template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
44
+ return _mm256_cvtepi32_ps(a);
45
+ }
46
+
47
+ } // end namespace internal
48
+
49
+ } // end namespace Eigen
50
+
51
+ #endif // EIGEN_TYPE_CASTING_AVX_H
@@ -0,0 +1,391 @@
1
+ // This file is part of Eigen, a lightweight C++ template library
2
+ // for linear algebra.
3
+ //
4
+ // Copyright (C) 2016 Pedro Gonnet (pedro.gonnet@gmail.com)
5
+ //
6
+ // This Source Code Form is subject to the terms of the Mozilla
7
+ // Public License v. 2.0. If a copy of the MPL was not distributed
8
+ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
+
10
+ #ifndef THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
11
+ #define THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
12
+
13
+ namespace Eigen {
14
+
15
+ namespace internal {
16
+
17
+ // Disable the code for older versions of gcc that don't support many of the required AVX512 intrinsics.
18
+ #if EIGEN_GNUC_AT_LEAST(5, 3)
19
+
20
+ #define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \
21
+ const Packet16f p16f_##NAME = pset1<Packet16f>(X)
22
+
23
+ #define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \
24
+ const Packet16f p16f_##NAME = (__m512)pset1<Packet16i>(X)
25
+
26
+ #define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \
27
+ const Packet8d p8d_##NAME = pset1<Packet8d>(X)
28
+
29
+ #define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \
30
+ const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X))
31
+
32
+ // Natural logarithm
33
+ // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
34
+ // and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
35
+ // be easily approximated by a polynomial centered on m=1 for stability.
36
+ #if defined(EIGEN_VECTORIZE_AVX512DQ)
37
+ template <>
38
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
39
+ plog<Packet16f>(const Packet16f& _x) {
40
+ Packet16f x = _x;
41
+ _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f);
42
+ _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f);
43
+ _EIGEN_DECLARE_CONST_Packet16f(126f, 126.0f);
44
+
45
+ _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inv_mant_mask, ~0x7f800000);
46
+
47
+ // The smallest non denormalized float number.
48
+ _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
49
+ _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
50
+ _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
51
+
52
+ // Polynomial coefficients.
53
+ _EIGEN_DECLARE_CONST_Packet16f(cephes_SQRTHF, 0.707106781186547524f);
54
+ _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p0, 7.0376836292E-2f);
55
+ _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p1, -1.1514610310E-1f);
56
+ _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p2, 1.1676998740E-1f);
57
+ _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p3, -1.2420140846E-1f);
58
+ _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p4, +1.4249322787E-1f);
59
+ _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p5, -1.6668057665E-1f);
60
+ _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p6, +2.0000714765E-1f);
61
+ _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p7, -2.4999993993E-1f);
62
+ _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p8, +3.3333331174E-1f);
63
+ _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q1, -2.12194440e-4f);
64
+ _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f);
65
+
66
+ // invalid_mask is set to true when x is negative or NaN
67
+ __mmask16 invalid_mask =
68
+ _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
69
+ __mmask16 iszero_mask =
70
+ _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_UQ);
71
+
72
+ // Truncate input values to the minimum positive normal.
73
+ x = pmax(x, p16f_min_norm_pos);
74
+
75
+ // Extract the shifted exponents.
76
+ Packet16f emm0 = _mm512_cvtepi32_ps(_mm512_srli_epi32((__m512i)x, 23));
77
+ Packet16f e = _mm512_sub_ps(emm0, p16f_126f);
78
+
79
+ // Set the exponents to -1, i.e. x are in the range [0.5,1).
80
+ x = _mm512_and_ps(x, p16f_inv_mant_mask);
81
+ x = _mm512_or_ps(x, p16f_half);
82
+
83
+ // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
84
+ // and shift by -1. The values are then centered around 0, which improves
85
+ // the stability of the polynomial evaluation.
86
+ // if( x < SQRTHF ) {
87
+ // e -= 1;
88
+ // x = x + x - 1.0;
89
+ // } else { x = x - 1.0; }
90
+ __mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ);
91
+ Packet16f tmp = _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), x);
92
+ x = psub(x, p16f_1);
93
+ e = psub(e, _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), p16f_1));
94
+ x = padd(x, tmp);
95
+
96
+ Packet16f x2 = pmul(x, x);
97
+ Packet16f x3 = pmul(x2, x);
98
+
99
+ // Evaluate the polynomial approximant of degree 8 in three parts, probably
100
+ // to improve instruction-level parallelism.
101
+ Packet16f y, y1, y2;
102
+ y = pmadd(p16f_cephes_log_p0, x, p16f_cephes_log_p1);
103
+ y1 = pmadd(p16f_cephes_log_p3, x, p16f_cephes_log_p4);
104
+ y2 = pmadd(p16f_cephes_log_p6, x, p16f_cephes_log_p7);
105
+ y = pmadd(y, x, p16f_cephes_log_p2);
106
+ y1 = pmadd(y1, x, p16f_cephes_log_p5);
107
+ y2 = pmadd(y2, x, p16f_cephes_log_p8);
108
+ y = pmadd(y, x3, y1);
109
+ y = pmadd(y, x3, y2);
110
+ y = pmul(y, x3);
111
+
112
+ // Add the logarithm of the exponent back to the result of the interpolation.
113
+ y1 = pmul(e, p16f_cephes_log_q1);
114
+ tmp = pmul(x2, p16f_half);
115
+ y = padd(y, y1);
116
+ x = psub(x, tmp);
117
+ y2 = pmul(e, p16f_cephes_log_q2);
118
+ x = padd(x, y);
119
+ x = padd(x, y2);
120
+
121
+ // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
122
+ return _mm512_mask_blend_ps(iszero_mask,
123
+ _mm512_mask_blend_ps(invalid_mask, x, p16f_nan),
124
+ p16f_minus_inf);
125
+ }
126
+ #endif
127
+
128
+ // Exponential function. Works by writing "x = m*log(2) + r" where
129
+ // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
130
+ // "exp(x) = 2^m*exp(r)" where r is in the range [-log(2)/2, log(2)/2).
131
+ template <>
132
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
133
+ pexp<Packet16f>(const Packet16f& _x) {
134
+ _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f);
135
+ _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f);
136
+ _EIGEN_DECLARE_CONST_Packet16f(127, 127.0f);
137
+
138
+ _EIGEN_DECLARE_CONST_Packet16f(exp_hi, 88.3762626647950f);
139
+ _EIGEN_DECLARE_CONST_Packet16f(exp_lo, -88.3762626647949f);
140
+
141
+ _EIGEN_DECLARE_CONST_Packet16f(cephes_LOG2EF, 1.44269504088896341f);
142
+
143
+ _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p0, 1.9875691500E-4f);
144
+ _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p1, 1.3981999507E-3f);
145
+ _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p2, 8.3334519073E-3f);
146
+ _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p3, 4.1665795894E-2f);
147
+ _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p4, 1.6666665459E-1f);
148
+ _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p5, 5.0000001201E-1f);
149
+
150
+ // Clamp x.
151
+ Packet16f x = pmax(pmin(_x, p16f_exp_hi), p16f_exp_lo);
152
+
153
+ // Express exp(x) as exp(m*ln(2) + r), start by extracting
154
+ // m = floor(x/ln(2) + 0.5).
155
+ Packet16f m = _mm512_floor_ps(pmadd(x, p16f_cephes_LOG2EF, p16f_half));
156
+
157
+ // Get r = x - m*ln(2). Note that we can do this without losing more than one
158
+ // ulp precision due to the FMA instruction.
159
+ _EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453f);
160
+ Packet16f r = _mm512_fmadd_ps(m, p16f_nln2, x);
161
+ Packet16f r2 = pmul(r, r);
162
+
163
+ // TODO(gonnet): Split into odd/even polynomials and try to exploit
164
+ // instruction-level parallelism.
165
+ Packet16f y = p16f_cephes_exp_p0;
166
+ y = pmadd(y, r, p16f_cephes_exp_p1);
167
+ y = pmadd(y, r, p16f_cephes_exp_p2);
168
+ y = pmadd(y, r, p16f_cephes_exp_p3);
169
+ y = pmadd(y, r, p16f_cephes_exp_p4);
170
+ y = pmadd(y, r, p16f_cephes_exp_p5);
171
+ y = pmadd(y, r2, r);
172
+ y = padd(y, p16f_1);
173
+
174
+ // Build emm0 = 2^m.
175
+ Packet16i emm0 = _mm512_cvttps_epi32(padd(m, p16f_127));
176
+ emm0 = _mm512_slli_epi32(emm0, 23);
177
+
178
+ // Return 2^m * exp(r).
179
+ return pmax(pmul(y, _mm512_castsi512_ps(emm0)), _x);
180
+ }
181
+
182
+ /*template <>
183
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
184
+ pexp<Packet8d>(const Packet8d& _x) {
185
+ Packet8d x = _x;
186
+
187
+ _EIGEN_DECLARE_CONST_Packet8d(1, 1.0);
188
+ _EIGEN_DECLARE_CONST_Packet8d(2, 2.0);
189
+
190
+ _EIGEN_DECLARE_CONST_Packet8d(exp_hi, 709.437);
191
+ _EIGEN_DECLARE_CONST_Packet8d(exp_lo, -709.436139303);
192
+
193
+ _EIGEN_DECLARE_CONST_Packet8d(cephes_LOG2EF, 1.4426950408889634073599);
194
+
195
+ _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p0, 1.26177193074810590878e-4);
196
+ _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p1, 3.02994407707441961300e-2);
197
+ _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p2, 9.99999999999999999910e-1);
198
+
199
+ _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q0, 3.00198505138664455042e-6);
200
+ _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q1, 2.52448340349684104192e-3);
201
+ _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q2, 2.27265548208155028766e-1);
202
+ _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q3, 2.00000000000000000009e0);
203
+
204
+ _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C1, 0.693145751953125);
205
+ _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C2, 1.42860682030941723212e-6);
206
+
207
+ // clamp x
208
+ x = pmax(pmin(x, p8d_exp_hi), p8d_exp_lo);
209
+
210
+ // Express exp(x) as exp(g + n*log(2)).
211
+ const Packet8d n =
212
+ _mm512_mul_round_pd(p8d_cephes_LOG2EF, x, _MM_FROUND_TO_NEAREST_INT);
213
+
214
+ // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
215
+ // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
216
+ // digits right.
217
+ const Packet8d nC1 = pmul(n, p8d_cephes_exp_C1);
218
+ const Packet8d nC2 = pmul(n, p8d_cephes_exp_C2);
219
+ x = psub(x, nC1);
220
+ x = psub(x, nC2);
221
+
222
+ const Packet8d x2 = pmul(x, x);
223
+
224
+ // Evaluate the numerator polynomial of the rational interpolant.
225
+ Packet8d px = p8d_cephes_exp_p0;
226
+ px = pmadd(px, x2, p8d_cephes_exp_p1);
227
+ px = pmadd(px, x2, p8d_cephes_exp_p2);
228
+ px = pmul(px, x);
229
+
230
+ // Evaluate the denominator polynomial of the rational interpolant.
231
+ Packet8d qx = p8d_cephes_exp_q0;
232
+ qx = pmadd(qx, x2, p8d_cephes_exp_q1);
233
+ qx = pmadd(qx, x2, p8d_cephes_exp_q2);
234
+ qx = pmadd(qx, x2, p8d_cephes_exp_q3);
235
+
236
+ // I don't really get this bit, copied from the SSE2 routines, so...
237
+ // TODO(gonnet): Figure out what is going on here, perhaps find a better
238
+ // rational interpolant?
239
+ x = _mm512_div_pd(px, psub(qx, px));
240
+ x = pmadd(p8d_2, x, p8d_1);
241
+
242
+ // Build e=2^n.
243
+ const Packet8d e = _mm512_castsi512_pd(_mm512_slli_epi64(
244
+ _mm512_add_epi64(_mm512_cvtpd_epi64(n), _mm512_set1_epi64(1023)), 52));
245
+
246
+ // Construct the result 2^n * exp(g) = e * x. The max is used to catch
247
+ // non-finite values in the input.
248
+ return pmax(pmul(x, e), _x);
249
+ }*/
250
+
251
+ // Functions for sqrt.
252
+ // The EIGEN_FAST_MATH version uses the _mm512_rsqrt14 approximation and one step
253
+ // of Newton's method, at a cost of 1-2 bits of precision as opposed to the
254
+ // exact solution. The main advantage of this approach is not just speed, but
255
+ // also the fact that it can be inlined and pipelined with other computations,
256
+ // further reducing its effective latency.
257
+ #if EIGEN_FAST_MATH
258
+ template <>
259
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
260
+ psqrt<Packet16f>(const Packet16f& _x) {
261
+ _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
262
+ _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
263
+ _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
264
+
265
+ Packet16f neg_half = pmul(_x, p16f_minus_half);
266
+
267
+ // select only the inverse sqrt of positive normal inputs (denormals are
268
+ // flushed to zero and cause infs as well).
269
+ __mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
270
+ Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_setzero_ps(), _mm512_rsqrt14_ps(_x));
271
+
272
+ // Do a single step of Newton's iteration.
273
+ x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
274
+
275
+ // Multiply the original _x by its reciprocal square root to extract the
276
+ // square root.
277
+ return pmul(_x, x);
278
+ }
279
+
280
+ template <>
281
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
282
+ psqrt<Packet8d>(const Packet8d& _x) {
283
+ _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
284
+ _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
285
+ _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
286
+
287
+ Packet8d neg_half = pmul(_x, p8d_minus_half);
288
+
289
+ // select only the inverse sqrt of positive normal inputs (denormals are
290
+ // flushed to zero and cause infs as well).
291
+ __mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
292
+ Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_setzero_pd(), _mm512_rsqrt14_pd(_x));
293
+
294
+ // Do a first step of Newton's iteration.
295
+ x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
296
+
297
+ // Do a second step of Newton's iteration.
298
+ x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
299
+
300
+ // Multiply the original _x by its reciprocal square root to extract the
301
+ // square root.
302
+ return pmul(_x, x);
303
+ }
304
+ #else
305
+ template <>
306
+ EIGEN_STRONG_INLINE Packet16f psqrt<Packet16f>(const Packet16f& x) {
307
+ return _mm512_sqrt_ps(x);
308
+ }
309
+ template <>
310
+ EIGEN_STRONG_INLINE Packet8d psqrt<Packet8d>(const Packet8d& x) {
311
+ return _mm512_sqrt_pd(x);
312
+ }
313
+ #endif
314
+
315
+ // Functions for rsqrt.
316
+ // Almost identical to the sqrt routine, just leave out the last multiplication
317
+ // and fill in NaN/Inf where needed. Note that this function only exists as an
318
+ // iterative version for doubles since there is no instruction for directly
319
+ // computing the reciprocal square root in AVX-512.
320
+ #ifdef EIGEN_FAST_MATH
321
+ template <>
322
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
323
+ prsqrt<Packet16f>(const Packet16f& _x) {
324
+ _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000);
325
+ _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
326
+ _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
327
+ _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
328
+ _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
329
+
330
+ Packet16f neg_half = pmul(_x, p16f_minus_half);
331
+
332
+ // select only the inverse sqrt of positive normal inputs (denormals are
333
+ // flushed to zero and cause infs as well).
334
+ __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
335
+ Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps());
336
+
337
+ // Fill in NaNs and Infs for the negative/zero entries.
338
+ __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
339
+ Packet16f infs_and_nans = _mm512_mask_blend_ps(
340
+ neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan);
341
+
342
+ // Do a single step of Newton's iteration.
343
+ x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
344
+
345
+ // Insert NaNs and Infs in all the right places.
346
+ return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans);
347
+ }
348
+
349
+ template <>
350
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
351
+ prsqrt<Packet8d>(const Packet8d& _x) {
352
+ _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
353
+ _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(nan, 0x7ff1000000000000LL);
354
+ _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
355
+ _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
356
+ _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
357
+
358
+ Packet8d neg_half = pmul(_x, p8d_minus_half);
359
+
360
+ // select only the inverse sqrt of positive normal inputs (denormals are
361
+ // flushed to zero and cause infs as well).
362
+ __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
363
+ Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd());
364
+
365
+ // Fill in NaNs and Infs for the negative/zero entries.
366
+ __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
367
+ Packet8d infs_and_nans = _mm512_mask_blend_pd(
368
+ neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan);
369
+
370
+ // Do a first step of Newton's iteration.
371
+ x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
372
+
373
+ // Do a second step of Newton's iteration.
374
+ x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
375
+
376
+ // Insert NaNs and Infs in all the right places.
377
+ return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans);
378
+ }
379
+ #elif defined(EIGEN_VECTORIZE_AVX512ER)
380
+ template <>
381
+ EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
382
+ return _mm512_rsqrt28_ps(x);
383
+ }
384
+ #endif
385
+ #endif
386
+
387
+ } // end namespace internal
388
+
389
+ } // end namespace Eigen
390
+
391
+ #endif // THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
@@ -0,0 +1,1316 @@
1
+ // This file is part of Eigen, a lightweight C++ template library
2
+ // for linear algebra.
3
+ //
4
+ // Copyright (C) 2016 Benoit Steiner (benoit.steiner.goog@gmail.com)
5
+ //
6
+ // This Source Code Form is subject to the terms of the Mozilla
7
+ // Public License v. 2.0. If a copy of the MPL was not distributed
8
+ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
+
10
+ #ifndef EIGEN_PACKET_MATH_AVX512_H
11
+ #define EIGEN_PACKET_MATH_AVX512_H
12
+
13
+ namespace Eigen {
14
+
15
+ namespace internal {
16
+
17
+ #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
18
+ #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
19
+ #endif
20
+
21
+ #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
22
+ #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
23
+ #endif
24
+
25
+ #ifdef __FMA__
26
+ #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
27
+ #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
28
+ #endif
29
+ #endif
30
+
31
+ typedef __m512 Packet16f;
32
+ typedef __m512i Packet16i;
33
+ typedef __m512d Packet8d;
34
+
35
+ template <>
36
+ struct is_arithmetic<__m512> {
37
+ enum { value = true };
38
+ };
39
+ template <>
40
+ struct is_arithmetic<__m512i> {
41
+ enum { value = true };
42
+ };
43
+ template <>
44
+ struct is_arithmetic<__m512d> {
45
+ enum { value = true };
46
+ };
47
+
48
+ template<> struct packet_traits<float> : default_packet_traits
49
+ {
50
+ typedef Packet16f type;
51
+ typedef Packet8f half;
52
+ enum {
53
+ Vectorizable = 1,
54
+ AlignedOnScalar = 1,
55
+ size = 16,
56
+ HasHalfPacket = 1,
57
+ #if EIGEN_GNUC_AT_LEAST(5, 3)
58
+ #ifdef EIGEN_VECTORIZE_AVX512DQ
59
+ HasLog = 1,
60
+ #endif
61
+ HasExp = 1,
62
+ HasSqrt = 1,
63
+ HasRsqrt = 1,
64
+ #endif
65
+ HasDiv = 1
66
+ };
67
+ };
68
+ template<> struct packet_traits<double> : default_packet_traits
69
+ {
70
+ typedef Packet8d type;
71
+ typedef Packet4d half;
72
+ enum {
73
+ Vectorizable = 1,
74
+ AlignedOnScalar = 1,
75
+ size = 8,
76
+ HasHalfPacket = 1,
77
+ #if EIGEN_GNUC_AT_LEAST(5, 3)
78
+ HasSqrt = 1,
79
+ HasRsqrt = EIGEN_FAST_MATH,
80
+ #endif
81
+ HasDiv = 1
82
+ };
83
+ };
84
+
85
+ /* TODO Implement AVX512 for integers
86
+ template<> struct packet_traits<int> : default_packet_traits
87
+ {
88
+ typedef Packet16i type;
89
+ enum {
90
+ Vectorizable = 1,
91
+ AlignedOnScalar = 1,
92
+ size=8
93
+ };
94
+ };
95
+ */
96
+
97
+ template <>
98
+ struct unpacket_traits<Packet16f> {
99
+ typedef float type;
100
+ typedef Packet8f half;
101
+ enum { size = 16, alignment=Aligned64 };
102
+ };
103
+ template <>
104
+ struct unpacket_traits<Packet8d> {
105
+ typedef double type;
106
+ typedef Packet4d half;
107
+ enum { size = 8, alignment=Aligned64 };
108
+ };
109
+ template <>
110
+ struct unpacket_traits<Packet16i> {
111
+ typedef int type;
112
+ typedef Packet8i half;
113
+ enum { size = 16, alignment=Aligned64 };
114
+ };
115
+
116
+ template <>
117
+ EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) {
118
+ return _mm512_set1_ps(from);
119
+ }
120
+ template <>
121
+ EIGEN_STRONG_INLINE Packet8d pset1<Packet8d>(const double& from) {
122
+ return _mm512_set1_pd(from);
123
+ }
124
+ template <>
125
+ EIGEN_STRONG_INLINE Packet16i pset1<Packet16i>(const int& from) {
126
+ return _mm512_set1_epi32(from);
127
+ }
128
+
129
+ template <>
130
+ EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
131
+ return _mm512_broadcastss_ps(_mm_load_ps1(from));
132
+ }
133
+ template <>
134
+ EIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(const double* from) {
135
+ return _mm512_broadcastsd_pd(_mm_load_pd1(from));
136
+ }
137
+
138
+ template <>
139
+ EIGEN_STRONG_INLINE Packet16f plset<Packet16f>(const float& a) {
140
+ return _mm512_add_ps(
141
+ _mm512_set1_ps(a),
142
+ _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f,
143
+ 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
144
+ }
145
+ template <>
146
+ EIGEN_STRONG_INLINE Packet8d plset<Packet8d>(const double& a) {
147
+ return _mm512_add_pd(_mm512_set1_pd(a),
148
+ _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
149
+ }
150
+
151
+ template <>
152
+ EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a,
153
+ const Packet16f& b) {
154
+ return _mm512_add_ps(a, b);
155
+ }
156
+ template <>
157
+ EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a,
158
+ const Packet8d& b) {
159
+ return _mm512_add_pd(a, b);
160
+ }
161
+
162
+ template <>
163
+ EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a,
164
+ const Packet16f& b) {
165
+ return _mm512_sub_ps(a, b);
166
+ }
167
+ template <>
168
+ EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a,
169
+ const Packet8d& b) {
170
+ return _mm512_sub_pd(a, b);
171
+ }
172
+
173
+ template <>
174
+ EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
175
+ return _mm512_sub_ps(_mm512_set1_ps(0.0), a);
176
+ }
177
+ template <>
178
+ EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) {
179
+ return _mm512_sub_pd(_mm512_set1_pd(0.0), a);
180
+ }
181
+
182
+ template <>
183
+ EIGEN_STRONG_INLINE Packet16f pconj(const Packet16f& a) {
184
+ return a;
185
+ }
186
+ template <>
187
+ EIGEN_STRONG_INLINE Packet8d pconj(const Packet8d& a) {
188
+ return a;
189
+ }
190
+ template <>
191
+ EIGEN_STRONG_INLINE Packet16i pconj(const Packet16i& a) {
192
+ return a;
193
+ }
194
+
195
+ template <>
196
+ EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a,
197
+ const Packet16f& b) {
198
+ return _mm512_mul_ps(a, b);
199
+ }
200
+ template <>
201
+ EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a,
202
+ const Packet8d& b) {
203
+ return _mm512_mul_pd(a, b);
204
+ }
205
+
206
+ template <>
207
+ EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a,
208
+ const Packet16f& b) {
209
+ return _mm512_div_ps(a, b);
210
+ }
211
+ template <>
212
+ EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a,
213
+ const Packet8d& b) {
214
+ return _mm512_div_pd(a, b);
215
+ }
216
+
217
+ #ifdef __FMA__
218
+ template <>
219
+ EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b,
220
+ const Packet16f& c) {
221
+ return _mm512_fmadd_ps(a, b, c);
222
+ }
223
+ template <>
224
+ EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b,
225
+ const Packet8d& c) {
226
+ return _mm512_fmadd_pd(a, b, c);
227
+ }
228
+ #endif
229
+
230
+ template <>
231
+ EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a,
232
+ const Packet16f& b) {
233
+ return _mm512_min_ps(a, b);
234
+ }
235
+ template <>
236
+ EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a,
237
+ const Packet8d& b) {
238
+ return _mm512_min_pd(a, b);
239
+ }
240
+
241
+ template <>
242
+ EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a,
243
+ const Packet16f& b) {
244
+ return _mm512_max_ps(a, b);
245
+ }
246
+ template <>
247
+ EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
248
+ const Packet8d& b) {
249
+ return _mm512_max_pd(a, b);
250
+ }
251
+
252
+ template <>
253
+ EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a,
254
+ const Packet16f& b) {
255
+ #ifdef EIGEN_VECTORIZE_AVX512DQ
256
+ return _mm512_and_ps(a, b);
257
+ #else
258
+ Packet16f res = _mm512_undefined_ps();
259
+ Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
260
+ Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
261
+ res = _mm512_insertf32x4(res, _mm_and_ps(lane0_a, lane0_b), 0);
262
+
263
+ Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
264
+ Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
265
+ res = _mm512_insertf32x4(res, _mm_and_ps(lane1_a, lane1_b), 1);
266
+
267
+ Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
268
+ Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
269
+ res = _mm512_insertf32x4(res, _mm_and_ps(lane2_a, lane2_b), 2);
270
+
271
+ Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
272
+ Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
273
+ res = _mm512_insertf32x4(res, _mm_and_ps(lane3_a, lane3_b), 3);
274
+
275
+ return res;
276
+ #endif
277
+ }
278
+ template <>
279
+ EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a,
280
+ const Packet8d& b) {
281
+ #ifdef EIGEN_VECTORIZE_AVX512DQ
282
+ return _mm512_and_pd(a, b);
283
+ #else
284
+ Packet8d res = _mm512_undefined_pd();
285
+ Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
286
+ Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
287
+ res = _mm512_insertf64x4(res, _mm256_and_pd(lane0_a, lane0_b), 0);
288
+
289
+ Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
290
+ Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
291
+ res = _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1);
292
+
293
+ return res;
294
+ #endif
295
+ }
296
+ template <>
297
+ EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a,
298
+ const Packet16f& b) {
299
+ #ifdef EIGEN_VECTORIZE_AVX512DQ
300
+ return _mm512_or_ps(a, b);
301
+ #else
302
+ Packet16f res = _mm512_undefined_ps();
303
+ Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
304
+ Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
305
+ res = _mm512_insertf32x4(res, _mm_or_ps(lane0_a, lane0_b), 0);
306
+
307
+ Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
308
+ Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
309
+ res = _mm512_insertf32x4(res, _mm_or_ps(lane1_a, lane1_b), 1);
310
+
311
+ Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
312
+ Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
313
+ res = _mm512_insertf32x4(res, _mm_or_ps(lane2_a, lane2_b), 2);
314
+
315
+ Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
316
+ Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
317
+ res = _mm512_insertf32x4(res, _mm_or_ps(lane3_a, lane3_b), 3);
318
+
319
+ return res;
320
+ #endif
321
+ }
322
+
323
+ template <>
324
+ EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a,
325
+ const Packet8d& b) {
326
+ #ifdef EIGEN_VECTORIZE_AVX512DQ
327
+ return _mm512_or_pd(a, b);
328
+ #else
329
+ Packet8d res = _mm512_undefined_pd();
330
+ Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
331
+ Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
332
+ res = _mm512_insertf64x4(res, _mm256_or_pd(lane0_a, lane0_b), 0);
333
+
334
+ Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
335
+ Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
336
+ res = _mm512_insertf64x4(res, _mm256_or_pd(lane1_a, lane1_b), 1);
337
+
338
+ return res;
339
+ #endif
340
+ }
341
+
342
+ template <>
343
+ EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a,
344
+ const Packet16f& b) {
345
+ #ifdef EIGEN_VECTORIZE_AVX512DQ
346
+ return _mm512_xor_ps(a, b);
347
+ #else
348
+ Packet16f res = _mm512_undefined_ps();
349
+ Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
350
+ Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
351
+ res = _mm512_insertf32x4(res, _mm_xor_ps(lane0_a, lane0_b), 0);
352
+
353
+ Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
354
+ Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
355
+ res = _mm512_insertf32x4(res, _mm_xor_ps(lane1_a, lane1_b), 1);
356
+
357
+ Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
358
+ Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
359
+ res = _mm512_insertf32x4(res, _mm_xor_ps(lane2_a, lane2_b), 2);
360
+
361
+ Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
362
+ Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
363
+ res = _mm512_insertf32x4(res, _mm_xor_ps(lane3_a, lane3_b), 3);
364
+
365
+ return res;
366
+ #endif
367
+ }
368
+ template <>
369
+ EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a,
370
+ const Packet8d& b) {
371
+ #ifdef EIGEN_VECTORIZE_AVX512DQ
372
+ return _mm512_xor_pd(a, b);
373
+ #else
374
+ Packet8d res = _mm512_undefined_pd();
375
+ Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
376
+ Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
377
+ res = _mm512_insertf64x4(res, _mm256_xor_pd(lane0_a, lane0_b), 0);
378
+
379
+ Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
380
+ Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
381
+ res = _mm512_insertf64x4(res, _mm256_xor_pd(lane1_a, lane1_b), 1);
382
+
383
+ return res;
384
+ #endif
385
+ }
386
+
387
+ template <>
388
+ EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a,
389
+ const Packet16f& b) {
390
+ #ifdef EIGEN_VECTORIZE_AVX512DQ
391
+ return _mm512_andnot_ps(a, b);
392
+ #else
393
+ Packet16f res = _mm512_undefined_ps();
394
+ Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
395
+ Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
396
+ res = _mm512_insertf32x4(res, _mm_andnot_ps(lane0_a, lane0_b), 0);
397
+
398
+ Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
399
+ Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
400
+ res = _mm512_insertf32x4(res, _mm_andnot_ps(lane1_a, lane1_b), 1);
401
+
402
+ Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
403
+ Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
404
+ res = _mm512_insertf32x4(res, _mm_andnot_ps(lane2_a, lane2_b), 2);
405
+
406
+ Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
407
+ Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
408
+ res = _mm512_insertf32x4(res, _mm_andnot_ps(lane3_a, lane3_b), 3);
409
+
410
+ return res;
411
+ #endif
412
+ }
413
+ template <>
414
+ EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,
415
+ const Packet8d& b) {
416
+ #ifdef EIGEN_VECTORIZE_AVX512DQ
417
+ return _mm512_andnot_pd(a, b);
418
+ #else
419
+ Packet8d res = _mm512_undefined_pd();
420
+ Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
421
+ Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
422
+ res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane0_a, lane0_b), 0);
423
+
424
+ Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
425
+ Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
426
+ res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane1_a, lane1_b), 1);
427
+
428
+ return res;
429
+ #endif
430
+ }
431
+
432
+ template <>
433
+ EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
434
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from);
435
+ }
436
+ template <>
437
+ EIGEN_STRONG_INLINE Packet8d pload<Packet8d>(const double* from) {
438
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_pd(from);
439
+ }
440
+ template <>
441
+ EIGEN_STRONG_INLINE Packet16i pload<Packet16i>(const int* from) {
442
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
443
+ reinterpret_cast<const __m512i*>(from));
444
+ }
445
+
446
+ template <>
447
+ EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from) {
448
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ps(from);
449
+ }
450
+ template <>
451
+ EIGEN_STRONG_INLINE Packet8d ploadu<Packet8d>(const double* from) {
452
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_pd(from);
453
+ }
454
+ template <>
455
+ EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) {
456
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
457
+ reinterpret_cast<const __m512i*>(from));
458
+ }
459
+
460
+ // Loads 8 floats from memory a returns the packet
461
+ // {a0, a0 a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
462
+ template <>
463
+ EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
464
+ Packet8f lane0 = _mm256_broadcast_ps((const __m128*)(const void*)from);
465
+ // mimic an "inplace" permutation of the lower 128bits using a blend
466
+ lane0 = _mm256_blend_ps(
467
+ lane0, _mm256_castps128_ps256(_mm_permute_ps(
468
+ _mm256_castps256_ps128(lane0), _MM_SHUFFLE(1, 0, 1, 0))),
469
+ 15);
470
+ // then we can perform a consistent permutation on the global register to get
471
+ // everything in shape:
472
+ lane0 = _mm256_permute_ps(lane0, _MM_SHUFFLE(3, 3, 2, 2));
473
+
474
+ Packet8f lane1 = _mm256_broadcast_ps((const __m128*)(const void*)(from + 4));
475
+ // mimic an "inplace" permutation of the lower 128bits using a blend
476
+ lane1 = _mm256_blend_ps(
477
+ lane1, _mm256_castps128_ps256(_mm_permute_ps(
478
+ _mm256_castps256_ps128(lane1), _MM_SHUFFLE(1, 0, 1, 0))),
479
+ 15);
480
+ // then we can perform a consistent permutation on the global register to get
481
+ // everything in shape:
482
+ lane1 = _mm256_permute_ps(lane1, _MM_SHUFFLE(3, 3, 2, 2));
483
+
484
+ #ifdef EIGEN_VECTORIZE_AVX512DQ
485
+ Packet16f res = _mm512_undefined_ps();
486
+ return _mm512_insertf32x8(res, lane0, 0);
487
+ return _mm512_insertf32x8(res, lane1, 1);
488
+ return res;
489
+ #else
490
+ Packet16f res = _mm512_undefined_ps();
491
+ res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 0), 0);
492
+ res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 1), 1);
493
+ res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 0), 2);
494
+ res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 1), 3);
495
+ return res;
496
+ #endif
497
+ }
498
+ // Loads 4 doubles from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3,
499
+ // a3}
500
+ template <>
501
+ EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
502
+ Packet4d lane0 = _mm256_broadcast_pd((const __m128d*)(const void*)from);
503
+ lane0 = _mm256_permute_pd(lane0, 3 << 2);
504
+
505
+ Packet4d lane1 = _mm256_broadcast_pd((const __m128d*)(const void*)(from + 2));
506
+ lane1 = _mm256_permute_pd(lane1, 3 << 2);
507
+
508
+ Packet8d res = _mm512_undefined_pd();
509
+ res = _mm512_insertf64x4(res, lane0, 0);
510
+ return _mm512_insertf64x4(res, lane1, 1);
511
+ }
512
+
513
+ // Loads 4 floats from memory a returns the packet
514
+ // {a0, a0 a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
515
+ template <>
516
+ EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
517
+ Packet16f tmp = _mm512_undefined_ps();
518
+ tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from), 0);
519
+ tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 1), 1);
520
+ tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 2), 2);
521
+ tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 3), 3);
522
+ return tmp;
523
+ }
524
+ // Loads 2 doubles from memory a returns the packet
525
+ // {a0, a0 a0, a0, a1, a1, a1, a1}
526
+ template <>
527
+ EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
528
+ Packet8d tmp = _mm512_undefined_pd();
529
+ Packet2d tmp0 = _mm_load_pd1(from);
530
+ Packet2d tmp1 = _mm_load_pd1(from + 1);
531
+ Packet4d lane0 = _mm256_broadcastsd_pd(tmp0);
532
+ Packet4d lane1 = _mm256_broadcastsd_pd(tmp1);
533
+ tmp = _mm512_insertf64x4(tmp, lane0, 0);
534
+ return _mm512_insertf64x4(tmp, lane1, 1);
535
+ }
536
+
537
+ template <>
538
+ EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
539
+ EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ps(to, from);
540
+ }
541
+ template <>
542
+ EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet8d& from) {
543
+ EIGEN_DEBUG_ALIGNED_STORE _mm512_store_pd(to, from);
544
+ }
545
+ template <>
546
+ EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet16i& from) {
547
+ EIGEN_DEBUG_ALIGNED_STORE _mm512_storeu_si512(reinterpret_cast<__m512i*>(to),
548
+ from);
549
+ }
550
+
551
+ template <>
552
+ EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) {
553
+ EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ps(to, from);
554
+ }
555
+ template <>
556
+ EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet8d& from) {
557
+ EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_pd(to, from);
558
+ }
559
+ template <>
560
+ EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
561
+ EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
562
+ reinterpret_cast<__m512i*>(to), from);
563
+ }
564
+
565
+ template <>
566
+ EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
567
+ Index stride) {
568
+ Packet16i stride_vector = _mm512_set1_epi32(stride);
569
+ Packet16i stride_multiplier =
570
+ _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
571
+ Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
572
+
573
+ return _mm512_i32gather_ps(indices, from, 4);
574
+ }
575
+ template <>
576
+ EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
577
+ Index stride) {
578
+ Packet8i stride_vector = _mm256_set1_epi32(stride);
579
+ Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
580
+ Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
581
+
582
+ return _mm512_i32gather_pd(indices, from, 8);
583
+ }
584
+
585
+ template <>
586
+ EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
587
+ const Packet16f& from,
588
+ Index stride) {
589
+ Packet16i stride_vector = _mm512_set1_epi32(stride);
590
+ Packet16i stride_multiplier =
591
+ _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
592
+ Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
593
+ _mm512_i32scatter_ps(to, indices, from, 4);
594
+ }
595
+ template <>
596
+ EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
597
+ const Packet8d& from,
598
+ Index stride) {
599
+ Packet8i stride_vector = _mm256_set1_epi32(stride);
600
+ Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
601
+ Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
602
+ _mm512_i32scatter_pd(to, indices, from, 8);
603
+ }
604
+
605
+ template <>
606
+ EIGEN_STRONG_INLINE void pstore1<Packet16f>(float* to, const float& a) {
607
+ Packet16f pa = pset1<Packet16f>(a);
608
+ pstore(to, pa);
609
+ }
610
+ template <>
611
+ EIGEN_STRONG_INLINE void pstore1<Packet8d>(double* to, const double& a) {
612
+ Packet8d pa = pset1<Packet8d>(a);
613
+ pstore(to, pa);
614
+ }
615
+ template <>
616
+ EIGEN_STRONG_INLINE void pstore1<Packet16i>(int* to, const int& a) {
617
+ Packet16i pa = pset1<Packet16i>(a);
618
+ pstore(to, pa);
619
+ }
620
+
621
+ template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
622
+ template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
623
+ template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
624
+
625
+ template <>
626
+ EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
627
+ return _mm_cvtss_f32(_mm512_extractf32x4_ps(a, 0));
628
+ }
629
+ template <>
630
+ EIGEN_STRONG_INLINE double pfirst<Packet8d>(const Packet8d& a) {
631
+ return _mm_cvtsd_f64(_mm256_extractf128_pd(_mm512_extractf64x4_pd(a, 0), 0));
632
+ }
633
+ template <>
634
+ EIGEN_STRONG_INLINE int pfirst<Packet16i>(const Packet16i& a) {
635
+ return _mm_extract_epi32(_mm512_extracti32x4_epi32(a, 0), 0);
636
+ }
637
+
638
+ template<> EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a)
639
+ {
640
+ return _mm512_permutexvar_ps(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
641
+ }
642
+
643
+ template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a)
644
+ {
645
+ return _mm512_permutexvar_pd(_mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), a);
646
+ }
647
+
648
+ template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a)
649
+ {
650
+ // _mm512_abs_ps intrinsic not found, so hack around it
651
+ return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff)));
652
+ }
653
+ template <>
654
+ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
655
+ // _mm512_abs_ps intrinsic not found, so hack around it
656
+ return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a),
657
+ _mm512_set1_epi64(0x7fffffffffffffff)));
658
+ }
659
+
660
// EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)
// Declares two __m256 locals, OUTPUT_0 and OUTPUT_1, holding respectively the
// lower and upper 256 bits of the __m512 value INPUT.
#ifdef EIGEN_VECTORIZE_AVX512DQ
// With AVX512DQ the 256-bit float extract is available directly. The two
// declarations must be separate statements (note the ';' between them); the
// caller supplies the final ';'.
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)         \
  __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0);  \
  __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1)
#else
// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512,
// so each half is rebuilt from two 128-bit extracts.
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)                \
  __m256 OUTPUT##_0 = _mm256_insertf128_ps(                     \
      _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)), \
      _mm512_extractf32x4_ps(INPUT, 1), 1);                     \
  __m256 OUTPUT##_1 = _mm256_insertf128_ps(                     \
      _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \
      _mm512_extractf32x4_ps(INPUT, 3), 1);
#endif
674
+
675
// EIGEN_INSERT_8f_INTO_16f(DST, LO8, HI8)
// Overwrites the __m512 lvalue DST so that its lower 256 bits are LO8 and its
// upper 256 bits are HI8 (both __m256). The expansion is a sequence of
// assignment statements and already ends with ';'.
#ifdef EIGEN_VECTORIZE_AVX512DQ
#define EIGEN_INSERT_8f_INTO_16f(DST, LO8, HI8) \
  DST = _mm512_insertf32x8(DST, LO8, 0);        \
  DST = _mm512_insertf32x8(DST, HI8, 1);
#else
// Without AVX512DQ only 128-bit float inserts exist, so insert four quarters.
#define EIGEN_INSERT_8f_INTO_16f(DST, LO8, HI8)                 \
  DST = _mm512_insertf32x4(DST, _mm256_extractf128_ps(LO8, 0), 0); \
  DST = _mm512_insertf32x4(DST, _mm256_extractf128_ps(LO8, 1), 1); \
  DST = _mm512_insertf32x4(DST, _mm256_extractf128_ps(HI8, 0), 2); \
  DST = _mm512_insertf32x4(DST, _mm256_extractf128_ps(HI8, 1), 3);
#endif
686
+ template<> EIGEN_STRONG_INLINE Packet16f preduxp<Packet16f>(const Packet16f*
687
+ vecs)
688
+ {
689
+ EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0);
690
+ EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1);
691
+ EIGEN_EXTRACT_8f_FROM_16f(vecs[2], vecs2);
692
+ EIGEN_EXTRACT_8f_FROM_16f(vecs[3], vecs3);
693
+ EIGEN_EXTRACT_8f_FROM_16f(vecs[4], vecs4);
694
+ EIGEN_EXTRACT_8f_FROM_16f(vecs[5], vecs5);
695
+ EIGEN_EXTRACT_8f_FROM_16f(vecs[6], vecs6);
696
+ EIGEN_EXTRACT_8f_FROM_16f(vecs[7], vecs7);
697
+ EIGEN_EXTRACT_8f_FROM_16f(vecs[8], vecs8);
698
+ EIGEN_EXTRACT_8f_FROM_16f(vecs[9], vecs9);
699
+ EIGEN_EXTRACT_8f_FROM_16f(vecs[10], vecs10);
700
+ EIGEN_EXTRACT_8f_FROM_16f(vecs[11], vecs11);
701
+ EIGEN_EXTRACT_8f_FROM_16f(vecs[12], vecs12);
702
+ EIGEN_EXTRACT_8f_FROM_16f(vecs[13], vecs13);
703
+ EIGEN_EXTRACT_8f_FROM_16f(vecs[14], vecs14);
704
+ EIGEN_EXTRACT_8f_FROM_16f(vecs[15], vecs15);
705
+
706
+ __m256 hsum1 = _mm256_hadd_ps(vecs0_0, vecs1_0);
707
+ __m256 hsum2 = _mm256_hadd_ps(vecs2_0, vecs3_0);
708
+ __m256 hsum3 = _mm256_hadd_ps(vecs4_0, vecs5_0);
709
+ __m256 hsum4 = _mm256_hadd_ps(vecs6_0, vecs7_0);
710
+
711
+ __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
712
+ __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
713
+ __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
714
+ __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);
715
+
716
+ __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
717
+ __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
718
+ __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
719
+ __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
720
+
721
+ __m256 sum1 = _mm256_add_ps(perm1, hsum5);
722
+ __m256 sum2 = _mm256_add_ps(perm2, hsum6);
723
+ __m256 sum3 = _mm256_add_ps(perm3, hsum7);
724
+ __m256 sum4 = _mm256_add_ps(perm4, hsum8);
725
+
726
+ __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
727
+ __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
728
+
729
+ __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
730
+
731
+ hsum1 = _mm256_hadd_ps(vecs0_1, vecs1_1);
732
+ hsum2 = _mm256_hadd_ps(vecs2_1, vecs3_1);
733
+ hsum3 = _mm256_hadd_ps(vecs4_1, vecs5_1);
734
+ hsum4 = _mm256_hadd_ps(vecs6_1, vecs7_1);
735
+
736
+ hsum5 = _mm256_hadd_ps(hsum1, hsum1);
737
+ hsum6 = _mm256_hadd_ps(hsum2, hsum2);
738
+ hsum7 = _mm256_hadd_ps(hsum3, hsum3);
739
+ hsum8 = _mm256_hadd_ps(hsum4, hsum4);
740
+
741
+ perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
742
+ perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
743
+ perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
744
+ perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
745
+
746
+ sum1 = _mm256_add_ps(perm1, hsum5);
747
+ sum2 = _mm256_add_ps(perm2, hsum6);
748
+ sum3 = _mm256_add_ps(perm3, hsum7);
749
+ sum4 = _mm256_add_ps(perm4, hsum8);
750
+
751
+ blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
752
+ blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
753
+
754
+ final = padd(final, _mm256_blend_ps(blend1, blend2, 0xf0));
755
+
756
+ hsum1 = _mm256_hadd_ps(vecs8_0, vecs9_0);
757
+ hsum2 = _mm256_hadd_ps(vecs10_0, vecs11_0);
758
+ hsum3 = _mm256_hadd_ps(vecs12_0, vecs13_0);
759
+ hsum4 = _mm256_hadd_ps(vecs14_0, vecs15_0);
760
+
761
+ hsum5 = _mm256_hadd_ps(hsum1, hsum1);
762
+ hsum6 = _mm256_hadd_ps(hsum2, hsum2);
763
+ hsum7 = _mm256_hadd_ps(hsum3, hsum3);
764
+ hsum8 = _mm256_hadd_ps(hsum4, hsum4);
765
+
766
+ perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
767
+ perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
768
+ perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
769
+ perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
770
+
771
+ sum1 = _mm256_add_ps(perm1, hsum5);
772
+ sum2 = _mm256_add_ps(perm2, hsum6);
773
+ sum3 = _mm256_add_ps(perm3, hsum7);
774
+ sum4 = _mm256_add_ps(perm4, hsum8);
775
+
776
+ blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
777
+ blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
778
+
779
+ __m256 final_1 = _mm256_blend_ps(blend1, blend2, 0xf0);
780
+
781
+ hsum1 = _mm256_hadd_ps(vecs8_1, vecs9_1);
782
+ hsum2 = _mm256_hadd_ps(vecs10_1, vecs11_1);
783
+ hsum3 = _mm256_hadd_ps(vecs12_1, vecs13_1);
784
+ hsum4 = _mm256_hadd_ps(vecs14_1, vecs15_1);
785
+
786
+ hsum5 = _mm256_hadd_ps(hsum1, hsum1);
787
+ hsum6 = _mm256_hadd_ps(hsum2, hsum2);
788
+ hsum7 = _mm256_hadd_ps(hsum3, hsum3);
789
+ hsum8 = _mm256_hadd_ps(hsum4, hsum4);
790
+
791
+ perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
792
+ perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
793
+ perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
794
+ perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
795
+
796
+ sum1 = _mm256_add_ps(perm1, hsum5);
797
+ sum2 = _mm256_add_ps(perm2, hsum6);
798
+ sum3 = _mm256_add_ps(perm3, hsum7);
799
+ sum4 = _mm256_add_ps(perm4, hsum8);
800
+
801
+ blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
802
+ blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
803
+
804
+ final_1 = padd(final_1, _mm256_blend_ps(blend1, blend2, 0xf0));
805
+
806
+ __m512 final_output;
807
+
808
+ EIGEN_INSERT_8f_INTO_16f(final_output, final, final_1);
809
+ return final_output;
810
+ }
811
+
812
+ template<> EIGEN_STRONG_INLINE Packet8d preduxp<Packet8d>(const Packet8d* vecs)
813
+ {
814
+ Packet4d vecs0_0 = _mm512_extractf64x4_pd(vecs[0], 0);
815
+ Packet4d vecs0_1 = _mm512_extractf64x4_pd(vecs[0], 1);
816
+
817
+ Packet4d vecs1_0 = _mm512_extractf64x4_pd(vecs[1], 0);
818
+ Packet4d vecs1_1 = _mm512_extractf64x4_pd(vecs[1], 1);
819
+
820
+ Packet4d vecs2_0 = _mm512_extractf64x4_pd(vecs[2], 0);
821
+ Packet4d vecs2_1 = _mm512_extractf64x4_pd(vecs[2], 1);
822
+
823
+ Packet4d vecs3_0 = _mm512_extractf64x4_pd(vecs[3], 0);
824
+ Packet4d vecs3_1 = _mm512_extractf64x4_pd(vecs[3], 1);
825
+
826
+ Packet4d vecs4_0 = _mm512_extractf64x4_pd(vecs[4], 0);
827
+ Packet4d vecs4_1 = _mm512_extractf64x4_pd(vecs[4], 1);
828
+
829
+ Packet4d vecs5_0 = _mm512_extractf64x4_pd(vecs[5], 0);
830
+ Packet4d vecs5_1 = _mm512_extractf64x4_pd(vecs[5], 1);
831
+
832
+ Packet4d vecs6_0 = _mm512_extractf64x4_pd(vecs[6], 0);
833
+ Packet4d vecs6_1 = _mm512_extractf64x4_pd(vecs[6], 1);
834
+
835
+ Packet4d vecs7_0 = _mm512_extractf64x4_pd(vecs[7], 0);
836
+ Packet4d vecs7_1 = _mm512_extractf64x4_pd(vecs[7], 1);
837
+
838
+ Packet4d tmp0, tmp1;
839
+
840
+ tmp0 = _mm256_hadd_pd(vecs0_0, vecs1_0);
841
+ tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
842
+
843
+ tmp1 = _mm256_hadd_pd(vecs2_0, vecs3_0);
844
+ tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
845
+
846
+ __m256d final_0 = _mm256_blend_pd(tmp0, tmp1, 0xC);
847
+
848
+ tmp0 = _mm256_hadd_pd(vecs0_1, vecs1_1);
849
+ tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
850
+
851
+ tmp1 = _mm256_hadd_pd(vecs2_1, vecs3_1);
852
+ tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
853
+
854
+ final_0 = padd(final_0, _mm256_blend_pd(tmp0, tmp1, 0xC));
855
+
856
+ tmp0 = _mm256_hadd_pd(vecs4_0, vecs5_0);
857
+ tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
858
+
859
+ tmp1 = _mm256_hadd_pd(vecs6_0, vecs7_0);
860
+ tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
861
+
862
+ __m256d final_1 = _mm256_blend_pd(tmp0, tmp1, 0xC);
863
+
864
+ tmp0 = _mm256_hadd_pd(vecs4_1, vecs5_1);
865
+ tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
866
+
867
+ tmp1 = _mm256_hadd_pd(vecs6_1, vecs7_1);
868
+ tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
869
+
870
+ final_1 = padd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC));
871
+
872
+ __m512d final_output = _mm512_insertf64x4(final_output, final_0, 0);
873
+
874
+ return _mm512_insertf64x4(final_output, final_1, 1);
875
+ }
876
+
877
+ template <>
878
+ EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
879
+ //#ifdef EIGEN_VECTORIZE_AVX512DQ
880
+ #if 0
881
+ Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
882
+ Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
883
+ Packet8f sum = padd(lane0, lane1);
884
+ Packet8f tmp0 = _mm256_hadd_ps(sum, _mm256_permute2f128_ps(a, a, 1));
885
+ tmp0 = _mm256_hadd_ps(tmp0, tmp0);
886
+ return pfirst(_mm256_hadd_ps(tmp0, tmp0));
887
+ #else
888
+ Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
889
+ Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
890
+ Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
891
+ Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
892
+ Packet4f sum = padd(padd(lane0, lane1), padd(lane2, lane3));
893
+ sum = _mm_hadd_ps(sum, sum);
894
+ sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
895
+ return pfirst(sum);
896
+ #endif
897
+ }
898
+ template <>
899
+ EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
900
+ Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
901
+ Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
902
+ Packet4d sum = padd(lane0, lane1);
903
+ Packet4d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
904
+ return pfirst(_mm256_hadd_pd(tmp0, tmp0));
905
+ }
906
+
907
+ template <>
908
+ EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
909
+ #ifdef EIGEN_VECTORIZE_AVX512DQ
910
+ Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
911
+ Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
912
+ return padd(lane0, lane1);
913
+ #else
914
+ Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
915
+ Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
916
+ Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
917
+ Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
918
+ Packet4f sum0 = padd(lane0, lane2);
919
+ Packet4f sum1 = padd(lane1, lane3);
920
+ return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
921
+ #endif
922
+ }
923
+ template <>
924
+ EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {
925
+ Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
926
+ Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
927
+ Packet4d res = padd(lane0, lane1);
928
+ return res;
929
+ }
930
+
931
+ template <>
932
+ EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
933
+ //#ifdef EIGEN_VECTORIZE_AVX512DQ
934
+ #if 0
935
+ Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
936
+ Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
937
+ Packet8f res = pmul(lane0, lane1);
938
+ res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
939
+ res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
940
+ return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
941
+ #else
942
+ Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
943
+ Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
944
+ Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
945
+ Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
946
+ Packet4f res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
947
+ res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
948
+ return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
949
+ #endif
950
+ }
951
+ template <>
952
+ EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
953
+ Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
954
+ Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
955
+ Packet4d res = pmul(lane0, lane1);
956
+ res = pmul(res, _mm256_permute2f128_pd(res, res, 1));
957
+ return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1)));
958
+ }
959
+
960
+ template <>
961
+ EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
962
+ Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
963
+ Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
964
+ Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
965
+ Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
966
+ Packet4f res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
967
+ res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
968
+ return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
969
+ }
970
+ template <>
971
+ EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
972
+ Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
973
+ Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
974
+ Packet4d res = _mm256_min_pd(lane0, lane1);
975
+ res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
976
+ return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
977
+ }
978
+
979
+ template <>
980
+ EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
981
+ Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
982
+ Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
983
+ Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
984
+ Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
985
+ Packet4f res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
986
+ res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
987
+ return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
988
+ }
989
+ template <>
990
+ EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
991
+ Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
992
+ Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
993
+ Packet4d res = _mm256_max_pd(lane0, lane1);
994
+ res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
995
+ return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
996
+ }
997
+
998
+ template <int Offset>
999
+ struct palign_impl<Offset, Packet16f> {
1000
+ static EIGEN_STRONG_INLINE void run(Packet16f& first,
1001
+ const Packet16f& second) {
1002
+ if (Offset != 0) {
1003
+ __m512i first_idx = _mm512_set_epi32(
1004
+ Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11,
1005
+ Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6,
1006
+ Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset);
1007
+
1008
+ __m512i second_idx =
1009
+ _mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4,
1010
+ Offset - 5, Offset - 6, Offset - 7, Offset - 8,
1011
+ Offset - 9, Offset - 10, Offset - 11, Offset - 12,
1012
+ Offset - 13, Offset - 14, Offset - 15, Offset - 16);
1013
+
1014
+ unsigned short mask = 0xFFFF;
1015
+ mask <<= (16 - Offset);
1016
+
1017
+ first = _mm512_permutexvar_ps(first_idx, first);
1018
+ Packet16f tmp = _mm512_permutexvar_ps(second_idx, second);
1019
+ first = _mm512_mask_blend_ps(mask, first, tmp);
1020
+ }
1021
+ }
1022
+ };
1023
+ template <int Offset>
1024
+ struct palign_impl<Offset, Packet8d> {
1025
+ static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) {
1026
+ if (Offset != 0) {
1027
+ __m512i first_idx = _mm512_set_epi32(
1028
+ 0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0,
1029
+ Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset);
1030
+
1031
+ __m512i second_idx = _mm512_set_epi32(
1032
+ 0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0,
1033
+ Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8);
1034
+
1035
+ unsigned char mask = 0xFF;
1036
+ mask <<= (8 - Offset);
1037
+
1038
+ first = _mm512_permutexvar_pd(first_idx, first);
1039
+ Packet8d tmp = _mm512_permutexvar_pd(second_idx, second);
1040
+ first = _mm512_mask_blend_pd(mask, first, tmp);
1041
+ }
1042
+ }
1043
+ };
1044
+
1045
+
1046
// PACK_OUTPUT(DST, SRC, IDX, STEP)
// Builds the 512-bit packet DST[IDX] from two 256-bit packets:
// SRC[IDX] becomes the lower half and SRC[IDX + STEP] the upper half.
#define PACK_OUTPUT(DST, SRC, IDX, STEP) \
  EIGEN_INSERT_8f_INTO_16f(DST[IDX], SRC[IDX], SRC[IDX + STEP]);
1048
+
1049
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
1050
+ __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
1051
+ __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
1052
+ __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
1053
+ __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
1054
+ __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
1055
+ __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
1056
+ __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
1057
+ __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
1058
+ __m512 T8 = _mm512_unpacklo_ps(kernel.packet[8], kernel.packet[9]);
1059
+ __m512 T9 = _mm512_unpackhi_ps(kernel.packet[8], kernel.packet[9]);
1060
+ __m512 T10 = _mm512_unpacklo_ps(kernel.packet[10], kernel.packet[11]);
1061
+ __m512 T11 = _mm512_unpackhi_ps(kernel.packet[10], kernel.packet[11]);
1062
+ __m512 T12 = _mm512_unpacklo_ps(kernel.packet[12], kernel.packet[13]);
1063
+ __m512 T13 = _mm512_unpackhi_ps(kernel.packet[12], kernel.packet[13]);
1064
+ __m512 T14 = _mm512_unpacklo_ps(kernel.packet[14], kernel.packet[15]);
1065
+ __m512 T15 = _mm512_unpackhi_ps(kernel.packet[14], kernel.packet[15]);
1066
+ __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
1067
+ __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
1068
+ __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
1069
+ __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
1070
+ __m512 S4 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
1071
+ __m512 S5 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
1072
+ __m512 S6 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
1073
+ __m512 S7 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
1074
+ __m512 S8 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(1, 0, 1, 0));
1075
+ __m512 S9 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(3, 2, 3, 2));
1076
+ __m512 S10 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(1, 0, 1, 0));
1077
+ __m512 S11 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(3, 2, 3, 2));
1078
+ __m512 S12 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(1, 0, 1, 0));
1079
+ __m512 S13 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(3, 2, 3, 2));
1080
+ __m512 S14 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(1, 0, 1, 0));
1081
+ __m512 S15 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(3, 2, 3, 2));
1082
+
1083
+ EIGEN_EXTRACT_8f_FROM_16f(S0, S0);
1084
+ EIGEN_EXTRACT_8f_FROM_16f(S1, S1);
1085
+ EIGEN_EXTRACT_8f_FROM_16f(S2, S2);
1086
+ EIGEN_EXTRACT_8f_FROM_16f(S3, S3);
1087
+ EIGEN_EXTRACT_8f_FROM_16f(S4, S4);
1088
+ EIGEN_EXTRACT_8f_FROM_16f(S5, S5);
1089
+ EIGEN_EXTRACT_8f_FROM_16f(S6, S6);
1090
+ EIGEN_EXTRACT_8f_FROM_16f(S7, S7);
1091
+ EIGEN_EXTRACT_8f_FROM_16f(S8, S8);
1092
+ EIGEN_EXTRACT_8f_FROM_16f(S9, S9);
1093
+ EIGEN_EXTRACT_8f_FROM_16f(S10, S10);
1094
+ EIGEN_EXTRACT_8f_FROM_16f(S11, S11);
1095
+ EIGEN_EXTRACT_8f_FROM_16f(S12, S12);
1096
+ EIGEN_EXTRACT_8f_FROM_16f(S13, S13);
1097
+ EIGEN_EXTRACT_8f_FROM_16f(S14, S14);
1098
+ EIGEN_EXTRACT_8f_FROM_16f(S15, S15);
1099
+
1100
+ PacketBlock<Packet8f, 32> tmp;
1101
+
1102
+ tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S4_0, 0x20);
1103
+ tmp.packet[1] = _mm256_permute2f128_ps(S1_0, S5_0, 0x20);
1104
+ tmp.packet[2] = _mm256_permute2f128_ps(S2_0, S6_0, 0x20);
1105
+ tmp.packet[3] = _mm256_permute2f128_ps(S3_0, S7_0, 0x20);
1106
+ tmp.packet[4] = _mm256_permute2f128_ps(S0_0, S4_0, 0x31);
1107
+ tmp.packet[5] = _mm256_permute2f128_ps(S1_0, S5_0, 0x31);
1108
+ tmp.packet[6] = _mm256_permute2f128_ps(S2_0, S6_0, 0x31);
1109
+ tmp.packet[7] = _mm256_permute2f128_ps(S3_0, S7_0, 0x31);
1110
+
1111
+ tmp.packet[8] = _mm256_permute2f128_ps(S0_1, S4_1, 0x20);
1112
+ tmp.packet[9] = _mm256_permute2f128_ps(S1_1, S5_1, 0x20);
1113
+ tmp.packet[10] = _mm256_permute2f128_ps(S2_1, S6_1, 0x20);
1114
+ tmp.packet[11] = _mm256_permute2f128_ps(S3_1, S7_1, 0x20);
1115
+ tmp.packet[12] = _mm256_permute2f128_ps(S0_1, S4_1, 0x31);
1116
+ tmp.packet[13] = _mm256_permute2f128_ps(S1_1, S5_1, 0x31);
1117
+ tmp.packet[14] = _mm256_permute2f128_ps(S2_1, S6_1, 0x31);
1118
+ tmp.packet[15] = _mm256_permute2f128_ps(S3_1, S7_1, 0x31);
1119
+
1120
+ // Second set of _m256 outputs
1121
+ tmp.packet[16] = _mm256_permute2f128_ps(S8_0, S12_0, 0x20);
1122
+ tmp.packet[17] = _mm256_permute2f128_ps(S9_0, S13_0, 0x20);
1123
+ tmp.packet[18] = _mm256_permute2f128_ps(S10_0, S14_0, 0x20);
1124
+ tmp.packet[19] = _mm256_permute2f128_ps(S11_0, S15_0, 0x20);
1125
+ tmp.packet[20] = _mm256_permute2f128_ps(S8_0, S12_0, 0x31);
1126
+ tmp.packet[21] = _mm256_permute2f128_ps(S9_0, S13_0, 0x31);
1127
+ tmp.packet[22] = _mm256_permute2f128_ps(S10_0, S14_0, 0x31);
1128
+ tmp.packet[23] = _mm256_permute2f128_ps(S11_0, S15_0, 0x31);
1129
+
1130
+ tmp.packet[24] = _mm256_permute2f128_ps(S8_1, S12_1, 0x20);
1131
+ tmp.packet[25] = _mm256_permute2f128_ps(S9_1, S13_1, 0x20);
1132
+ tmp.packet[26] = _mm256_permute2f128_ps(S10_1, S14_1, 0x20);
1133
+ tmp.packet[27] = _mm256_permute2f128_ps(S11_1, S15_1, 0x20);
1134
+ tmp.packet[28] = _mm256_permute2f128_ps(S8_1, S12_1, 0x31);
1135
+ tmp.packet[29] = _mm256_permute2f128_ps(S9_1, S13_1, 0x31);
1136
+ tmp.packet[30] = _mm256_permute2f128_ps(S10_1, S14_1, 0x31);
1137
+ tmp.packet[31] = _mm256_permute2f128_ps(S11_1, S15_1, 0x31);
1138
+
1139
+ // Pack them into the output
1140
+ PACK_OUTPUT(kernel.packet, tmp.packet, 0, 16);
1141
+ PACK_OUTPUT(kernel.packet, tmp.packet, 1, 16);
1142
+ PACK_OUTPUT(kernel.packet, tmp.packet, 2, 16);
1143
+ PACK_OUTPUT(kernel.packet, tmp.packet, 3, 16);
1144
+
1145
+ PACK_OUTPUT(kernel.packet, tmp.packet, 4, 16);
1146
+ PACK_OUTPUT(kernel.packet, tmp.packet, 5, 16);
1147
+ PACK_OUTPUT(kernel.packet, tmp.packet, 6, 16);
1148
+ PACK_OUTPUT(kernel.packet, tmp.packet, 7, 16);
1149
+
1150
+ PACK_OUTPUT(kernel.packet, tmp.packet, 8, 16);
1151
+ PACK_OUTPUT(kernel.packet, tmp.packet, 9, 16);
1152
+ PACK_OUTPUT(kernel.packet, tmp.packet, 10, 16);
1153
+ PACK_OUTPUT(kernel.packet, tmp.packet, 11, 16);
1154
+
1155
+ PACK_OUTPUT(kernel.packet, tmp.packet, 12, 16);
1156
+ PACK_OUTPUT(kernel.packet, tmp.packet, 13, 16);
1157
+ PACK_OUTPUT(kernel.packet, tmp.packet, 14, 16);
1158
+ PACK_OUTPUT(kernel.packet, tmp.packet, 15, 16);
1159
+ }
1160
// PACK_OUTPUT_2(DST, SRC, IDX, STEP)
// Builds the 512-bit packet DST[IDX] from two 256-bit packets:
// SRC[2 * IDX] becomes the lower half and SRC[2 * IDX + STEP] the upper half.
#define PACK_OUTPUT_2(DST, SRC, IDX, STEP) \
  EIGEN_INSERT_8f_INTO_16f(DST[IDX], SRC[2 * IDX], SRC[2 * IDX + STEP]);
1163
+
1164
// In-place transpose of a block of 4 Packet16f, viewed as a 4 x 16 float
// matrix with one packet per row. Writing p0..p3 for the input packets, the
// result is the 16 groups {p0[j], p1[j], p2[j], p3[j]}, j = 0..15, stored back
// to back: kernel.packet[i] receives groups 4*i .. 4*i+3.
// The first four statements interleave p0/p1 and p2/p3 element-wise inside
// each 128-bit lane (unpacklo takes the lane's lower two elements of each
// operand, unpackhi the upper two).
// NOTE(review): the final layout assumes EIGEN_EXTRACT_8f_FROM_16f and
// EIGEN_INSERT_8f_INTO_16f (both defined outside this chunk) treat "_0" /
// the first packet as the low 256 bits -- confirm against their definitions.
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
1165
+ __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
1166
+ __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
1167
+ __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
1168
+ __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
1169
+ // After this step, 128-bit lane k of S<m> is {p0[j], p1[j], p2[j], p3[j]} with j = 4*k + m.
1170
+ __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
1171
+ __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
1172
+ __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
1173
+ __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
1174
+ // The code below reads S<m>_0 and S<m>_1, so the macro presumably declares them as the two 256-bit halves of S<m>.
1175
+ EIGEN_EXTRACT_8f_FROM_16f(S0, S0);
1176
+ EIGEN_EXTRACT_8f_FROM_16f(S1, S1);
1177
+ EIGEN_EXTRACT_8f_FROM_16f(S2, S2);
1178
+ EIGEN_EXTRACT_8f_FROM_16f(S3, S3);
1179
+
1180
+ PacketBlock<Packet8f, 8> tmp;
1181
+ // "_0" halves: selector 0x20 pairs the low 128-bit lanes of both operands, 0x31 pairs their high lanes.
1182
+ tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S1_0, 0x20);
1183
+ tmp.packet[1] = _mm256_permute2f128_ps(S2_0, S3_0, 0x20);
1184
+ tmp.packet[2] = _mm256_permute2f128_ps(S0_0, S1_0, 0x31);
1185
+ tmp.packet[3] = _mm256_permute2f128_ps(S2_0, S3_0, 0x31);
1186
+ // Same recombination for the "_1" halves.
1187
+ tmp.packet[4] = _mm256_permute2f128_ps(S0_1, S1_1, 0x20);
1188
+ tmp.packet[5] = _mm256_permute2f128_ps(S2_1, S3_1, 0x20);
1189
+ tmp.packet[6] = _mm256_permute2f128_ps(S0_1, S1_1, 0x31);
1190
+ tmp.packet[7] = _mm256_permute2f128_ps(S2_1, S3_1, 0x31);
1191
+ // Join the consecutive pair tmp[2*i], tmp[2*i + 1] into kernel.packet[i] (STRIDE is 1).
1192
+ PACK_OUTPUT_2(kernel.packet, tmp.packet, 0, 1);
1193
+ PACK_OUTPUT_2(kernel.packet, tmp.packet, 1, 1);
1194
+ PACK_OUTPUT_2(kernel.packet, tmp.packet, 2, 1);
1195
+ PACK_OUTPUT_2(kernel.packet, tmp.packet, 3, 1);
1196
+ }
1197
+
1198
// PACK_OUTPUT_SQ_D(OUTPUT, INPUT, INDEX, STRIDE): builds the 512-bit value
// OUTPUT[INDEX] from two 256-bit values. INPUT[INDEX] is inserted at position
// 0 (low half) and INPUT[INDEX + STRIDE] at position 1 (high half).
// Used by the 8 x 8 double transpose in this file with STRIDE = 8.
// Expands to two statements, so it must not be used as the unbraced body of
// an if/for; INDEX and STRIDE are substituted without parentheses.
+ #define PACK_OUTPUT_SQ_D(OUTPUT, INPUT, INDEX, STRIDE) \
1199
+ OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX], 0); \
1200
+ OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX + STRIDE], 1);
1201
+
1202
// PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE): builds the 512-bit value
// OUTPUT[INDEX] from two 256-bit values. INPUT[2 * INDEX] is inserted at
// position 0 (low half) and INPUT[2 * INDEX + STRIDE] at position 1 (high
// half). Used by the 4-packet double transpose in this file with STRIDE = 1,
// i.e. it joins consecutive pairs of INPUT.
// Expands to two statements, so it must not be used as the unbraced body of
// an if/for; INDEX and STRIDE are substituted without their own parentheses.
+ #define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE) \
1203
+ OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \
1204
+ OUTPUT[INDEX] = \
1205
+ _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
1206
+
1207
// In-place transpose of a block of 4 Packet8d, viewed as a 4 x 8 double
// matrix with one packet per row. Writing p0..p3 for the input packets, the
// result is the 8 groups {p0[j], p1[j], p2[j], p3[j]}, j = 0..7, stored back
// to back: kernel.packet[i] receives group 2*i in its low 256 bits and group
// 2*i + 1 in its high 256 bits.
// The first four statements interleave p0/p1 and p2/p3 inside each 128-bit
// lane: shuffle mask 0 takes the lower double of every lane from both
// operands, mask 0xff takes the upper double.
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 4>& kernel) {
1208
+ __m512d T0 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
1209
+ __m512d T1 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0xff);
1210
+ __m512d T2 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
1211
+ __m512d T3 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0xff);
1212
+ // Now T0 = {p0[0], p1[0], p0[2], p1[2], ...}, T1 = {p0[1], p1[1], p0[3], p1[3], ...}; T2/T3 likewise for p2/p3.
1213
+ PacketBlock<Packet4d, 8> tmp;
1214
+ // Low 256-bit halves of T*: tmp.packet[j] = {p0[j], p1[j], p2[j], p3[j]} for j = 0..3.
1215
+ tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
1216
+ _mm512_extractf64x4_pd(T2, 0), 0x20);
1217
+ tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
1218
+ _mm512_extractf64x4_pd(T3, 0), 0x20);
1219
+ tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
1220
+ _mm512_extractf64x4_pd(T2, 0), 0x31);
1221
+ tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
1222
+ _mm512_extractf64x4_pd(T3, 0), 0x31);
1223
+ // High 256-bit halves of T*: the same construction for j = 4..7.
1224
+ tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
1225
+ _mm512_extractf64x4_pd(T2, 1), 0x20);
1226
+ tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
1227
+ _mm512_extractf64x4_pd(T3, 1), 0x20);
1228
+ tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
1229
+ _mm512_extractf64x4_pd(T2, 1), 0x31);
1230
+ tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
1231
+ _mm512_extractf64x4_pd(T3, 1), 0x31);
1232
+ // kernel.packet[i] = tmp.packet[2*i] (low half) followed by tmp.packet[2*i + 1] (high half).
1233
+ PACK_OUTPUT_D(kernel.packet, tmp.packet, 0, 1);
1234
+ PACK_OUTPUT_D(kernel.packet, tmp.packet, 1, 1);
1235
+ PACK_OUTPUT_D(kernel.packet, tmp.packet, 2, 1);
1236
+ PACK_OUTPUT_D(kernel.packet, tmp.packet, 3, 1);
1237
+ }
1238
+
1239
// In-place transpose of a block of 8 Packet8d, viewed as an 8 x 8 double
// matrix with one packet per row. Writing p0..p7 for the input packets, on
// return kernel.packet[i] = {p0[i], p1[i], ..., p7[i]}.
// The first eight statements interleave each packet pair (p0/p1, p2/p3,
// p4/p5, p6/p7) inside every 128-bit lane: unpacklo takes the lower double of
// each lane from both operands, unpackhi the upper double.
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 8>& kernel) {
1240
+ __m512d T0 = _mm512_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
1241
+ __m512d T1 = _mm512_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
1242
+ __m512d T2 = _mm512_unpacklo_pd(kernel.packet[2], kernel.packet[3]);
1243
+ __m512d T3 = _mm512_unpackhi_pd(kernel.packet[2], kernel.packet[3]);
1244
+ __m512d T4 = _mm512_unpacklo_pd(kernel.packet[4], kernel.packet[5]);
1245
+ __m512d T5 = _mm512_unpackhi_pd(kernel.packet[4], kernel.packet[5]);
1246
+ __m512d T6 = _mm512_unpacklo_pd(kernel.packet[6], kernel.packet[7]);
1247
+ __m512d T7 = _mm512_unpackhi_pd(kernel.packet[6], kernel.packet[7]);
1248
+ // Even-numbered T* carry the even-indexed elements of their packet pair, odd-numbered T* the odd-indexed ones.
1249
+ PacketBlock<Packet4d, 16> tmp;
1250
+ // tmp.packet[j] = {p0[j], p1[j], p2[j], p3[j]} for j = 0..3 (low 256-bit halves of T0..T3).
1251
+ tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
1252
+ _mm512_extractf64x4_pd(T2, 0), 0x20);
1253
+ tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
1254
+ _mm512_extractf64x4_pd(T3, 0), 0x20);
1255
+ tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
1256
+ _mm512_extractf64x4_pd(T2, 0), 0x31);
1257
+ tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
1258
+ _mm512_extractf64x4_pd(T3, 0), 0x31);
1259
+ // Same for j = 4..7 (high 256-bit halves of T0..T3).
1260
+ tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
1261
+ _mm512_extractf64x4_pd(T2, 1), 0x20);
1262
+ tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
1263
+ _mm512_extractf64x4_pd(T3, 1), 0x20);
1264
+ tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
1265
+ _mm512_extractf64x4_pd(T2, 1), 0x31);
1266
+ tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
1267
+ _mm512_extractf64x4_pd(T3, 1), 0x31);
1268
+ // tmp.packet[8 + j] = {p4[j], p5[j], p6[j], p7[j]} for j = 0..3 (low 256-bit halves of T4..T7).
1269
+ tmp.packet[8] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0),
1270
+ _mm512_extractf64x4_pd(T6, 0), 0x20);
1271
+ tmp.packet[9] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0),
1272
+ _mm512_extractf64x4_pd(T7, 0), 0x20);
1273
+ tmp.packet[10] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0),
1274
+ _mm512_extractf64x4_pd(T6, 0), 0x31);
1275
+ tmp.packet[11] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0),
1276
+ _mm512_extractf64x4_pd(T7, 0), 0x31);
1277
+ // Same for j = 4..7 (high 256-bit halves of T4..T7).
1278
+ tmp.packet[12] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1),
1279
+ _mm512_extractf64x4_pd(T6, 1), 0x20);
1280
+ tmp.packet[13] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1),
1281
+ _mm512_extractf64x4_pd(T7, 1), 0x20);
1282
+ tmp.packet[14] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1),
1283
+ _mm512_extractf64x4_pd(T6, 1), 0x31);
1284
+ tmp.packet[15] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1),
1285
+ _mm512_extractf64x4_pd(T7, 1), 0x31);
1286
+ // kernel.packet[i] = tmp.packet[i] (low half) followed by tmp.packet[i + 8] (high half).
1287
+ PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 0, 8);
1288
+ PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 1, 8);
1289
+ PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 2, 8);
1290
+ PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 3, 8);
1291
+
1292
+ PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 4, 8);
1293
+ PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 5, 8);
1294
+ PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 6, 8);
1295
+ PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 7, 8);
1296
+ }
1297
// pblend specialization for Packet16f: NOT implemented.
// All three parameters are unnamed (their names are commented out) and
// ignored. The body asserts unconditionally with the message
// "To be implemented"; if that assert is compiled out (e.g. the standard
// assert macro under NDEBUG), the function returns a value-initialized
// Packet16f instead of a blend of thenPacket/elsePacket.
+ template <>
1298
+ EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/,
1299
+ const Packet16f& /*thenPacket*/,
1300
+ const Packet16f& /*elsePacket*/) {
1301
+ assert(false && "To be implemented");
1302
+ return Packet16f();
1303
+ }
1304
// pblend specialization for Packet8d: NOT implemented.
// All three parameters are unnamed (their names are commented out) and
// ignored. The body asserts unconditionally with the message
// "To be implemented"; if that assert is compiled out (e.g. the standard
// assert macro under NDEBUG), the function returns a value-initialized
// Packet8d instead of a blend of thenPacket/elsePacket.
+ template <>
1305
+ EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& /*ifPacket*/,
1306
+ const Packet8d& /*thenPacket*/,
1307
+ const Packet8d& /*elsePacket*/) {
1308
+ assert(false && "To be implemented");
1309
+ return Packet8d();
1310
+ }
1311
+
1312
+ } // end namespace internal
1313
+
1314
+ } // end namespace Eigen
1315
+
1316
+ #endif // EIGEN_PACKET_MATH_AVX512_H