tomoto 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (420)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +123 -0
  5. data/ext/tomoto/ext.cpp +245 -0
  6. data/ext/tomoto/extconf.rb +28 -0
  7. data/lib/tomoto.rb +12 -0
  8. data/lib/tomoto/ct.rb +11 -0
  9. data/lib/tomoto/hdp.rb +11 -0
  10. data/lib/tomoto/lda.rb +67 -0
  11. data/lib/tomoto/version.rb +3 -0
  12. data/vendor/EigenRand/EigenRand/Core.h +1139 -0
  13. data/vendor/EigenRand/EigenRand/Dists/Basic.h +111 -0
  14. data/vendor/EigenRand/EigenRand/Dists/Discrete.h +877 -0
  15. data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +108 -0
  16. data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +626 -0
  17. data/vendor/EigenRand/EigenRand/EigenRand +19 -0
  18. data/vendor/EigenRand/EigenRand/Macro.h +24 -0
  19. data/vendor/EigenRand/EigenRand/MorePacketMath.h +978 -0
  20. data/vendor/EigenRand/EigenRand/PacketFilter.h +286 -0
  21. data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +624 -0
  22. data/vendor/EigenRand/EigenRand/RandUtils.h +413 -0
  23. data/vendor/EigenRand/EigenRand/doc.h +220 -0
  24. data/vendor/EigenRand/LICENSE +21 -0
  25. data/vendor/EigenRand/README.md +288 -0
  26. data/vendor/eigen/COPYING.BSD +26 -0
  27. data/vendor/eigen/COPYING.GPL +674 -0
  28. data/vendor/eigen/COPYING.LGPL +502 -0
  29. data/vendor/eigen/COPYING.MINPACK +52 -0
  30. data/vendor/eigen/COPYING.MPL2 +373 -0
  31. data/vendor/eigen/COPYING.README +18 -0
  32. data/vendor/eigen/Eigen/CMakeLists.txt +19 -0
  33. data/vendor/eigen/Eigen/Cholesky +46 -0
  34. data/vendor/eigen/Eigen/CholmodSupport +48 -0
  35. data/vendor/eigen/Eigen/Core +537 -0
  36. data/vendor/eigen/Eigen/Dense +7 -0
  37. data/vendor/eigen/Eigen/Eigen +2 -0
  38. data/vendor/eigen/Eigen/Eigenvalues +61 -0
  39. data/vendor/eigen/Eigen/Geometry +62 -0
  40. data/vendor/eigen/Eigen/Householder +30 -0
  41. data/vendor/eigen/Eigen/IterativeLinearSolvers +48 -0
  42. data/vendor/eigen/Eigen/Jacobi +33 -0
  43. data/vendor/eigen/Eigen/LU +50 -0
  44. data/vendor/eigen/Eigen/MetisSupport +35 -0
  45. data/vendor/eigen/Eigen/OrderingMethods +73 -0
  46. data/vendor/eigen/Eigen/PaStiXSupport +48 -0
  47. data/vendor/eigen/Eigen/PardisoSupport +35 -0
  48. data/vendor/eigen/Eigen/QR +51 -0
  49. data/vendor/eigen/Eigen/QtAlignedMalloc +40 -0
  50. data/vendor/eigen/Eigen/SPQRSupport +34 -0
  51. data/vendor/eigen/Eigen/SVD +51 -0
  52. data/vendor/eigen/Eigen/Sparse +36 -0
  53. data/vendor/eigen/Eigen/SparseCholesky +45 -0
  54. data/vendor/eigen/Eigen/SparseCore +69 -0
  55. data/vendor/eigen/Eigen/SparseLU +46 -0
  56. data/vendor/eigen/Eigen/SparseQR +37 -0
  57. data/vendor/eigen/Eigen/StdDeque +27 -0
  58. data/vendor/eigen/Eigen/StdList +26 -0
  59. data/vendor/eigen/Eigen/StdVector +27 -0
  60. data/vendor/eigen/Eigen/SuperLUSupport +64 -0
  61. data/vendor/eigen/Eigen/UmfPackSupport +40 -0
  62. data/vendor/eigen/Eigen/src/Cholesky/LDLT.h +673 -0
  63. data/vendor/eigen/Eigen/src/Cholesky/LLT.h +542 -0
  64. data/vendor/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +99 -0
  65. data/vendor/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +639 -0
  66. data/vendor/eigen/Eigen/src/Core/Array.h +329 -0
  67. data/vendor/eigen/Eigen/src/Core/ArrayBase.h +226 -0
  68. data/vendor/eigen/Eigen/src/Core/ArrayWrapper.h +209 -0
  69. data/vendor/eigen/Eigen/src/Core/Assign.h +90 -0
  70. data/vendor/eigen/Eigen/src/Core/AssignEvaluator.h +935 -0
  71. data/vendor/eigen/Eigen/src/Core/Assign_MKL.h +178 -0
  72. data/vendor/eigen/Eigen/src/Core/BandMatrix.h +353 -0
  73. data/vendor/eigen/Eigen/src/Core/Block.h +452 -0
  74. data/vendor/eigen/Eigen/src/Core/BooleanRedux.h +164 -0
  75. data/vendor/eigen/Eigen/src/Core/CommaInitializer.h +160 -0
  76. data/vendor/eigen/Eigen/src/Core/ConditionEstimator.h +175 -0
  77. data/vendor/eigen/Eigen/src/Core/CoreEvaluators.h +1688 -0
  78. data/vendor/eigen/Eigen/src/Core/CoreIterators.h +127 -0
  79. data/vendor/eigen/Eigen/src/Core/CwiseBinaryOp.h +184 -0
  80. data/vendor/eigen/Eigen/src/Core/CwiseNullaryOp.h +866 -0
  81. data/vendor/eigen/Eigen/src/Core/CwiseTernaryOp.h +197 -0
  82. data/vendor/eigen/Eigen/src/Core/CwiseUnaryOp.h +103 -0
  83. data/vendor/eigen/Eigen/src/Core/CwiseUnaryView.h +128 -0
  84. data/vendor/eigen/Eigen/src/Core/DenseBase.h +611 -0
  85. data/vendor/eigen/Eigen/src/Core/DenseCoeffsBase.h +681 -0
  86. data/vendor/eigen/Eigen/src/Core/DenseStorage.h +570 -0
  87. data/vendor/eigen/Eigen/src/Core/Diagonal.h +260 -0
  88. data/vendor/eigen/Eigen/src/Core/DiagonalMatrix.h +343 -0
  89. data/vendor/eigen/Eigen/src/Core/DiagonalProduct.h +28 -0
  90. data/vendor/eigen/Eigen/src/Core/Dot.h +318 -0
  91. data/vendor/eigen/Eigen/src/Core/EigenBase.h +159 -0
  92. data/vendor/eigen/Eigen/src/Core/ForceAlignedAccess.h +146 -0
  93. data/vendor/eigen/Eigen/src/Core/Fuzzy.h +155 -0
  94. data/vendor/eigen/Eigen/src/Core/GeneralProduct.h +455 -0
  95. data/vendor/eigen/Eigen/src/Core/GenericPacketMath.h +593 -0
  96. data/vendor/eigen/Eigen/src/Core/GlobalFunctions.h +187 -0
  97. data/vendor/eigen/Eigen/src/Core/IO.h +225 -0
  98. data/vendor/eigen/Eigen/src/Core/Inverse.h +118 -0
  99. data/vendor/eigen/Eigen/src/Core/Map.h +171 -0
  100. data/vendor/eigen/Eigen/src/Core/MapBase.h +303 -0
  101. data/vendor/eigen/Eigen/src/Core/MathFunctions.h +1415 -0
  102. data/vendor/eigen/Eigen/src/Core/MathFunctionsImpl.h +101 -0
  103. data/vendor/eigen/Eigen/src/Core/Matrix.h +459 -0
  104. data/vendor/eigen/Eigen/src/Core/MatrixBase.h +529 -0
  105. data/vendor/eigen/Eigen/src/Core/NestByValue.h +110 -0
  106. data/vendor/eigen/Eigen/src/Core/NoAlias.h +108 -0
  107. data/vendor/eigen/Eigen/src/Core/NumTraits.h +248 -0
  108. data/vendor/eigen/Eigen/src/Core/PermutationMatrix.h +633 -0
  109. data/vendor/eigen/Eigen/src/Core/PlainObjectBase.h +1035 -0
  110. data/vendor/eigen/Eigen/src/Core/Product.h +186 -0
  111. data/vendor/eigen/Eigen/src/Core/ProductEvaluators.h +1112 -0
  112. data/vendor/eigen/Eigen/src/Core/Random.h +182 -0
  113. data/vendor/eigen/Eigen/src/Core/Redux.h +505 -0
  114. data/vendor/eigen/Eigen/src/Core/Ref.h +283 -0
  115. data/vendor/eigen/Eigen/src/Core/Replicate.h +142 -0
  116. data/vendor/eigen/Eigen/src/Core/ReturnByValue.h +117 -0
  117. data/vendor/eigen/Eigen/src/Core/Reverse.h +211 -0
  118. data/vendor/eigen/Eigen/src/Core/Select.h +162 -0
  119. data/vendor/eigen/Eigen/src/Core/SelfAdjointView.h +352 -0
  120. data/vendor/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +47 -0
  121. data/vendor/eigen/Eigen/src/Core/Solve.h +188 -0
  122. data/vendor/eigen/Eigen/src/Core/SolveTriangular.h +235 -0
  123. data/vendor/eigen/Eigen/src/Core/SolverBase.h +130 -0
  124. data/vendor/eigen/Eigen/src/Core/StableNorm.h +221 -0
  125. data/vendor/eigen/Eigen/src/Core/Stride.h +111 -0
  126. data/vendor/eigen/Eigen/src/Core/Swap.h +67 -0
  127. data/vendor/eigen/Eigen/src/Core/Transpose.h +403 -0
  128. data/vendor/eigen/Eigen/src/Core/Transpositions.h +407 -0
  129. data/vendor/eigen/Eigen/src/Core/TriangularMatrix.h +983 -0
  130. data/vendor/eigen/Eigen/src/Core/VectorBlock.h +96 -0
  131. data/vendor/eigen/Eigen/src/Core/VectorwiseOp.h +695 -0
  132. data/vendor/eigen/Eigen/src/Core/Visitor.h +273 -0
  133. data/vendor/eigen/Eigen/src/Core/arch/AVX/Complex.h +451 -0
  134. data/vendor/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +439 -0
  135. data/vendor/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +637 -0
  136. data/vendor/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +51 -0
  137. data/vendor/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +391 -0
  138. data/vendor/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1316 -0
  139. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +430 -0
  140. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +322 -0
  141. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +1061 -0
  142. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Complex.h +103 -0
  143. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Half.h +674 -0
  144. data/vendor/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +91 -0
  145. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +333 -0
  146. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +1124 -0
  147. data/vendor/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +212 -0
  148. data/vendor/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +29 -0
  149. data/vendor/eigen/Eigen/src/Core/arch/Default/Settings.h +49 -0
  150. data/vendor/eigen/Eigen/src/Core/arch/NEON/Complex.h +490 -0
  151. data/vendor/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +91 -0
  152. data/vendor/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +760 -0
  153. data/vendor/eigen/Eigen/src/Core/arch/SSE/Complex.h +471 -0
  154. data/vendor/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +562 -0
  155. data/vendor/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +895 -0
  156. data/vendor/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +77 -0
  157. data/vendor/eigen/Eigen/src/Core/arch/ZVector/Complex.h +397 -0
  158. data/vendor/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +137 -0
  159. data/vendor/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +945 -0
  160. data/vendor/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +168 -0
  161. data/vendor/eigen/Eigen/src/Core/functors/BinaryFunctors.h +475 -0
  162. data/vendor/eigen/Eigen/src/Core/functors/NullaryFunctors.h +188 -0
  163. data/vendor/eigen/Eigen/src/Core/functors/StlFunctors.h +136 -0
  164. data/vendor/eigen/Eigen/src/Core/functors/TernaryFunctors.h +25 -0
  165. data/vendor/eigen/Eigen/src/Core/functors/UnaryFunctors.h +792 -0
  166. data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2156 -0
  167. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +492 -0
  168. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +311 -0
  169. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +145 -0
  170. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +122 -0
  171. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +619 -0
  172. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +136 -0
  173. data/vendor/eigen/Eigen/src/Core/products/Parallelizer.h +163 -0
  174. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +521 -0
  175. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +287 -0
  176. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +260 -0
  177. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +118 -0
  178. data/vendor/eigen/Eigen/src/Core/products/SelfadjointProduct.h +133 -0
  179. data/vendor/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +93 -0
  180. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +466 -0
  181. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +315 -0
  182. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +350 -0
  183. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +255 -0
  184. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +335 -0
  185. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +163 -0
  186. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverVector.h +145 -0
  187. data/vendor/eigen/Eigen/src/Core/util/BlasUtil.h +398 -0
  188. data/vendor/eigen/Eigen/src/Core/util/Constants.h +547 -0
  189. data/vendor/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +83 -0
  190. data/vendor/eigen/Eigen/src/Core/util/ForwardDeclarations.h +302 -0
  191. data/vendor/eigen/Eigen/src/Core/util/MKL_support.h +130 -0
  192. data/vendor/eigen/Eigen/src/Core/util/Macros.h +1001 -0
  193. data/vendor/eigen/Eigen/src/Core/util/Memory.h +993 -0
  194. data/vendor/eigen/Eigen/src/Core/util/Meta.h +534 -0
  195. data/vendor/eigen/Eigen/src/Core/util/NonMPL2.h +3 -0
  196. data/vendor/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +27 -0
  197. data/vendor/eigen/Eigen/src/Core/util/StaticAssert.h +218 -0
  198. data/vendor/eigen/Eigen/src/Core/util/XprHelper.h +821 -0
  199. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +346 -0
  200. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +459 -0
  201. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +91 -0
  202. data/vendor/eigen/Eigen/src/Eigenvalues/EigenSolver.h +622 -0
  203. data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +418 -0
  204. data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +226 -0
  205. data/vendor/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +374 -0
  206. data/vendor/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +158 -0
  207. data/vendor/eigen/Eigen/src/Eigenvalues/RealQZ.h +654 -0
  208. data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur.h +546 -0
  209. data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +77 -0
  210. data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +870 -0
  211. data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +87 -0
  212. data/vendor/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +556 -0
  213. data/vendor/eigen/Eigen/src/Geometry/AlignedBox.h +392 -0
  214. data/vendor/eigen/Eigen/src/Geometry/AngleAxis.h +247 -0
  215. data/vendor/eigen/Eigen/src/Geometry/EulerAngles.h +114 -0
  216. data/vendor/eigen/Eigen/src/Geometry/Homogeneous.h +497 -0
  217. data/vendor/eigen/Eigen/src/Geometry/Hyperplane.h +282 -0
  218. data/vendor/eigen/Eigen/src/Geometry/OrthoMethods.h +234 -0
  219. data/vendor/eigen/Eigen/src/Geometry/ParametrizedLine.h +195 -0
  220. data/vendor/eigen/Eigen/src/Geometry/Quaternion.h +814 -0
  221. data/vendor/eigen/Eigen/src/Geometry/Rotation2D.h +199 -0
  222. data/vendor/eigen/Eigen/src/Geometry/RotationBase.h +206 -0
  223. data/vendor/eigen/Eigen/src/Geometry/Scaling.h +170 -0
  224. data/vendor/eigen/Eigen/src/Geometry/Transform.h +1542 -0
  225. data/vendor/eigen/Eigen/src/Geometry/Translation.h +208 -0
  226. data/vendor/eigen/Eigen/src/Geometry/Umeyama.h +166 -0
  227. data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +161 -0
  228. data/vendor/eigen/Eigen/src/Householder/BlockHouseholder.h +103 -0
  229. data/vendor/eigen/Eigen/src/Householder/Householder.h +172 -0
  230. data/vendor/eigen/Eigen/src/Householder/HouseholderSequence.h +470 -0
  231. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +226 -0
  232. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +228 -0
  233. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +246 -0
  234. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +400 -0
  235. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +462 -0
  236. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +394 -0
  237. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +216 -0
  238. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +115 -0
  239. data/vendor/eigen/Eigen/src/Jacobi/Jacobi.h +462 -0
  240. data/vendor/eigen/Eigen/src/LU/Determinant.h +101 -0
  241. data/vendor/eigen/Eigen/src/LU/FullPivLU.h +891 -0
  242. data/vendor/eigen/Eigen/src/LU/InverseImpl.h +415 -0
  243. data/vendor/eigen/Eigen/src/LU/PartialPivLU.h +611 -0
  244. data/vendor/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +83 -0
  245. data/vendor/eigen/Eigen/src/LU/arch/Inverse_SSE.h +338 -0
  246. data/vendor/eigen/Eigen/src/MetisSupport/MetisSupport.h +137 -0
  247. data/vendor/eigen/Eigen/src/OrderingMethods/Amd.h +445 -0
  248. data/vendor/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +1843 -0
  249. data/vendor/eigen/Eigen/src/OrderingMethods/Ordering.h +157 -0
  250. data/vendor/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +678 -0
  251. data/vendor/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +543 -0
  252. data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR.h +653 -0
  253. data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +97 -0
  254. data/vendor/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +562 -0
  255. data/vendor/eigen/Eigen/src/QR/FullPivHouseholderQR.h +676 -0
  256. data/vendor/eigen/Eigen/src/QR/HouseholderQR.h +409 -0
  257. data/vendor/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +68 -0
  258. data/vendor/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +313 -0
  259. data/vendor/eigen/Eigen/src/SVD/BDCSVD.h +1246 -0
  260. data/vendor/eigen/Eigen/src/SVD/JacobiSVD.h +804 -0
  261. data/vendor/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +91 -0
  262. data/vendor/eigen/Eigen/src/SVD/SVDBase.h +315 -0
  263. data/vendor/eigen/Eigen/src/SVD/UpperBidiagonalization.h +414 -0
  264. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +689 -0
  265. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +199 -0
  266. data/vendor/eigen/Eigen/src/SparseCore/AmbiVector.h +377 -0
  267. data/vendor/eigen/Eigen/src/SparseCore/CompressedStorage.h +258 -0
  268. data/vendor/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +352 -0
  269. data/vendor/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +67 -0
  270. data/vendor/eigen/Eigen/src/SparseCore/SparseAssign.h +216 -0
  271. data/vendor/eigen/Eigen/src/SparseCore/SparseBlock.h +603 -0
  272. data/vendor/eigen/Eigen/src/SparseCore/SparseColEtree.h +206 -0
  273. data/vendor/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +341 -0
  274. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +726 -0
  275. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +148 -0
  276. data/vendor/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +320 -0
  277. data/vendor/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +138 -0
  278. data/vendor/eigen/Eigen/src/SparseCore/SparseDot.h +98 -0
  279. data/vendor/eigen/Eigen/src/SparseCore/SparseFuzzy.h +29 -0
  280. data/vendor/eigen/Eigen/src/SparseCore/SparseMap.h +305 -0
  281. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrix.h +1403 -0
  282. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +405 -0
  283. data/vendor/eigen/Eigen/src/SparseCore/SparsePermutation.h +178 -0
  284. data/vendor/eigen/Eigen/src/SparseCore/SparseProduct.h +169 -0
  285. data/vendor/eigen/Eigen/src/SparseCore/SparseRedux.h +49 -0
  286. data/vendor/eigen/Eigen/src/SparseCore/SparseRef.h +397 -0
  287. data/vendor/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +656 -0
  288. data/vendor/eigen/Eigen/src/SparseCore/SparseSolverBase.h +124 -0
  289. data/vendor/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +198 -0
  290. data/vendor/eigen/Eigen/src/SparseCore/SparseTranspose.h +92 -0
  291. data/vendor/eigen/Eigen/src/SparseCore/SparseTriangularView.h +189 -0
  292. data/vendor/eigen/Eigen/src/SparseCore/SparseUtil.h +178 -0
  293. data/vendor/eigen/Eigen/src/SparseCore/SparseVector.h +478 -0
  294. data/vendor/eigen/Eigen/src/SparseCore/SparseView.h +253 -0
  295. data/vendor/eigen/Eigen/src/SparseCore/TriangularSolver.h +315 -0
  296. data/vendor/eigen/Eigen/src/SparseLU/SparseLU.h +773 -0
  297. data/vendor/eigen/Eigen/src/SparseLU/SparseLUImpl.h +66 -0
  298. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +226 -0
  299. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +110 -0
  300. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +301 -0
  301. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +80 -0
  302. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +181 -0
  303. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +179 -0
  304. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +107 -0
  305. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +280 -0
  306. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +126 -0
  307. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +130 -0
  308. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +223 -0
  309. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +258 -0
  310. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +137 -0
  311. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +136 -0
  312. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +83 -0
  313. data/vendor/eigen/Eigen/src/SparseQR/SparseQR.h +745 -0
  314. data/vendor/eigen/Eigen/src/StlSupport/StdDeque.h +126 -0
  315. data/vendor/eigen/Eigen/src/StlSupport/StdList.h +106 -0
  316. data/vendor/eigen/Eigen/src/StlSupport/StdVector.h +131 -0
  317. data/vendor/eigen/Eigen/src/StlSupport/details.h +84 -0
  318. data/vendor/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +1027 -0
  319. data/vendor/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +506 -0
  320. data/vendor/eigen/Eigen/src/misc/Image.h +82 -0
  321. data/vendor/eigen/Eigen/src/misc/Kernel.h +79 -0
  322. data/vendor/eigen/Eigen/src/misc/RealSvd2x2.h +55 -0
  323. data/vendor/eigen/Eigen/src/misc/blas.h +440 -0
  324. data/vendor/eigen/Eigen/src/misc/lapack.h +152 -0
  325. data/vendor/eigen/Eigen/src/misc/lapacke.h +16291 -0
  326. data/vendor/eigen/Eigen/src/misc/lapacke_mangling.h +17 -0
  327. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +332 -0
  328. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +552 -0
  329. data/vendor/eigen/Eigen/src/plugins/BlockMethods.h +1058 -0
  330. data/vendor/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +115 -0
  331. data/vendor/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +163 -0
  332. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +152 -0
  333. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +85 -0
  334. data/vendor/eigen/README.md +3 -0
  335. data/vendor/eigen/bench/README.txt +55 -0
  336. data/vendor/eigen/bench/btl/COPYING +340 -0
  337. data/vendor/eigen/bench/btl/README +154 -0
  338. data/vendor/eigen/bench/tensors/README +21 -0
  339. data/vendor/eigen/blas/README.txt +6 -0
  340. data/vendor/eigen/demos/mandelbrot/README +10 -0
  341. data/vendor/eigen/demos/mix_eigen_and_c/README +9 -0
  342. data/vendor/eigen/demos/opengl/README +13 -0
  343. data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +1760 -0
  344. data/vendor/eigen/unsupported/README.txt +50 -0
  345. data/vendor/tomotopy/LICENSE +21 -0
  346. data/vendor/tomotopy/README.kr.rst +375 -0
  347. data/vendor/tomotopy/README.rst +382 -0
  348. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +362 -0
  349. data/vendor/tomotopy/src/Labeling/FoRelevance.h +88 -0
  350. data/vendor/tomotopy/src/Labeling/Labeler.h +50 -0
  351. data/vendor/tomotopy/src/TopicModel/CT.h +37 -0
  352. data/vendor/tomotopy/src/TopicModel/CTModel.cpp +13 -0
  353. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +293 -0
  354. data/vendor/tomotopy/src/TopicModel/DMR.h +51 -0
  355. data/vendor/tomotopy/src/TopicModel/DMRModel.cpp +13 -0
  356. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +374 -0
  357. data/vendor/tomotopy/src/TopicModel/DT.h +65 -0
  358. data/vendor/tomotopy/src/TopicModel/DTM.h +22 -0
  359. data/vendor/tomotopy/src/TopicModel/DTModel.cpp +15 -0
  360. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +572 -0
  361. data/vendor/tomotopy/src/TopicModel/GDMR.h +37 -0
  362. data/vendor/tomotopy/src/TopicModel/GDMRModel.cpp +14 -0
  363. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +485 -0
  364. data/vendor/tomotopy/src/TopicModel/HDP.h +74 -0
  365. data/vendor/tomotopy/src/TopicModel/HDPModel.cpp +13 -0
  366. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +592 -0
  367. data/vendor/tomotopy/src/TopicModel/HLDA.h +40 -0
  368. data/vendor/tomotopy/src/TopicModel/HLDAModel.cpp +13 -0
  369. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +681 -0
  370. data/vendor/tomotopy/src/TopicModel/HPA.h +27 -0
  371. data/vendor/tomotopy/src/TopicModel/HPAModel.cpp +21 -0
  372. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +588 -0
  373. data/vendor/tomotopy/src/TopicModel/LDA.h +144 -0
  374. data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +442 -0
  375. data/vendor/tomotopy/src/TopicModel/LDAModel.cpp +13 -0
  376. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +1058 -0
  377. data/vendor/tomotopy/src/TopicModel/LLDA.h +45 -0
  378. data/vendor/tomotopy/src/TopicModel/LLDAModel.cpp +13 -0
  379. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +203 -0
  380. data/vendor/tomotopy/src/TopicModel/MGLDA.h +63 -0
  381. data/vendor/tomotopy/src/TopicModel/MGLDAModel.cpp +17 -0
  382. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +558 -0
  383. data/vendor/tomotopy/src/TopicModel/PA.h +43 -0
  384. data/vendor/tomotopy/src/TopicModel/PAModel.cpp +13 -0
  385. data/vendor/tomotopy/src/TopicModel/PAModel.hpp +467 -0
  386. data/vendor/tomotopy/src/TopicModel/PLDA.h +17 -0
  387. data/vendor/tomotopy/src/TopicModel/PLDAModel.cpp +13 -0
  388. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +214 -0
  389. data/vendor/tomotopy/src/TopicModel/SLDA.h +54 -0
  390. data/vendor/tomotopy/src/TopicModel/SLDAModel.cpp +17 -0
  391. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +456 -0
  392. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +692 -0
  393. data/vendor/tomotopy/src/Utils/AliasMethod.hpp +169 -0
  394. data/vendor/tomotopy/src/Utils/Dictionary.h +80 -0
  395. data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +181 -0
  396. data/vendor/tomotopy/src/Utils/LBFGS.h +202 -0
  397. data/vendor/tomotopy/src/Utils/LBFGS/LineSearchBacktracking.h +120 -0
  398. data/vendor/tomotopy/src/Utils/LBFGS/LineSearchBracketing.h +122 -0
  399. data/vendor/tomotopy/src/Utils/LBFGS/Param.h +213 -0
  400. data/vendor/tomotopy/src/Utils/LUT.hpp +82 -0
  401. data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +69 -0
  402. data/vendor/tomotopy/src/Utils/PolyaGamma.hpp +200 -0
  403. data/vendor/tomotopy/src/Utils/PolyaGammaHybrid.hpp +672 -0
  404. data/vendor/tomotopy/src/Utils/ThreadPool.hpp +150 -0
  405. data/vendor/tomotopy/src/Utils/Trie.hpp +220 -0
  406. data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +94 -0
  407. data/vendor/tomotopy/src/Utils/Utils.hpp +337 -0
  408. data/vendor/tomotopy/src/Utils/avx_gamma.h +46 -0
  409. data/vendor/tomotopy/src/Utils/avx_mathfun.h +736 -0
  410. data/vendor/tomotopy/src/Utils/exception.h +28 -0
  411. data/vendor/tomotopy/src/Utils/math.h +281 -0
  412. data/vendor/tomotopy/src/Utils/rtnorm.hpp +2690 -0
  413. data/vendor/tomotopy/src/Utils/sample.hpp +192 -0
  414. data/vendor/tomotopy/src/Utils/serializer.hpp +695 -0
  415. data/vendor/tomotopy/src/Utils/slp.hpp +131 -0
  416. data/vendor/tomotopy/src/Utils/sse_gamma.h +48 -0
  417. data/vendor/tomotopy/src/Utils/sse_mathfun.h +710 -0
  418. data/vendor/tomotopy/src/Utils/text.hpp +49 -0
  419. data/vendor/tomotopy/src/Utils/tvector.hpp +543 -0
  420. metadata +531 -0
@@ -0,0 +1,439 @@
1
+ // This file is part of Eigen, a lightweight C++ template library
2
+ // for linear algebra.
3
+ //
4
+ // Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
5
+ //
6
+ // This Source Code Form is subject to the terms of the Mozilla
7
+ // Public License v. 2.0. If a copy of the MPL was not distributed
8
+ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
+
10
+ #ifndef EIGEN_MATH_FUNCTIONS_AVX_H
11
+ #define EIGEN_MATH_FUNCTIONS_AVX_H
12
+
13
+ /* The sin, cos, exp, and log functions of this file are loosely derived from
14
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
15
+ */
16
+
17
+ namespace Eigen {
18
+
19
+ namespace internal {
20
+
21
+ inline Packet8i pshiftleft(Packet8i v, int n)
22
+ {
23
+ #ifdef EIGEN_VECTORIZE_AVX2
24
+ return _mm256_slli_epi32(v, n);
25
+ #else
26
+ __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(v, 0), n);
27
+ __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(v, 1), n);
28
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
29
+ #endif
30
+ }
31
+
32
+ inline Packet8f pshiftright(Packet8f v, int n)
33
+ {
34
+ #ifdef EIGEN_VECTORIZE_AVX2
35
+ return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n));
36
+ #else
37
+ __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n);
38
+ __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n);
39
+ return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1));
40
+ #endif
41
+ }
42
+
43
// Sine function
// Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and
// evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. The interpolants
// are (anti-)symmetric and thus have only odd/even coefficients.
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
psin<Packet8f>(const Packet8f& _x) {
  Packet8f x = _x;

  // Some useful values.
  _EIGEN_DECLARE_CONST_Packet8i(one, 1);
  _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
  _EIGEN_DECLARE_CONST_Packet8f(two, 2.0f);
  _EIGEN_DECLARE_CONST_Packet8f(one_over_four, 0.25f);
  _EIGEN_DECLARE_CONST_Packet8f(one_over_pi, 3.183098861837907e-01f);
  // Pi is subtracted in three non-overlapping parts (note neg_pi_first is
  // exactly representable) so that shift*Pi is removed from x with extended
  // precision.
  _EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00f);
  _EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04f);
  _EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07f);
  _EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00f);

  // Map x from [-Pi/4,3*Pi/4] to z in [-1,3] and subtract the shifted period.
  Packet8f z = pmul(x, p8f_one_over_pi);
  Packet8f shift = _mm256_floor_ps(padd(z, p8f_one_over_four));
  x = pmadd(shift, p8f_neg_pi_first, x);
  x = pmadd(shift, p8f_neg_pi_second, x);
  x = pmadd(shift, p8f_neg_pi_third, x);
  z = pmul(x, p8f_four_over_pi);

  // Make a mask for the entries that need flipping, i.e. wherever the shift
  // is odd.
  Packet8i shift_ints = _mm256_cvtps_epi32(shift);
  Packet8i shift_isodd = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one)));
  // Move the odd bit into the sign-bit position (bit 31).
  Packet8i sign_flip_mask = pshiftleft(shift_isodd, 31);

  // Create a mask for which interpolant to use, i.e. if z > 1, then the mask
  // is set to ones for that entry.
  Packet8f ival_mask = _mm256_cmp_ps(z, p8f_one, _CMP_GT_OQ);

  // Evaluate the polynomial for the interval [1,3] in z.
  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_0, 9.999999724233232e-01f);
  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01f);
  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02f);
  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04f);
  // Even polynomial in (z-2), evaluated via Horner's scheme.
  Packet8f z_minus_two = psub(z, p8f_two);
  Packet8f z_minus_two2 = pmul(z_minus_two, z_minus_two);
  Packet8f right = pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4);
  right = pmadd(right, z_minus_two2, p8f_coeff_right_2);
  right = pmadd(right, z_minus_two2, p8f_coeff_right_0);

  // Evaluate the polynomial for the interval [-1,1] in z.
  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01f);
  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02f);
  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03f);
  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05f);
  // Odd polynomial: Horner in z^2, then a final multiply by z.
  Packet8f z2 = pmul(z, z);
  Packet8f left = pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5);
  left = pmadd(left, z2, p8f_coeff_left_3);
  left = pmadd(left, z2, p8f_coeff_left_1);
  left = pmul(left, z);

  // Assemble the results, i.e. select the left and right polynomials.
  left = _mm256_andnot_ps(ival_mask, left);
  right = _mm256_and_ps(ival_mask, right);
  Packet8f res = _mm256_or_ps(left, right);

  // Flip the sign on the odd intervals and return the result.
  res = _mm256_xor_ps(res, _mm256_castsi256_ps(sign_flip_mask));
  return res;
}
112
+
113
// Natural logarithm
// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
// be easily approximated by a polynomial centered on m=1 for stability.
// TODO(gonnet): Further reduce the interval allowing for lower-degree
// polynomial interpolants -> ... -> profit!
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
plog<Packet8f>(const Packet8f& _x) {
  Packet8f x = _x;
  _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
  _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
  // Exponent bias is 126 rather than 127 because the mantissa is or-ed with
  // 0.5 below, placing the significand in [0.5,1).
  _EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f);

  // Bit mask that keeps everything except the 8 exponent bits.
  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000);

  // The smallest non denormalized float number.
  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000);
  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000);

  // Polynomial coefficients (Cephes-style logf approximation).
  _EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f);

  Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ); // not greater equal is true if x is NaN
  Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);

  // Truncate input values to the minimum positive normal.
  x = pmax(x, p8f_min_norm_pos);

  // Extract the biased exponent by shifting the bits right past the 23-bit
  // mantissa, converted to float.
  Packet8f emm0 = pshiftright(x,23);
  Packet8f e = _mm256_sub_ps(emm0, p8f_126f);

  // Set the exponents to -1, i.e. x are in the range [0.5,1).
  x = _mm256_and_ps(x, p8f_inv_mant_mask);
  x = _mm256_or_ps(x, p8f_half);

  // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
  // and shift by -1. The values are then centered around 0, which improves
  // the stability of the polynomial evaluation.
  //   if( x < SQRTHF ) {
  //     e -= 1;
  //     x = x + x - 1.0;
  //   } else { x = x - 1.0; }
  Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ);
  Packet8f tmp = _mm256_and_ps(x, mask);
  x = psub(x, p8f_1);
  e = psub(e, _mm256_and_ps(p8f_1, mask));
  x = padd(x, tmp);

  Packet8f x2 = pmul(x, x);
  Packet8f x3 = pmul(x2, x);

  // Evaluate the polynomial approximant of degree 8 in three parts, probably
  // to improve instruction-level parallelism.
  Packet8f y, y1, y2;
  y  = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1);
  y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4);
  y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7);
  y  = pmadd(y, x, p8f_cephes_log_p2);
  y1 = pmadd(y1, x, p8f_cephes_log_p5);
  y2 = pmadd(y2, x, p8f_cephes_log_p8);
  y = pmadd(y, x3, y1);
  y = pmadd(y, x3, y2);
  y = pmul(y, x3);

  // Add the logarithm of the exponent back to the result of the interpolation.
  // log(2) is applied in two parts (q1 + q2) for extra precision.
  y1 = pmul(e, p8f_cephes_log_q1);
  tmp = pmul(x2, p8f_half);
  y = padd(y, y1);
  x = psub(x, tmp);
  y2 = pmul(e, p8f_cephes_log_q2);
  x = padd(x, y);
  x = padd(x, y2);

  // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
  return _mm256_or_ps(
      _mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)),
      _mm256_and_ps(iszero_mask, p8f_minus_inf));
}
203
+
204
// Exponential function. Works by writing "x = m*log(2) + r" where
// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
// "exp(x) = 2^m*exp(r)", with r in [-log(2)/2, log(2)/2] by construction.
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
pexp<Packet8f>(const Packet8f& _x) {
  _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
  _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
  // IEEE-754 single-precision exponent bias.
  _EIGEN_DECLARE_CONST_Packet8f(127, 127.0f);

  // Input clamp bounds: beyond these, float exp overflows/underflows.
  _EIGEN_DECLARE_CONST_Packet8f(exp_hi, 88.3762626647950f);
  _EIGEN_DECLARE_CONST_Packet8f(exp_lo, -88.3762626647949f);

  // 1/log(2).
  _EIGEN_DECLARE_CONST_Packet8f(cephes_LOG2EF, 1.44269504088896341f);

  // Degree-6 polynomial coefficients for exp(r) (Cephes expf).
  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p0, 1.9875691500E-4f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p1, 1.3981999507E-3f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p2, 8.3334519073E-3f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p3, 4.1665795894E-2f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p4, 1.6666665459E-1f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p5, 5.0000001201E-1f);

  // Clamp x.
  Packet8f x = pmax(pmin(_x, p8f_exp_hi), p8f_exp_lo);

  // Express exp(x) as exp(m*ln(2) + r), start by extracting
  // m = floor(x/ln(2) + 0.5).
  Packet8f m = _mm256_floor_ps(pmadd(x, p8f_cephes_LOG2EF, p8f_half));

  // Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
  // subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
  // truncation errors. Note that we don't use the "pmadd" function here to
  // ensure that a precision-preserving FMA instruction is used.
#ifdef EIGEN_VECTORIZE_FMA
  _EIGEN_DECLARE_CONST_Packet8f(nln2, -0.6931471805599453f);
  Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x);
#else
  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C1, 0.693359375f);
  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C2, -2.12194440e-4f);
  Packet8f r = psub(x, pmul(m, p8f_cephes_exp_C1));
  r = psub(r, pmul(m, p8f_cephes_exp_C2));
#endif

  Packet8f r2 = pmul(r, r);

  // TODO(gonnet): Split into odd/even polynomials and try to exploit
  // instruction-level parallelism.
  // Horner evaluation of the degree-6 approximation of exp(r).
  Packet8f y = p8f_cephes_exp_p0;
  y = pmadd(y, r, p8f_cephes_exp_p1);
  y = pmadd(y, r, p8f_cephes_exp_p2);
  y = pmadd(y, r, p8f_cephes_exp_p3);
  y = pmadd(y, r, p8f_cephes_exp_p4);
  y = pmadd(y, r, p8f_cephes_exp_p5);
  y = pmadd(y, r2, r);
  y = padd(y, p8f_1);

  // Build emm0 = 2^m by placing m+127 directly into the exponent field.
  Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127));
  emm0 = pshiftleft(emm0, 23);

  // Return 2^m * exp(r). The max with the original input propagates
  // non-finite values (e.g. +inf, NaN) through.
  return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x);
}
267
+
268
// Hyperbolic Tangent function.
// Delegates to Eigen's architecture-independent fast float tanh
// implementation (defined elsewhere in Eigen's internal headers).
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
ptanh<Packet8f>(const Packet8f& x) {
  return internal::generic_fast_tanh_float(x);
}
274
+
275
// Double-precision exponential: exp(x) = 2^n * exp(g) with
// n = floor(x/log(2) + 1/2), exp(g) approximated by a rational function
// (Cephes exp coefficients).
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
pexp<Packet4d>(const Packet4d& _x) {
  Packet4d x = _x;

  _EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
  _EIGEN_DECLARE_CONST_Packet4d(2, 2.0);
  _EIGEN_DECLARE_CONST_Packet4d(half, 0.5);

  // Clamp bounds beyond which double exp overflows/underflows.
  _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437);
  _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303);

  // 1/log(2).
  _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599);

  // Numerator polynomial coefficients of the rational approximant.
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4);
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2);
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1);

  // Denominator polynomial coefficients.
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6);
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3);
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1);
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0);

  // log(2) split in two parts (C1+C2) for extended-precision subtraction.
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125);
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6);
  // IEEE-754 double-precision exponent bias.
  _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);

  Packet4d tmp, fx;

  // clamp x
  x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo);
  // Express exp(x) as exp(g + n*log(2)).
  fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half);

  // Get the integer modulus of log(2), i.e. the "n" described above.
  fx = _mm256_floor_pd(fx);

  // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
  // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
  // digits right.
  tmp = pmul(fx, p4d_cephes_exp_C1);
  Packet4d z = pmul(fx, p4d_cephes_exp_C2);
  x = psub(x, tmp);
  x = psub(x, z);

  Packet4d x2 = pmul(x, x);

  // Evaluate the numerator polynomial of the rational interpolant.
  Packet4d px = p4d_cephes_exp_p0;
  px = pmadd(px, x2, p4d_cephes_exp_p1);
  px = pmadd(px, x2, p4d_cephes_exp_p2);
  px = pmul(px, x);

  // Evaluate the denominator polynomial of the rational interpolant.
  Packet4d qx = p4d_cephes_exp_q0;
  qx = pmadd(qx, x2, p4d_cephes_exp_q1);
  qx = pmadd(qx, x2, p4d_cephes_exp_q2);
  qx = pmadd(qx, x2, p4d_cephes_exp_q3);

  // I don't really get this bit, copied from the SSE2 routines, so...
  // TODO(gonnet): Figure out what is going on here, perhaps find a better
  // rational interpolant?
  x = _mm256_div_pd(px, psub(qx, px));
  x = pmadd(p4d_2, x, p4d_1);

  // Build e=2^n by constructing the exponents in a 128-bit vector and
  // shifting them to where they belong in double-precision values.
  __m128i emm0 = _mm256_cvtpd_epi32(fx);
  emm0 = _mm_add_epi32(emm0, p4i_1023);
  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
  __m128i lo = _mm_slli_epi64(emm0, 52);
  __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
  __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
  e = _mm256_insertf128_si256(e, hi, 1);

  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
  // non-finite values in the input.
  return pmax(pmul(x, _mm256_castsi256_pd(e)), _x);
}
354
+
355
// Functions for sqrt.
// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
// exact solution. It does not handle +inf, or denormalized numbers correctly.
// The main advantage of this approach is not just speed, but also the fact that
// it can be inlined and pipelined with other computations, further reducing its
// effective latency. This is similar to Quake3's fast inverse square root.
// For detail see here: http://www.beyond3d.com/content/articles/8/
#if EIGEN_FAST_MATH
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
psqrt<Packet8f>(const Packet8f& _x) {
  Packet8f half = pmul(_x, pset1<Packet8f>(.5f));
  // Mask the inputs in [0, FLT_MIN): rsqrt would produce inf for these, so
  // their results are flushed to zero below.
  Packet8f denormal_mask = _mm256_and_ps(
      _mm256_cmp_ps(_x, pset1<Packet8f>((std::numeric_limits<float>::min)()),
                    _CMP_LT_OQ),
      _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_GE_OQ));

  // Compute approximate reciprocal sqrt.
  Packet8f x = _mm256_rsqrt_ps(_x);
  // Do a single step of Newton's iteration.
  x = pmul(x, psub(pset1<Packet8f>(1.5f), pmul(half, pmul(x,x))));
  // Flush results for denormals to zero.
  return _mm256_andnot_ps(denormal_mask, pmul(_x,x));
}
#else
// Exact (slower) version: a single hardware sqrt instruction.
template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet8f psqrt<Packet8f>(const Packet8f& x) {
  return _mm256_sqrt_ps(x);
}
#endif
// Double precision always uses the exact hardware sqrt.
template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4d psqrt<Packet4d>(const Packet4d& x) {
  return _mm256_sqrt_pd(x);
}
390
#if EIGEN_FAST_MATH

// Fast reciprocal sqrt: hardware rsqrt approximation refined by one Newton
// step, with special-casing so that negative inputs yield NaN and
// zero/denormal inputs yield +inf.
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000);
  _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
  _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
  // Bit pattern of the smallest positive normal float (FLT_MIN).
  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);

  Packet8f neg_half = pmul(_x, p8f_minus_half);

  // select only the inverse sqrt of positive normal inputs (denormals are
  // flushed to zero and cause infs as well).
  Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
  Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x));

  // Fill in NaNs and Infs for the negative/zero entries.
  Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ);
  Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask);
  Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan),
                                        _mm256_and_ps(zero_mask, p8f_inf));

  // Do a single step of Newton's iteration.
  x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five));

  // Insert NaNs and Infs in all the right places.
  return _mm256_or_ps(x, infs_and_nans);
}

#else
// Exact version: 1 / sqrt(x) via division.
template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet8f prsqrt<Packet8f>(const Packet8f& x) {
  _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
  return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(x));
}
#endif

// Double precision always uses the exact division-based form.
template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4d prsqrt<Packet4d>(const Packet4d& x) {
  _EIGEN_DECLARE_CONST_Packet4d(one, 1.0);
  return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(x));
}
433
+
434
+
435
+ } // end namespace internal
436
+
437
+ } // end namespace Eigen
438
+
439
+ #endif // EIGEN_MATH_FUNCTIONS_AVX_H
@@ -0,0 +1,637 @@
1
+ // This file is part of Eigen, a lightweight C++ template library
2
+ // for linear algebra.
3
+ //
4
+ // Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
5
+ //
6
+ // This Source Code Form is subject to the terms of the Mozilla
7
+ // Public License v. 2.0. If a copy of the MPL was not distributed
8
+ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
+
10
+ #ifndef EIGEN_PACKET_MATH_AVX_H
11
+ #define EIGEN_PACKET_MATH_AVX_H
12
+
13
+ namespace Eigen {
14
+
15
+ namespace internal {
16
+
17
// Tuning / capability macros and AVX packet type aliases used throughout
// this header.

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif

// __FMA__ means a fused multiply-add instruction is available, so Eigen can
// treat a*b+c as a single instruction (see the pmadd overloads below).
#ifdef __FMA__
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#endif

// One 256-bit AVX register holds 8 floats, 8 ints, or 4 doubles.
typedef __m256  Packet8f;
typedef __m256i Packet8i;
typedef __m256d Packet4d;

template<> struct is_arithmetic<__m256>  { enum { value = true }; };
template<> struct is_arithmetic<__m256i> { enum { value = true }; };
template<> struct is_arithmetic<__m256d> { enum { value = true }; };

// Declare a local packet constant named p8f_NAME / p4d_NAME / p8i_NAME with
// the scalar X broadcast to every lane.
#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \
  const Packet8f p8f_##NAME = pset1<Packet8f>(X)

#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \
  const Packet4d p4d_##NAME = pset1<Packet4d>(X)

// Bit-pattern variant: the integer X is reinterpreted as the float lanes'
// raw bits (useful for masks such as sign or exponent patterns).
#define _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(NAME,X) \
  const Packet8f p8f_##NAME = _mm256_castsi256_ps(pset1<Packet8i>(X))

#define _EIGEN_DECLARE_CONST_Packet8i(NAME,X) \
  const Packet8i p8i_##NAME = pset1<Packet8i>(X)
50
+
51
// Use the packet_traits defined in AVX512/PacketMath.h instead if we're going
// to leverage AVX512 instructions.
#ifndef EIGEN_VECTORIZE_AVX512
// Capabilities of the AVX float packet: 8 lanes, with the SSE Packet4f as
// its half packet. The Has* flags advertise which p* primitives this file
// implements for float.
template<> struct packet_traits<float> : default_packet_traits
{
  typedef Packet8f type;
  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=8,
    HasHalfPacket = 1,

    HasDiv = 1,
    HasSin = EIGEN_FAST_MATH,   // psin only advertised in fast-math mode
    HasCos = 0,
    HasLog = 1,
    HasExp = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasTanh = EIGEN_FAST_MATH,
    HasBlend = 1,
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1
  };
};
// Capabilities of the AVX double packet: 4 lanes, half packet = Packet2d.
// Note: no vectorized sin/log/tanh for double in this header.
template<> struct packet_traits<double> : default_packet_traits
{
  typedef Packet4d type;
  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=4,
    HasHalfPacket = 1,

    HasDiv = 1,
    HasExp = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasBlend = 1,
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1
  };
};
#endif
99
+
100
// Relative cost estimates for vectorized scalar division on AVX, used by
// Eigen's cost model (the bool parameter selects the vectorized variant).
template<> struct scalar_div_cost<float,true> { enum { value = 14 }; };
template<> struct scalar_div_cost<double,true> { enum { value = 16 }; };
102
+
103
+ /* Proper support for integers is only provided by AVX2. In the meantime, we'll
104
+ use SSE instructions and packets to deal with integers.
105
+ template<> struct packet_traits<int> : default_packet_traits
106
+ {
107
+ typedef Packet8i type;
108
+ enum {
109
+ Vectorizable = 1,
110
+ AlignedOnScalar = 1,
111
+ size=8
112
+ };
113
+ };
114
+ */
115
+
116
// Reverse mapping from packet type back to scalar type, half packet, lane
// count, and required alignment (32 bytes for full AVX registers).
template<> struct unpacket_traits<Packet8f> { typedef float  type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; };
template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; };
template<> struct unpacket_traits<Packet8i> { typedef int    type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; };
119
+
120
// Broadcast a scalar value to all lanes of a packet.
template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float&  from) { return _mm256_set1_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int&    from) { return _mm256_set1_epi32(from); }

// Broadcast a scalar loaded directly from memory to all lanes.
template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float*  from) { return _mm256_broadcast_ss(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }

// Linearly spaced packet: {a, a+1, a+2, ...} across the lanes.
template<> EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float&  a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
129
+
130
// Lane-wise addition.
template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }

// Lane-wise subtraction.
template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }

// Lane-wise negation, implemented as 0 - a.
template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
{
  return _mm256_sub_ps(_mm256_set1_ps(0.0),a);
}
template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
{
  return _mm256_sub_pd(_mm256_set1_pd(0.0),a);
}

// Complex conjugate: identity for real-valued packets.
template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }

// Lane-wise multiplication.
template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }
151
+
152
+
153
// Lane-wise division for float/double; integer division has no AVX
// instruction and is explicitly unsupported (asserts in debug builds).
template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/)
{ eigen_assert(false && "packet integer division are not supported by AVX");
  return pset1<Packet8i>(0);
}
159
+
160
#ifdef __FMA__
// Fused multiply-add: returns a*b+c in a single instruction (no intermediate
// rounding of a*b).
template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
  // Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
  // and even register spilling with clang>=6.0 (bug 1637).
  // Gcc stupidly generates a vfmadd132ps instruction.
  // So let's enforce it to generate a vfmadd231ps instruction since the most common use
  // case is to accumulate the result of the product.
  Packet8f res = c;
  __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
#else
  return _mm256_fmadd_ps(a,b,c);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
  // see above
  Packet4d res = c;
  __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
#else
  return _mm256_fmadd_pd(a,b,c);
#endif
}
#endif
186
+
187
// Lane-wise minimum and maximum.
template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }

// Round using the CPU's current rounding mode (typically round-to-nearest-even).
template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }

// Lane-wise ceil / floor.
template<> EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) { return _mm256_ceil_ps(a); }
template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { return _mm256_ceil_pd(a); }

template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
201
+
202
// Bitwise operations on the raw lane bits (used for masks, sign flips, etc.).
template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }

// pandnot(a,b) computes a & ~b — note _mm256_andnot negates its FIRST
// argument, hence the swapped operand order relative to the intrinsic.
template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }
213
+
214
// Aligned loads: "from" must be 32-byte aligned.
template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int*    from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }

// Unaligned loads: no alignment requirement on "from".
template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float*  from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int*    from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }
221
+
222
// Loads 4 floats from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}.
template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
{
  // TODO try to find a way to avoid the need of a temporary register
//   Packet8f tmp  = _mm256_castps128_ps256(_mm_loadu_ps(from));
//   tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
//   return _mm256_unpacklo_ps(tmp,tmp);

  // _mm256_insertf128_ps is very slow on Haswell, thus:
  Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
  // mimic an "inplace" permutation of the lower 128bits using a blend
  tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
  // then we can perform a consistent permutation on the global register to get everything in shape:
  return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
}
// Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1}.
template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
{
  // Broadcast both doubles into each 128-bit half, then duplicate lane 0 in
  // the low half and lane 1 in the high half (permute mask 0b1100 = 3<<2).
  Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
  return _mm256_permute_pd(tmp, 3<<2);
}
243
+
244
// Loads 2 floats from memory and returns the packet {a0, a0, a0, a0, a1, a1, a1, a1}:
// each scalar is broadcast into one 128-bit half.
template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
{
  Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
  return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);
}
250
+
251
+ template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
252
+ template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
253
+ template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
254
+
255
+ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
256
+ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
257
+ template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
258
+
259
+ // NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
260
+ // NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
261
+ template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride)
262
+ {
263
+ return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
264
+ from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
265
+ }
266
+ template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride)
267
+ {
268
+ return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
269
+ }
270
+
271
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride)
272
+ {
273
+ __m128 low = _mm256_extractf128_ps(from, 0);
274
+ to[stride*0] = _mm_cvtss_f32(low);
275
+ to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
276
+ to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
277
+ to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));
278
+
279
+ __m128 high = _mm256_extractf128_ps(from, 1);
280
+ to[stride*4] = _mm_cvtss_f32(high);
281
+ to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
282
+ to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
283
+ to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
284
+ }
285
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride)
286
+ {
287
+ __m128d low = _mm256_extractf128_pd(from, 0);
288
+ to[stride*0] = _mm_cvtsd_f64(low);
289
+ to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
290
+ __m128d high = _mm256_extractf128_pd(from, 1);
291
+ to[stride*2] = _mm_cvtsd_f64(high);
292
+ to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
293
+ }
294
+
295
+ template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
296
+ {
297
+ Packet8f pa = pset1<Packet8f>(a);
298
+ pstore(to, pa);
299
+ }
300
+ template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a)
301
+ {
302
+ Packet4d pa = pset1<Packet4d>(a);
303
+ pstore(to, pa);
304
+ }
305
+ template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
306
+ {
307
+ Packet8i pa = pset1<Packet8i>(a);
308
+ pstore(to, pa);
309
+ }
310
+
311
+ #ifndef EIGEN_VECTORIZE_AVX512
312
+ template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
313
+ template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
314
+ template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
315
+ #endif
316
+
317
+ template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
318
+ return _mm_cvtss_f32(_mm256_castps256_ps128(a));
319
+ }
320
+ template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
321
+ return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
322
+ }
323
+ template<> EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) {
324
+ return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
325
+ }
326
+
327
+
328
+ template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
329
+ {
330
+ __m256 tmp = _mm256_shuffle_ps(a,a,0x1b);
331
+ return _mm256_permute2f128_ps(tmp, tmp, 1);
332
+ }
333
+ template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
334
+ {
335
+ __m256d tmp = _mm256_shuffle_pd(a,a,5);
336
+ return _mm256_permute2f128_pd(tmp, tmp, 1);
337
+ #if 0
338
+ // This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd
339
+ // exhibit the same latency/throughput, but it is here for future reference/benchmarking...
340
+ __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
341
+ return _mm256_permute_pd(swap_halves,5);
342
+ #endif
343
+ }
344
+
345
+ // pabs should be ok
346
+ template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
347
+ {
348
+ const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
349
+ return _mm256_and_ps(a,mask);
350
+ }
351
+ template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
352
+ {
353
+ const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
354
+ return _mm256_and_pd(a,mask);
355
+ }
356
+
357
+ // preduxp should be ok
358
+ // FIXME: why is this ok? why isn't the simply implementation working as expected?
359
+ template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
360
+ {
361
+ __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]);
362
+ __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]);
363
+ __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]);
364
+ __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]);
365
+
366
+ __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
367
+ __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
368
+ __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
369
+ __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);
370
+
371
+ __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
372
+ __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
373
+ __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
374
+ __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
375
+
376
+ __m256 sum1 = _mm256_add_ps(perm1, hsum5);
377
+ __m256 sum2 = _mm256_add_ps(perm2, hsum6);
378
+ __m256 sum3 = _mm256_add_ps(perm3, hsum7);
379
+ __m256 sum4 = _mm256_add_ps(perm4, hsum8);
380
+
381
+ __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
382
+ __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
383
+
384
+ __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
385
+ return final;
386
+ }
387
+ template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)
388
+ {
389
+ Packet4d tmp0, tmp1;
390
+
391
+ tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]);
392
+ tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
393
+
394
+ tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]);
395
+ tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
396
+
397
+ return _mm256_blend_pd(tmp0, tmp1, 0xC);
398
+ }
399
+
400
+ template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
401
+ {
402
+ return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1))));
403
+ }
404
+ template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
405
+ {
406
+ return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
407
+ }
408
+
409
+ template<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a)
410
+ {
411
+ return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
412
+ }
413
+
414
+ template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
415
+ {
416
+ Packet8f tmp;
417
+ tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1));
418
+ tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
419
+ return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
420
+ }
421
+ template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a)
422
+ {
423
+ Packet4d tmp;
424
+ tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1));
425
+ return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1)));
426
+ }
427
+
428
+ template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a)
429
+ {
430
+ Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1));
431
+ tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
432
+ return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
433
+ }
434
+ template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a)
435
+ {
436
+ Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1));
437
+ return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
438
+ }
439
+
440
+ template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a)
441
+ {
442
+ Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1));
443
+ tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
444
+ return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
445
+ }
446
+
447
+ template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
448
+ {
449
+ Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1));
450
+ return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
451
+ }
452
+
453
+
454
+ template<int Offset>
455
+ struct palign_impl<Offset,Packet8f>
456
+ {
457
+ static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second)
458
+ {
459
+ if (Offset==1)
460
+ {
461
+ first = _mm256_blend_ps(first, second, 1);
462
+ Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
463
+ Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
464
+ first = _mm256_blend_ps(tmp1, tmp2, 0x88);
465
+ }
466
+ else if (Offset==2)
467
+ {
468
+ first = _mm256_blend_ps(first, second, 3);
469
+ Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
470
+ Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
471
+ first = _mm256_blend_ps(tmp1, tmp2, 0xcc);
472
+ }
473
+ else if (Offset==3)
474
+ {
475
+ first = _mm256_blend_ps(first, second, 7);
476
+ Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
477
+ Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
478
+ first = _mm256_blend_ps(tmp1, tmp2, 0xee);
479
+ }
480
+ else if (Offset==4)
481
+ {
482
+ first = _mm256_blend_ps(first, second, 15);
483
+ Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
484
+ Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
485
+ first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0));
486
+ }
487
+ else if (Offset==5)
488
+ {
489
+ first = _mm256_blend_ps(first, second, 31);
490
+ first = _mm256_permute2f128_ps(first, first, 1);
491
+ Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
492
+ first = _mm256_permute2f128_ps(tmp, tmp, 1);
493
+ first = _mm256_blend_ps(tmp, first, 0x88);
494
+ }
495
+ else if (Offset==6)
496
+ {
497
+ first = _mm256_blend_ps(first, second, 63);
498
+ first = _mm256_permute2f128_ps(first, first, 1);
499
+ Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
500
+ first = _mm256_permute2f128_ps(tmp, tmp, 1);
501
+ first = _mm256_blend_ps(tmp, first, 0xcc);
502
+ }
503
+ else if (Offset==7)
504
+ {
505
+ first = _mm256_blend_ps(first, second, 127);
506
+ first = _mm256_permute2f128_ps(first, first, 1);
507
+ Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
508
+ first = _mm256_permute2f128_ps(tmp, tmp, 1);
509
+ first = _mm256_blend_ps(tmp, first, 0xee);
510
+ }
511
+ }
512
+ };
513
+
514
+ template<int Offset>
515
+ struct palign_impl<Offset,Packet4d>
516
+ {
517
+ static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second)
518
+ {
519
+ if (Offset==1)
520
+ {
521
+ first = _mm256_blend_pd(first, second, 1);
522
+ __m256d tmp = _mm256_permute_pd(first, 5);
523
+ first = _mm256_permute2f128_pd(tmp, tmp, 1);
524
+ first = _mm256_blend_pd(tmp, first, 0xA);
525
+ }
526
+ else if (Offset==2)
527
+ {
528
+ first = _mm256_blend_pd(first, second, 3);
529
+ first = _mm256_permute2f128_pd(first, first, 1);
530
+ }
531
+ else if (Offset==3)
532
+ {
533
+ first = _mm256_blend_pd(first, second, 7);
534
+ __m256d tmp = _mm256_permute_pd(first, 5);
535
+ first = _mm256_permute2f128_pd(tmp, tmp, 1);
536
+ first = _mm256_blend_pd(tmp, first, 5);
537
+ }
538
+ }
539
+ };
540
+
541
+ EIGEN_DEVICE_FUNC inline void
542
+ ptranspose(PacketBlock<Packet8f,8>& kernel) {
543
+ __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
544
+ __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
545
+ __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
546
+ __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
547
+ __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
548
+ __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
549
+ __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
550
+ __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
551
+ __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
552
+ __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
553
+ __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
554
+ __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
555
+ __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0));
556
+ __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2));
557
+ __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0));
558
+ __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2));
559
+ kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20);
560
+ kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20);
561
+ kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20);
562
+ kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20);
563
+ kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31);
564
+ kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31);
565
+ kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31);
566
+ kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
567
+ }
568
+
569
+ EIGEN_DEVICE_FUNC inline void
570
+ ptranspose(PacketBlock<Packet8f,4>& kernel) {
571
+ __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
572
+ __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
573
+ __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
574
+ __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
575
+
576
+ __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
577
+ __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
578
+ __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
579
+ __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
580
+
581
+ kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20);
582
+ kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20);
583
+ kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31);
584
+ kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31);
585
+ }
586
+
587
+ EIGEN_DEVICE_FUNC inline void
588
+ ptranspose(PacketBlock<Packet4d,4>& kernel) {
589
+ __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
590
+ __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
591
+ __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);
592
+ __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
593
+
594
+ kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32);
595
+ kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49);
596
+ kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32);
597
+ kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
598
+ }
599
+
600
+ template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) {
601
+ const __m256 zero = _mm256_setzero_ps();
602
+ const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
603
+ __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ);
604
+ return _mm256_blendv_ps(thenPacket, elsePacket, false_mask);
605
+ }
606
+ template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) {
607
+ const __m256d zero = _mm256_setzero_pd();
608
+ const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
609
+ __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ);
610
+ return _mm256_blendv_pd(thenPacket, elsePacket, false_mask);
611
+ }
612
+
613
+ template<> EIGEN_STRONG_INLINE Packet8f pinsertfirst(const Packet8f& a, float b)
614
+ {
615
+ return _mm256_blend_ps(a,pset1<Packet8f>(b),1);
616
+ }
617
+
618
+ template<> EIGEN_STRONG_INLINE Packet4d pinsertfirst(const Packet4d& a, double b)
619
+ {
620
+ return _mm256_blend_pd(a,pset1<Packet4d>(b),1);
621
+ }
622
+
623
+ template<> EIGEN_STRONG_INLINE Packet8f pinsertlast(const Packet8f& a, float b)
624
+ {
625
+ return _mm256_blend_ps(a,pset1<Packet8f>(b),(1<<7));
626
+ }
627
+
628
+ template<> EIGEN_STRONG_INLINE Packet4d pinsertlast(const Packet4d& a, double b)
629
+ {
630
+ return _mm256_blend_pd(a,pset1<Packet4d>(b),(1<<3));
631
+ }
632
+
633
+ } // end namespace internal
634
+
635
+ } // end namespace Eigen
636
+
637
+ #endif // EIGEN_PACKET_MATH_AVX_H