tomoto 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (420) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +123 -0
  5. data/ext/tomoto/ext.cpp +245 -0
  6. data/ext/tomoto/extconf.rb +28 -0
  7. data/lib/tomoto.rb +12 -0
  8. data/lib/tomoto/ct.rb +11 -0
  9. data/lib/tomoto/hdp.rb +11 -0
  10. data/lib/tomoto/lda.rb +67 -0
  11. data/lib/tomoto/version.rb +3 -0
  12. data/vendor/EigenRand/EigenRand/Core.h +1139 -0
  13. data/vendor/EigenRand/EigenRand/Dists/Basic.h +111 -0
  14. data/vendor/EigenRand/EigenRand/Dists/Discrete.h +877 -0
  15. data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +108 -0
  16. data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +626 -0
  17. data/vendor/EigenRand/EigenRand/EigenRand +19 -0
  18. data/vendor/EigenRand/EigenRand/Macro.h +24 -0
  19. data/vendor/EigenRand/EigenRand/MorePacketMath.h +978 -0
  20. data/vendor/EigenRand/EigenRand/PacketFilter.h +286 -0
  21. data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +624 -0
  22. data/vendor/EigenRand/EigenRand/RandUtils.h +413 -0
  23. data/vendor/EigenRand/EigenRand/doc.h +220 -0
  24. data/vendor/EigenRand/LICENSE +21 -0
  25. data/vendor/EigenRand/README.md +288 -0
  26. data/vendor/eigen/COPYING.BSD +26 -0
  27. data/vendor/eigen/COPYING.GPL +674 -0
  28. data/vendor/eigen/COPYING.LGPL +502 -0
  29. data/vendor/eigen/COPYING.MINPACK +52 -0
  30. data/vendor/eigen/COPYING.MPL2 +373 -0
  31. data/vendor/eigen/COPYING.README +18 -0
  32. data/vendor/eigen/Eigen/CMakeLists.txt +19 -0
  33. data/vendor/eigen/Eigen/Cholesky +46 -0
  34. data/vendor/eigen/Eigen/CholmodSupport +48 -0
  35. data/vendor/eigen/Eigen/Core +537 -0
  36. data/vendor/eigen/Eigen/Dense +7 -0
  37. data/vendor/eigen/Eigen/Eigen +2 -0
  38. data/vendor/eigen/Eigen/Eigenvalues +61 -0
  39. data/vendor/eigen/Eigen/Geometry +62 -0
  40. data/vendor/eigen/Eigen/Householder +30 -0
  41. data/vendor/eigen/Eigen/IterativeLinearSolvers +48 -0
  42. data/vendor/eigen/Eigen/Jacobi +33 -0
  43. data/vendor/eigen/Eigen/LU +50 -0
  44. data/vendor/eigen/Eigen/MetisSupport +35 -0
  45. data/vendor/eigen/Eigen/OrderingMethods +73 -0
  46. data/vendor/eigen/Eigen/PaStiXSupport +48 -0
  47. data/vendor/eigen/Eigen/PardisoSupport +35 -0
  48. data/vendor/eigen/Eigen/QR +51 -0
  49. data/vendor/eigen/Eigen/QtAlignedMalloc +40 -0
  50. data/vendor/eigen/Eigen/SPQRSupport +34 -0
  51. data/vendor/eigen/Eigen/SVD +51 -0
  52. data/vendor/eigen/Eigen/Sparse +36 -0
  53. data/vendor/eigen/Eigen/SparseCholesky +45 -0
  54. data/vendor/eigen/Eigen/SparseCore +69 -0
  55. data/vendor/eigen/Eigen/SparseLU +46 -0
  56. data/vendor/eigen/Eigen/SparseQR +37 -0
  57. data/vendor/eigen/Eigen/StdDeque +27 -0
  58. data/vendor/eigen/Eigen/StdList +26 -0
  59. data/vendor/eigen/Eigen/StdVector +27 -0
  60. data/vendor/eigen/Eigen/SuperLUSupport +64 -0
  61. data/vendor/eigen/Eigen/UmfPackSupport +40 -0
  62. data/vendor/eigen/Eigen/src/Cholesky/LDLT.h +673 -0
  63. data/vendor/eigen/Eigen/src/Cholesky/LLT.h +542 -0
  64. data/vendor/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +99 -0
  65. data/vendor/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +639 -0
  66. data/vendor/eigen/Eigen/src/Core/Array.h +329 -0
  67. data/vendor/eigen/Eigen/src/Core/ArrayBase.h +226 -0
  68. data/vendor/eigen/Eigen/src/Core/ArrayWrapper.h +209 -0
  69. data/vendor/eigen/Eigen/src/Core/Assign.h +90 -0
  70. data/vendor/eigen/Eigen/src/Core/AssignEvaluator.h +935 -0
  71. data/vendor/eigen/Eigen/src/Core/Assign_MKL.h +178 -0
  72. data/vendor/eigen/Eigen/src/Core/BandMatrix.h +353 -0
  73. data/vendor/eigen/Eigen/src/Core/Block.h +452 -0
  74. data/vendor/eigen/Eigen/src/Core/BooleanRedux.h +164 -0
  75. data/vendor/eigen/Eigen/src/Core/CommaInitializer.h +160 -0
  76. data/vendor/eigen/Eigen/src/Core/ConditionEstimator.h +175 -0
  77. data/vendor/eigen/Eigen/src/Core/CoreEvaluators.h +1688 -0
  78. data/vendor/eigen/Eigen/src/Core/CoreIterators.h +127 -0
  79. data/vendor/eigen/Eigen/src/Core/CwiseBinaryOp.h +184 -0
  80. data/vendor/eigen/Eigen/src/Core/CwiseNullaryOp.h +866 -0
  81. data/vendor/eigen/Eigen/src/Core/CwiseTernaryOp.h +197 -0
  82. data/vendor/eigen/Eigen/src/Core/CwiseUnaryOp.h +103 -0
  83. data/vendor/eigen/Eigen/src/Core/CwiseUnaryView.h +128 -0
  84. data/vendor/eigen/Eigen/src/Core/DenseBase.h +611 -0
  85. data/vendor/eigen/Eigen/src/Core/DenseCoeffsBase.h +681 -0
  86. data/vendor/eigen/Eigen/src/Core/DenseStorage.h +570 -0
  87. data/vendor/eigen/Eigen/src/Core/Diagonal.h +260 -0
  88. data/vendor/eigen/Eigen/src/Core/DiagonalMatrix.h +343 -0
  89. data/vendor/eigen/Eigen/src/Core/DiagonalProduct.h +28 -0
  90. data/vendor/eigen/Eigen/src/Core/Dot.h +318 -0
  91. data/vendor/eigen/Eigen/src/Core/EigenBase.h +159 -0
  92. data/vendor/eigen/Eigen/src/Core/ForceAlignedAccess.h +146 -0
  93. data/vendor/eigen/Eigen/src/Core/Fuzzy.h +155 -0
  94. data/vendor/eigen/Eigen/src/Core/GeneralProduct.h +455 -0
  95. data/vendor/eigen/Eigen/src/Core/GenericPacketMath.h +593 -0
  96. data/vendor/eigen/Eigen/src/Core/GlobalFunctions.h +187 -0
  97. data/vendor/eigen/Eigen/src/Core/IO.h +225 -0
  98. data/vendor/eigen/Eigen/src/Core/Inverse.h +118 -0
  99. data/vendor/eigen/Eigen/src/Core/Map.h +171 -0
  100. data/vendor/eigen/Eigen/src/Core/MapBase.h +303 -0
  101. data/vendor/eigen/Eigen/src/Core/MathFunctions.h +1415 -0
  102. data/vendor/eigen/Eigen/src/Core/MathFunctionsImpl.h +101 -0
  103. data/vendor/eigen/Eigen/src/Core/Matrix.h +459 -0
  104. data/vendor/eigen/Eigen/src/Core/MatrixBase.h +529 -0
  105. data/vendor/eigen/Eigen/src/Core/NestByValue.h +110 -0
  106. data/vendor/eigen/Eigen/src/Core/NoAlias.h +108 -0
  107. data/vendor/eigen/Eigen/src/Core/NumTraits.h +248 -0
  108. data/vendor/eigen/Eigen/src/Core/PermutationMatrix.h +633 -0
  109. data/vendor/eigen/Eigen/src/Core/PlainObjectBase.h +1035 -0
  110. data/vendor/eigen/Eigen/src/Core/Product.h +186 -0
  111. data/vendor/eigen/Eigen/src/Core/ProductEvaluators.h +1112 -0
  112. data/vendor/eigen/Eigen/src/Core/Random.h +182 -0
  113. data/vendor/eigen/Eigen/src/Core/Redux.h +505 -0
  114. data/vendor/eigen/Eigen/src/Core/Ref.h +283 -0
  115. data/vendor/eigen/Eigen/src/Core/Replicate.h +142 -0
  116. data/vendor/eigen/Eigen/src/Core/ReturnByValue.h +117 -0
  117. data/vendor/eigen/Eigen/src/Core/Reverse.h +211 -0
  118. data/vendor/eigen/Eigen/src/Core/Select.h +162 -0
  119. data/vendor/eigen/Eigen/src/Core/SelfAdjointView.h +352 -0
  120. data/vendor/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +47 -0
  121. data/vendor/eigen/Eigen/src/Core/Solve.h +188 -0
  122. data/vendor/eigen/Eigen/src/Core/SolveTriangular.h +235 -0
  123. data/vendor/eigen/Eigen/src/Core/SolverBase.h +130 -0
  124. data/vendor/eigen/Eigen/src/Core/StableNorm.h +221 -0
  125. data/vendor/eigen/Eigen/src/Core/Stride.h +111 -0
  126. data/vendor/eigen/Eigen/src/Core/Swap.h +67 -0
  127. data/vendor/eigen/Eigen/src/Core/Transpose.h +403 -0
  128. data/vendor/eigen/Eigen/src/Core/Transpositions.h +407 -0
  129. data/vendor/eigen/Eigen/src/Core/TriangularMatrix.h +983 -0
  130. data/vendor/eigen/Eigen/src/Core/VectorBlock.h +96 -0
  131. data/vendor/eigen/Eigen/src/Core/VectorwiseOp.h +695 -0
  132. data/vendor/eigen/Eigen/src/Core/Visitor.h +273 -0
  133. data/vendor/eigen/Eigen/src/Core/arch/AVX/Complex.h +451 -0
  134. data/vendor/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +439 -0
  135. data/vendor/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +637 -0
  136. data/vendor/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +51 -0
  137. data/vendor/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +391 -0
  138. data/vendor/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1316 -0
  139. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +430 -0
  140. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +322 -0
  141. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +1061 -0
  142. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Complex.h +103 -0
  143. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Half.h +674 -0
  144. data/vendor/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +91 -0
  145. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +333 -0
  146. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +1124 -0
  147. data/vendor/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +212 -0
  148. data/vendor/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +29 -0
  149. data/vendor/eigen/Eigen/src/Core/arch/Default/Settings.h +49 -0
  150. data/vendor/eigen/Eigen/src/Core/arch/NEON/Complex.h +490 -0
  151. data/vendor/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +91 -0
  152. data/vendor/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +760 -0
  153. data/vendor/eigen/Eigen/src/Core/arch/SSE/Complex.h +471 -0
  154. data/vendor/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +562 -0
  155. data/vendor/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +895 -0
  156. data/vendor/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +77 -0
  157. data/vendor/eigen/Eigen/src/Core/arch/ZVector/Complex.h +397 -0
  158. data/vendor/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +137 -0
  159. data/vendor/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +945 -0
  160. data/vendor/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +168 -0
  161. data/vendor/eigen/Eigen/src/Core/functors/BinaryFunctors.h +475 -0
  162. data/vendor/eigen/Eigen/src/Core/functors/NullaryFunctors.h +188 -0
  163. data/vendor/eigen/Eigen/src/Core/functors/StlFunctors.h +136 -0
  164. data/vendor/eigen/Eigen/src/Core/functors/TernaryFunctors.h +25 -0
  165. data/vendor/eigen/Eigen/src/Core/functors/UnaryFunctors.h +792 -0
  166. data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2156 -0
  167. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +492 -0
  168. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +311 -0
  169. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +145 -0
  170. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +122 -0
  171. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +619 -0
  172. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +136 -0
  173. data/vendor/eigen/Eigen/src/Core/products/Parallelizer.h +163 -0
  174. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +521 -0
  175. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +287 -0
  176. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +260 -0
  177. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +118 -0
  178. data/vendor/eigen/Eigen/src/Core/products/SelfadjointProduct.h +133 -0
  179. data/vendor/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +93 -0
  180. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +466 -0
  181. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +315 -0
  182. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +350 -0
  183. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +255 -0
  184. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +335 -0
  185. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +163 -0
  186. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverVector.h +145 -0
  187. data/vendor/eigen/Eigen/src/Core/util/BlasUtil.h +398 -0
  188. data/vendor/eigen/Eigen/src/Core/util/Constants.h +547 -0
  189. data/vendor/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +83 -0
  190. data/vendor/eigen/Eigen/src/Core/util/ForwardDeclarations.h +302 -0
  191. data/vendor/eigen/Eigen/src/Core/util/MKL_support.h +130 -0
  192. data/vendor/eigen/Eigen/src/Core/util/Macros.h +1001 -0
  193. data/vendor/eigen/Eigen/src/Core/util/Memory.h +993 -0
  194. data/vendor/eigen/Eigen/src/Core/util/Meta.h +534 -0
  195. data/vendor/eigen/Eigen/src/Core/util/NonMPL2.h +3 -0
  196. data/vendor/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +27 -0
  197. data/vendor/eigen/Eigen/src/Core/util/StaticAssert.h +218 -0
  198. data/vendor/eigen/Eigen/src/Core/util/XprHelper.h +821 -0
  199. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +346 -0
  200. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +459 -0
  201. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +91 -0
  202. data/vendor/eigen/Eigen/src/Eigenvalues/EigenSolver.h +622 -0
  203. data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +418 -0
  204. data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +226 -0
  205. data/vendor/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +374 -0
  206. data/vendor/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +158 -0
  207. data/vendor/eigen/Eigen/src/Eigenvalues/RealQZ.h +654 -0
  208. data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur.h +546 -0
  209. data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +77 -0
  210. data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +870 -0
  211. data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +87 -0
  212. data/vendor/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +556 -0
  213. data/vendor/eigen/Eigen/src/Geometry/AlignedBox.h +392 -0
  214. data/vendor/eigen/Eigen/src/Geometry/AngleAxis.h +247 -0
  215. data/vendor/eigen/Eigen/src/Geometry/EulerAngles.h +114 -0
  216. data/vendor/eigen/Eigen/src/Geometry/Homogeneous.h +497 -0
  217. data/vendor/eigen/Eigen/src/Geometry/Hyperplane.h +282 -0
  218. data/vendor/eigen/Eigen/src/Geometry/OrthoMethods.h +234 -0
  219. data/vendor/eigen/Eigen/src/Geometry/ParametrizedLine.h +195 -0
  220. data/vendor/eigen/Eigen/src/Geometry/Quaternion.h +814 -0
  221. data/vendor/eigen/Eigen/src/Geometry/Rotation2D.h +199 -0
  222. data/vendor/eigen/Eigen/src/Geometry/RotationBase.h +206 -0
  223. data/vendor/eigen/Eigen/src/Geometry/Scaling.h +170 -0
  224. data/vendor/eigen/Eigen/src/Geometry/Transform.h +1542 -0
  225. data/vendor/eigen/Eigen/src/Geometry/Translation.h +208 -0
  226. data/vendor/eigen/Eigen/src/Geometry/Umeyama.h +166 -0
  227. data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +161 -0
  228. data/vendor/eigen/Eigen/src/Householder/BlockHouseholder.h +103 -0
  229. data/vendor/eigen/Eigen/src/Householder/Householder.h +172 -0
  230. data/vendor/eigen/Eigen/src/Householder/HouseholderSequence.h +470 -0
  231. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +226 -0
  232. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +228 -0
  233. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +246 -0
  234. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +400 -0
  235. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +462 -0
  236. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +394 -0
  237. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +216 -0
  238. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +115 -0
  239. data/vendor/eigen/Eigen/src/Jacobi/Jacobi.h +462 -0
  240. data/vendor/eigen/Eigen/src/LU/Determinant.h +101 -0
  241. data/vendor/eigen/Eigen/src/LU/FullPivLU.h +891 -0
  242. data/vendor/eigen/Eigen/src/LU/InverseImpl.h +415 -0
  243. data/vendor/eigen/Eigen/src/LU/PartialPivLU.h +611 -0
  244. data/vendor/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +83 -0
  245. data/vendor/eigen/Eigen/src/LU/arch/Inverse_SSE.h +338 -0
  246. data/vendor/eigen/Eigen/src/MetisSupport/MetisSupport.h +137 -0
  247. data/vendor/eigen/Eigen/src/OrderingMethods/Amd.h +445 -0
  248. data/vendor/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +1843 -0
  249. data/vendor/eigen/Eigen/src/OrderingMethods/Ordering.h +157 -0
  250. data/vendor/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +678 -0
  251. data/vendor/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +543 -0
  252. data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR.h +653 -0
  253. data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +97 -0
  254. data/vendor/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +562 -0
  255. data/vendor/eigen/Eigen/src/QR/FullPivHouseholderQR.h +676 -0
  256. data/vendor/eigen/Eigen/src/QR/HouseholderQR.h +409 -0
  257. data/vendor/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +68 -0
  258. data/vendor/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +313 -0
  259. data/vendor/eigen/Eigen/src/SVD/BDCSVD.h +1246 -0
  260. data/vendor/eigen/Eigen/src/SVD/JacobiSVD.h +804 -0
  261. data/vendor/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +91 -0
  262. data/vendor/eigen/Eigen/src/SVD/SVDBase.h +315 -0
  263. data/vendor/eigen/Eigen/src/SVD/UpperBidiagonalization.h +414 -0
  264. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +689 -0
  265. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +199 -0
  266. data/vendor/eigen/Eigen/src/SparseCore/AmbiVector.h +377 -0
  267. data/vendor/eigen/Eigen/src/SparseCore/CompressedStorage.h +258 -0
  268. data/vendor/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +352 -0
  269. data/vendor/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +67 -0
  270. data/vendor/eigen/Eigen/src/SparseCore/SparseAssign.h +216 -0
  271. data/vendor/eigen/Eigen/src/SparseCore/SparseBlock.h +603 -0
  272. data/vendor/eigen/Eigen/src/SparseCore/SparseColEtree.h +206 -0
  273. data/vendor/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +341 -0
  274. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +726 -0
  275. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +148 -0
  276. data/vendor/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +320 -0
  277. data/vendor/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +138 -0
  278. data/vendor/eigen/Eigen/src/SparseCore/SparseDot.h +98 -0
  279. data/vendor/eigen/Eigen/src/SparseCore/SparseFuzzy.h +29 -0
  280. data/vendor/eigen/Eigen/src/SparseCore/SparseMap.h +305 -0
  281. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrix.h +1403 -0
  282. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +405 -0
  283. data/vendor/eigen/Eigen/src/SparseCore/SparsePermutation.h +178 -0
  284. data/vendor/eigen/Eigen/src/SparseCore/SparseProduct.h +169 -0
  285. data/vendor/eigen/Eigen/src/SparseCore/SparseRedux.h +49 -0
  286. data/vendor/eigen/Eigen/src/SparseCore/SparseRef.h +397 -0
  287. data/vendor/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +656 -0
  288. data/vendor/eigen/Eigen/src/SparseCore/SparseSolverBase.h +124 -0
  289. data/vendor/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +198 -0
  290. data/vendor/eigen/Eigen/src/SparseCore/SparseTranspose.h +92 -0
  291. data/vendor/eigen/Eigen/src/SparseCore/SparseTriangularView.h +189 -0
  292. data/vendor/eigen/Eigen/src/SparseCore/SparseUtil.h +178 -0
  293. data/vendor/eigen/Eigen/src/SparseCore/SparseVector.h +478 -0
  294. data/vendor/eigen/Eigen/src/SparseCore/SparseView.h +253 -0
  295. data/vendor/eigen/Eigen/src/SparseCore/TriangularSolver.h +315 -0
  296. data/vendor/eigen/Eigen/src/SparseLU/SparseLU.h +773 -0
  297. data/vendor/eigen/Eigen/src/SparseLU/SparseLUImpl.h +66 -0
  298. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +226 -0
  299. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +110 -0
  300. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +301 -0
  301. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +80 -0
  302. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +181 -0
  303. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +179 -0
  304. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +107 -0
  305. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +280 -0
  306. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +126 -0
  307. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +130 -0
  308. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +223 -0
  309. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +258 -0
  310. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +137 -0
  311. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +136 -0
  312. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +83 -0
  313. data/vendor/eigen/Eigen/src/SparseQR/SparseQR.h +745 -0
  314. data/vendor/eigen/Eigen/src/StlSupport/StdDeque.h +126 -0
  315. data/vendor/eigen/Eigen/src/StlSupport/StdList.h +106 -0
  316. data/vendor/eigen/Eigen/src/StlSupport/StdVector.h +131 -0
  317. data/vendor/eigen/Eigen/src/StlSupport/details.h +84 -0
  318. data/vendor/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +1027 -0
  319. data/vendor/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +506 -0
  320. data/vendor/eigen/Eigen/src/misc/Image.h +82 -0
  321. data/vendor/eigen/Eigen/src/misc/Kernel.h +79 -0
  322. data/vendor/eigen/Eigen/src/misc/RealSvd2x2.h +55 -0
  323. data/vendor/eigen/Eigen/src/misc/blas.h +440 -0
  324. data/vendor/eigen/Eigen/src/misc/lapack.h +152 -0
  325. data/vendor/eigen/Eigen/src/misc/lapacke.h +16291 -0
  326. data/vendor/eigen/Eigen/src/misc/lapacke_mangling.h +17 -0
  327. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +332 -0
  328. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +552 -0
  329. data/vendor/eigen/Eigen/src/plugins/BlockMethods.h +1058 -0
  330. data/vendor/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +115 -0
  331. data/vendor/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +163 -0
  332. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +152 -0
  333. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +85 -0
  334. data/vendor/eigen/README.md +3 -0
  335. data/vendor/eigen/bench/README.txt +55 -0
  336. data/vendor/eigen/bench/btl/COPYING +340 -0
  337. data/vendor/eigen/bench/btl/README +154 -0
  338. data/vendor/eigen/bench/tensors/README +21 -0
  339. data/vendor/eigen/blas/README.txt +6 -0
  340. data/vendor/eigen/demos/mandelbrot/README +10 -0
  341. data/vendor/eigen/demos/mix_eigen_and_c/README +9 -0
  342. data/vendor/eigen/demos/opengl/README +13 -0
  343. data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +1760 -0
  344. data/vendor/eigen/unsupported/README.txt +50 -0
  345. data/vendor/tomotopy/LICENSE +21 -0
  346. data/vendor/tomotopy/README.kr.rst +375 -0
  347. data/vendor/tomotopy/README.rst +382 -0
  348. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +362 -0
  349. data/vendor/tomotopy/src/Labeling/FoRelevance.h +88 -0
  350. data/vendor/tomotopy/src/Labeling/Labeler.h +50 -0
  351. data/vendor/tomotopy/src/TopicModel/CT.h +37 -0
  352. data/vendor/tomotopy/src/TopicModel/CTModel.cpp +13 -0
  353. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +293 -0
  354. data/vendor/tomotopy/src/TopicModel/DMR.h +51 -0
  355. data/vendor/tomotopy/src/TopicModel/DMRModel.cpp +13 -0
  356. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +374 -0
  357. data/vendor/tomotopy/src/TopicModel/DT.h +65 -0
  358. data/vendor/tomotopy/src/TopicModel/DTM.h +22 -0
  359. data/vendor/tomotopy/src/TopicModel/DTModel.cpp +15 -0
  360. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +572 -0
  361. data/vendor/tomotopy/src/TopicModel/GDMR.h +37 -0
  362. data/vendor/tomotopy/src/TopicModel/GDMRModel.cpp +14 -0
  363. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +485 -0
  364. data/vendor/tomotopy/src/TopicModel/HDP.h +74 -0
  365. data/vendor/tomotopy/src/TopicModel/HDPModel.cpp +13 -0
  366. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +592 -0
  367. data/vendor/tomotopy/src/TopicModel/HLDA.h +40 -0
  368. data/vendor/tomotopy/src/TopicModel/HLDAModel.cpp +13 -0
  369. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +681 -0
  370. data/vendor/tomotopy/src/TopicModel/HPA.h +27 -0
  371. data/vendor/tomotopy/src/TopicModel/HPAModel.cpp +21 -0
  372. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +588 -0
  373. data/vendor/tomotopy/src/TopicModel/LDA.h +144 -0
  374. data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +442 -0
  375. data/vendor/tomotopy/src/TopicModel/LDAModel.cpp +13 -0
  376. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +1058 -0
  377. data/vendor/tomotopy/src/TopicModel/LLDA.h +45 -0
  378. data/vendor/tomotopy/src/TopicModel/LLDAModel.cpp +13 -0
  379. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +203 -0
  380. data/vendor/tomotopy/src/TopicModel/MGLDA.h +63 -0
  381. data/vendor/tomotopy/src/TopicModel/MGLDAModel.cpp +17 -0
  382. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +558 -0
  383. data/vendor/tomotopy/src/TopicModel/PA.h +43 -0
  384. data/vendor/tomotopy/src/TopicModel/PAModel.cpp +13 -0
  385. data/vendor/tomotopy/src/TopicModel/PAModel.hpp +467 -0
  386. data/vendor/tomotopy/src/TopicModel/PLDA.h +17 -0
  387. data/vendor/tomotopy/src/TopicModel/PLDAModel.cpp +13 -0
  388. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +214 -0
  389. data/vendor/tomotopy/src/TopicModel/SLDA.h +54 -0
  390. data/vendor/tomotopy/src/TopicModel/SLDAModel.cpp +17 -0
  391. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +456 -0
  392. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +692 -0
  393. data/vendor/tomotopy/src/Utils/AliasMethod.hpp +169 -0
  394. data/vendor/tomotopy/src/Utils/Dictionary.h +80 -0
  395. data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +181 -0
  396. data/vendor/tomotopy/src/Utils/LBFGS.h +202 -0
  397. data/vendor/tomotopy/src/Utils/LBFGS/LineSearchBacktracking.h +120 -0
  398. data/vendor/tomotopy/src/Utils/LBFGS/LineSearchBracketing.h +122 -0
  399. data/vendor/tomotopy/src/Utils/LBFGS/Param.h +213 -0
  400. data/vendor/tomotopy/src/Utils/LUT.hpp +82 -0
  401. data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +69 -0
  402. data/vendor/tomotopy/src/Utils/PolyaGamma.hpp +200 -0
  403. data/vendor/tomotopy/src/Utils/PolyaGammaHybrid.hpp +672 -0
  404. data/vendor/tomotopy/src/Utils/ThreadPool.hpp +150 -0
  405. data/vendor/tomotopy/src/Utils/Trie.hpp +220 -0
  406. data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +94 -0
  407. data/vendor/tomotopy/src/Utils/Utils.hpp +337 -0
  408. data/vendor/tomotopy/src/Utils/avx_gamma.h +46 -0
  409. data/vendor/tomotopy/src/Utils/avx_mathfun.h +736 -0
  410. data/vendor/tomotopy/src/Utils/exception.h +28 -0
  411. data/vendor/tomotopy/src/Utils/math.h +281 -0
  412. data/vendor/tomotopy/src/Utils/rtnorm.hpp +2690 -0
  413. data/vendor/tomotopy/src/Utils/sample.hpp +192 -0
  414. data/vendor/tomotopy/src/Utils/serializer.hpp +695 -0
  415. data/vendor/tomotopy/src/Utils/slp.hpp +131 -0
  416. data/vendor/tomotopy/src/Utils/sse_gamma.h +48 -0
  417. data/vendor/tomotopy/src/Utils/sse_mathfun.h +710 -0
  418. data/vendor/tomotopy/src/Utils/text.hpp +49 -0
  419. data/vendor/tomotopy/src/Utils/tvector.hpp +543 -0
  420. metadata +531 -0
@@ -0,0 +1,895 @@
1
+ // This file is part of Eigen, a lightweight C++ template library
2
+ // for linear algebra.
3
+ //
4
+ // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5
+ //
6
+ // This Source Code Form is subject to the terms of the Mozilla
7
+ // Public License v. 2.0. If a copy of the MPL was not distributed
8
+ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
+
10
+ #ifndef EIGEN_PACKET_MATH_SSE_H
11
+ #define EIGEN_PACKET_MATH_SSE_H
12
+
13
+ namespace Eigen {
14
+
15
+ namespace internal {
16
+
17
+ #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
18
+ #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
19
+ #endif
20
+
21
+ #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
22
+ #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
23
+ #endif
24
+
25
+ #ifdef __FMA__
26
+ #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
27
+ #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
28
+ #endif
29
+ #endif
30
+
31
+ #if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX
32
+ // With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot
33
+ // have overloads for both types without linking error.
34
+ // One solution is to increase ABI version using -fabi-version=4 (or greater).
35
+ // Otherwise, we workaround this inconvenience by wrapping 128bit types into the following helper
36
+ // structure:
37
+ template<typename T>
38
+ struct eigen_packet_wrapper
39
+ {
40
+ EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
41
+ EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
42
+ EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}
43
+ EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {}
44
+ EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) {
45
+ m_val = v;
46
+ return *this;
47
+ }
48
+
49
+ T m_val;
50
+ };
51
+ typedef eigen_packet_wrapper<__m128> Packet4f;
52
+ typedef eigen_packet_wrapper<__m128i> Packet4i;
53
+ typedef eigen_packet_wrapper<__m128d> Packet2d;
54
+ #else
55
+ typedef __m128 Packet4f;
56
+ typedef __m128i Packet4i;
57
+ typedef __m128d Packet2d;
58
+ #endif
59
+
60
+ template<> struct is_arithmetic<__m128> { enum { value = true }; };
61
+ template<> struct is_arithmetic<__m128i> { enum { value = true }; };
62
+ template<> struct is_arithmetic<__m128d> { enum { value = true }; };
63
+
64
+ #define vec4f_swizzle1(v,p,q,r,s) \
65
+ (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))
66
+
67
+ #define vec4i_swizzle1(v,p,q,r,s) \
68
+ (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))
69
+
70
+ #define vec2d_swizzle1(v,p,q) \
71
+ (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))
72
+
73
+ #define vec4f_swizzle2(a,b,p,q,r,s) \
74
+ (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))
75
+
76
+ #define vec4i_swizzle2(a,b,p,q,r,s) \
77
+ (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
78
+
79
+ #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
80
+ const Packet4f p4f_##NAME = pset1<Packet4f>(X)
81
+
82
+ #define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
83
+ const Packet2d p2d_##NAME = pset1<Packet2d>(X)
84
+
85
+ #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
86
+ const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X))
87
+
88
+ #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
89
+ const Packet4i p4i_##NAME = pset1<Packet4i>(X)
90
+
91
+
92
// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
// to leverage AVX instructions.
#ifndef EIGEN_VECTORIZE_AVX
// SSE capabilities for float: 4 lanes; division and the listed math
// functions are vectorized (sin/cos/tanh only under EIGEN_FAST_MATH),
// and the rounding operations additionally require SSE4.1.
template<> struct packet_traits<float> : default_packet_traits
{
  typedef Packet4f type;
  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=4,
    HasHalfPacket = 0,

    HasDiv = 1,
    HasSin = EIGEN_FAST_MATH,
    HasCos = EIGEN_FAST_MATH,
    HasLog = 1,
    HasExp = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasTanh = EIGEN_FAST_MATH,
    HasBlend = 1

#ifdef EIGEN_VECTORIZE_SSE4_1
    ,
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1
#endif
  };
};
// SSE capabilities for double: 2 lanes; fewer vectorized math functions
// than float (no sin/cos/log here).
template<> struct packet_traits<double> : default_packet_traits
{
  typedef Packet2d type;
  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=2,
    HasHalfPacket = 0,

    HasDiv = 1,
    HasExp = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasBlend = 1

#ifdef EIGEN_VECTORIZE_SSE4_1
    ,
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1
#endif
  };
};
#endif
// int traits are declared unconditionally: the SSE path provides them even
// when AVX is enabled. Only blend is advertised beyond the defaults.
template<> struct packet_traits<int> : default_packet_traits
{
  typedef Packet4i type;
  typedef Packet4i half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=4,

    HasBlend = 1
  };
};
160
+
161
// Reverse mapping packet -> scalar type, lane count and load/store alignment.
// All three SSE packet types are 16 bytes and have no smaller half packet.
template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };

#ifndef EIGEN_VECTORIZE_AVX
// Cost-model entries for vectorized scalar division (AVX provides its own).
template<> struct scalar_div_cost<float,true> { enum { value = 7 }; };
template<> struct scalar_div_cost<double,true> { enum { value = 8 }; };
#endif
169
+
170
// pset1: broadcast a single scalar into every lane of a packet.
#if EIGEN_COMP_MSVC==1500
// Workaround MSVC 9 internal compiler error.
// TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32bits+SSE mode
// TODO: let's check whether there does not exist a better fix, like adding a pset0() function. (it crashed on pset1(0)).
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set_ps(from,from,from,from); }
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set_epi32(from,from,from,from); }
#else
// Normal path: use the dedicated broadcast intrinsics.
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set_ps1(from); }
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set1_epi32(from); }
#endif
182
+
183
// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
// However, using intrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203)
// Using inline assembly is also not an option because then gcc fails to reorder properly the instructions.
// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
// Also note that with AVX, we want it to generate a vbroadcastss.
#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
// Load one float and splat it to all four lanes (load_ss + swizzle).
template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
  return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
}
#endif

// plset: returns {a, a+1, a+2, ...} — a linearly spaced packet starting at a.
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
197
+
198
// Lane-wise addition.
template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }

// Lane-wise subtraction.
template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }

// Negation: floats/doubles flip the sign bit with an XOR mask; the mask
// constants place 0x80000000 at the top 32 bits of each lane.
template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
{
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
  return _mm_xor_ps(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
{
  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
  return _mm_xor_pd(a,mask);
}
// Integer negation: 0 - a.
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
{
  return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
}

// Conjugation is the identity for real packets.
template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
224
+
225
// Lane-wise multiplication.
template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_mullo_epi32(a,b);
#else
  // SSE2 lacks a 32-bit low multiply: compute two 32x32->64 products on the
  // even lanes (and on the swapped odd lanes), then re-interleave the low
  // 32-bit halves back into lane order.
  // this version is slightly faster than 4 scalar products
  return vec4i_swizzle1(
            vec4i_swizzle2(
              _mm_mul_epu32(a,b),
              _mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2),
                            vec4i_swizzle1(b,1,0,3,2)),
              0,2,0,2),
            0,2,1,3);
#endif
}

// Lane-wise division (floating point only).
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }

// for some weird reasons, it has to be overloaded for packet of integers
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
#ifdef __FMA__
// Fused multiply-add when the target supports FMA3.
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
#endif
252
+
253
// Lane-wise minimum.
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_min_epi32(a,b);
#else
  // after some bench, this version *is* faster than a scalar implementation
  // SSE2 fallback: select a where a<b, otherwise b, via a compare mask.
  Packet4i mask = _mm_cmplt_epi32(a,b);
  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
#endif
}

// Lane-wise maximum.
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_max_epi32(a,b);
#else
  // after some bench, this version *is* faster than a scalar implementation
  // SSE2 fallback: select a where a>b, otherwise b, via a compare mask.
  Packet4i mask = _mm_cmpgt_epi32(a,b);
  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
#endif
}

#ifdef EIGEN_VECTORIZE_SSE4_1
// Rounding operations are only available with SSE4.1 (rounding mode 0 =
// round using the current MXCSR rounding mode for pround).
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, 0); }

template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); }
template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); }

template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
#endif
289
+
290
// Bitwise AND / OR / XOR / AND-NOT, operating on the raw lane bits.
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }

// pandnot(a,b) computes (~a) & b per the underlying andnot intrinsics.
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }

// Aligned loads (caller guarantees 16-byte alignment).
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
309
+
310
// Unaligned loads. MSVC needs compiler-specific workarounds.
#if EIGEN_COMP_MSVC
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
#if (EIGEN_COMP_MSVC==1600)
  // NOTE Some version of MSVC10 generates bad code when using _mm_loadu_ps
  // (i.e., it does not generate an unaligned load!!
  // Assemble the packet from two 64-bit half loads instead.
  __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from));
  res = _mm_loadh_pi(res, (const __m64*)(from+2));
  return res;
#else
  return _mm_loadu_ps(from);
#endif
}
#else
// NOTE: with the code below, MSVC's compiler crashes!

template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_ps(from);
}
#endif

template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_pd(from);
}
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}


// ploaddup: load size/2 scalars and duplicate each one, e.g. {a,b} -> {a,a,b,b}.
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
  // Load the two floats as one 64-bit value, then splat each half.
  return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
}
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
{ return pset1<Packet2d>(from[0]); }
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
{
  Packet4i tmp;
  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
  return vec4i_swizzle1(tmp, 0, 0, 1, 1);
}
357
+
358
// Aligned stores (caller guarantees 16-byte alignment).
template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }

// Unaligned stores.
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
365
+
366
// pgather: assemble a packet from strided memory (element i read at from[i*stride]).
// SSE has no gather instruction, so elements are fetched scalar-wise.
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{
 return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
{
 return _mm_set_pd(from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
{
 return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}

// pscatter: the inverse of pgather — write lane i to to[i*stride].
// Each lane is rotated to position 0 and extracted with the scalar-move intrinsics.
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
{
  to[stride*0] = _mm_cvtss_f32(from);
  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
{
  to[stride*0] = _mm_cvtsd_f64(from);
  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
{
  to[stride*0] = _mm_cvtsi128_si32(from);
  to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
  to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
  to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
}
398
+
399
// pstore1: broadcast a scalar and store the full packet to aligned memory.
// some compilers might be tempted to perform multiple moves instead of using a vector path.
template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
{
  Packet4f pa = _mm_set_ss(a);
  pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0)));
}
// some compilers might be tempted to perform multiple moves instead of using a vector path.
template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
{
  Packet2d pa = _mm_set_sd(a);
  pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
}

// _mm_prefetch takes a const void* on PGI but a const char* elsewhere.
#if EIGEN_COMP_PGI
typedef const void * SsePrefetchPtrType;
#else
typedef const char * SsePrefetchPtrType;
#endif
417
+
418
// Software prefetch hints into all cache levels (T0). The AVX header
// provides these when AVX is enabled.
#ifndef EIGEN_VECTORIZE_AVX
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
#endif
423
+
424
// pfirst: extract lane 0 of a packet as a scalar.
#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
// Direct access of the struct members fixed bug #62.
template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
#elif EIGEN_COMP_MSVC_STRICT
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
#else
// Standard path: direct conversion intrinsics.
template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
#endif
440
+
441
// preverse: reverse the lane order (0x1B encodes the permutation 3,2,1,0).
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
{ return _mm_shuffle_ps(a,a,0x1B); }
template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
{ return _mm_shuffle_pd(a,a,0x1); }
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
{ return _mm_shuffle_epi32(a,0x1B); }

// pabs: absolute value. Floats/doubles clear the sign bit with an AND mask
// (the mask keeps every bit except each lane's top bit).
template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
{
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
  return _mm_and_ps(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
{
  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
  return _mm_and_pd(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
{
#ifdef EIGEN_VECTORIZE_SSSE3
  return _mm_abs_epi32(a);
#else
  // SSE2 fallback: aux = a >> 31 (all ones if negative), then (a^aux)-aux.
  Packet4i aux = _mm_srai_epi32(a,31);
  return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
#endif
}
467
+
468
// pbroadcast4: load 4 consecutive scalars and splat each into its own packet.
// with AVX, the default implementations based on pload1 are faster
#ifndef __AVX__
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4f>(const float *a,
                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
{
  // One vector load, then four swizzles; a3 is filled last since it holds
  // the loaded packet until then.
  a3 = pload<Packet4f>(a);
  a0 = vec4f_swizzle1(a3, 0,0,0,0);
  a1 = vec4f_swizzle1(a3, 1,1,1,1);
  a2 = vec4f_swizzle1(a3, 2,2,2,2);
  a3 = vec4f_swizzle1(a3, 3,3,3,3);
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet2d>(const double *a,
                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
{
#ifdef EIGEN_VECTORIZE_SSE3
  // SSE3 provides a dedicated load-and-duplicate instruction.
  a0 = _mm_loaddup_pd(a+0);
  a1 = _mm_loaddup_pd(a+1);
  a2 = _mm_loaddup_pd(a+2);
  a3 = _mm_loaddup_pd(a+3);
#else
  // SSE2: two vector loads, each split into two splats.
  a1 = pload<Packet2d>(a);
  a0 = vec2d_swizzle1(a1, 0,0);
  a1 = vec2d_swizzle1(a1, 1,1);
  a3 = pload<Packet2d>(a+2);
  a2 = vec2d_swizzle1(a3, 0,0);
  a3 = vec2d_swizzle1(a3, 1,1);
#endif
}
#endif
499
+
500
// Expands vecs[0] = {a,b,c,d} into vecs = {{a,a,a,a},{b,b,b,b},{c,c,c,c},{d,d,d,d}}.
// vecs[0] is overwritten last because it is the shuffle source.
EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
{
  vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
  vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
  vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
  vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
}
507
+
508
// preduxp: given `size` input packets, return one packet whose lane i is the
// horizontal sum of vecs[i].
#ifdef EIGEN_VECTORIZE_SSE3
template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
  return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
}

template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
  return _mm_hadd_pd(vecs[0], vecs[1]);
}

#else
// SSE2 fallback: transpose-like interleaving followed by vertical adds.
template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
  Packet4f tmp0, tmp1, tmp2;
  tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
  tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
  tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
  tmp0 = _mm_add_ps(tmp0, tmp1);
  tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
  tmp1 = _mm_add_ps(tmp1, tmp2);
  tmp2 = _mm_movehl_ps(tmp1, tmp0);
  tmp0 = _mm_movelh_ps(tmp0, tmp1);
  return _mm_add_ps(tmp0, tmp2);
}

template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
  return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
}
#endif  // SSE3
539
+
540
// predux: horizontal sum of all lanes.
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
  // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures
  // (from Nehalem to Haswell)
// #ifdef EIGEN_VECTORIZE_SSE3
//   Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3));
//   return pfirst<Packet4f>(_mm_hadd_ps(tmp, tmp));
// #else
  // Fold the high pair onto the low pair, then add the remaining two lanes.
  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
  return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
// #endif
}

template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
{
  // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures
  // (from Nehalem to Haswell)
// #ifdef EIGEN_VECTORIZE_SSE3
//   return pfirst<Packet2d>(_mm_hadd_pd(a, a));
// #else
  return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
// #endif
}
563
+
564
// Integer horizontal sums: SSSE3 gets the phaddd-based versions, SSE2 falls
// back to unpack/add sequences.
#ifdef EIGEN_VECTORIZE_SSSE3
template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
{
  return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
}
template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
  Packet4i tmp0 = _mm_hadd_epi32(a,a);
  return pfirst<Packet4i>(_mm_hadd_epi32(tmp0,tmp0));
}
#else
template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
  // Fold the high 64 bits onto the low 64 bits, then add the last two lanes
  // in scalar registers.
  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
  return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
}

template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
{
  Packet4i tmp0, tmp1, tmp2;
  tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
  tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
  tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
  tmp0 = _mm_add_epi32(tmp0, tmp1);
  tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
  tmp1 = _mm_add_epi32(tmp1, tmp2);
  tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
  tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
  return _mm_add_epi32(tmp0, tmp2);
}
#endif
595
+ // Other reduction functions:
596
+
597
+ // mul
598
+ template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
599
+ {
600
+ Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
601
+ return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
602
+ }
603
+ template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
604
+ {
605
+ return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
606
+ }
607
+ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
608
+ {
609
+ // after some experiments, it is seems this is the fastest way to implement it
610
+ // for GCC (eg., reusing pmul is very slow !)
611
+ // TODO try to call _mm_mul_epu32 directly
612
+ EIGEN_ALIGN16 int aux[4];
613
+ pstore(aux, a);
614
+ return (aux[0] * aux[1]) * (aux[2] * aux[3]);;
615
+ }
616
+
617
// min
// Horizontal minimum of all lanes.
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
{
  Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
  return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
{
  return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  // Vector min of {lanes 0,1} vs {lanes 2,3}, then of the surviving pair.
  Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
  return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
#else
  // after some experiments, it seems this is the fastest way to implement it
  // for GCC (eg., it does not like using std::min after the pstore !!)
  EIGEN_ALIGN16 int aux[4];
  pstore(aux, a);
  int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
  int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
  return aux0<aux2 ? aux0 : aux2;
#endif // EIGEN_VECTORIZE_SSE4_1
}

// max
// Horizontal maximum of all lanes (mirror of predux_min).
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
{
  Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
  return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
{
  return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
  return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
#else
  // after some experiments, it seems this is the fastest way to implement it
  // for GCC (eg., it does not like using std::min after the pstore !!)
  EIGEN_ALIGN16 int aux[4];
  pstore(aux, a);
  int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
  int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
  return aux0>aux2 ? aux0 : aux2;
#endif // EIGEN_VECTORIZE_SSE4_1
}
668
+
669
+ #if EIGEN_COMP_GNUC
670
+ // template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
671
+ // {
672
+ // Packet4f res = b;
673
+ // asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
674
+ // return res;
675
+ // }
676
+ // EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i& a, const Packet4i& b, const int i)
677
+ // {
678
+ // Packet4i res = a;
679
+ // asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
680
+ // return res;
681
+ // }
682
+ #endif
683
+
684
// palign_impl: shift `first` left by Offset lanes and fill the vacated high
// lanes from the low lanes of `second` (used by Eigen's unaligned kernels).
#ifdef EIGEN_VECTORIZE_SSSE3
// SSSE3 versions
// SSSE3 provides palignr, which does the concatenated byte shift directly.
template<int Offset>
struct palign_impl<Offset,Packet4f>
{
  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
  {
    if (Offset!=0)
      first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
  }
};

template<int Offset>
struct palign_impl<Offset,Packet4i>
{
  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
  {
    if (Offset!=0)
      first = _mm_alignr_epi8(second,first, Offset*4);
  }
};

template<int Offset>
struct palign_impl<Offset,Packet2d>
{
  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
  {
    // Only Offset==1 is meaningful for 2-lane packets (8-byte shift).
    if (Offset==1)
      first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
  }
};
#else
// SSE2 versions
// Without palignr, each Offset is emulated with move/shuffle sequences.
template<int Offset>
struct palign_impl<Offset,Packet4f>
{
  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
  {
    if (Offset==1)
    {
      first = _mm_move_ss(first,second);
      first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
    }
    else if (Offset==2)
    {
      first = _mm_movehl_ps(first,first);
      first = _mm_movelh_ps(first,second);
    }
    else if (Offset==3)
    {
      first = _mm_move_ss(first,second);
      first = _mm_shuffle_ps(first,second,0x93);
    }
  }
};

template<int Offset>
struct palign_impl<Offset,Packet4i>
{
  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
  {
    // Same sequences as the float version, with casts through __m128.
    if (Offset==1)
    {
      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
      first = _mm_shuffle_epi32(first,0x39);
    }
    else if (Offset==2)
    {
      first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
      first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
    }
    else if (Offset==3)
    {
      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
      first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
    }
  }
};

template<int Offset>
struct palign_impl<Offset,Packet2d>
{
  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
  {
    if (Offset==1)
    {
      first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first)));
      first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second)));
    }
  }
};
#endif
776
+
777
// In-place 4x4 transpose of a block of float packets (MMX/SSE macro).
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4f,4>& kernel) {
  _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
}

// 2x2 transpose of double packets via unpack lo/hi.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2d,2>& kernel) {
  __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
  kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
  kernel.packet[1] = tmp;
}

// 4x4 transpose of int packets: interleave 32-bit lanes, then 64-bit halves.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4i,4>& kernel) {
  __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
  __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
  __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
  __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);

  kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
  kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
  kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
  kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
}
801
+
802
// pblend: lane-wise select — thenPacket where ifPacket.select[i] is non-zero,
// elsePacket otherwise. false_mask is all-ones in lanes whose selector is 0,
// so blendv (which takes its second operand where the mask is set) and the
// SSE2 and/andnot fallback both pick elsePacket exactly in those lanes.
template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
  __m128i false_mask = _mm_cmpeq_epi32(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
#else
  return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
#endif
}
template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
  const __m128 zero = _mm_setzero_ps();
  const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
  __m128 false_mask = _mm_cmpeq_ps(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blendv_ps(thenPacket, elsePacket, false_mask);
#else
  return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
  const __m128d zero = _mm_setzero_pd();
  const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
  __m128d false_mask = _mm_cmpeq_pd(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blendv_pd(thenPacket, elsePacket, false_mask);
#else
  return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));
#endif
}
832
+
833
+ template<> EIGEN_STRONG_INLINE Packet4f pinsertfirst(const Packet4f& a, float b)
834
+ {
835
+ #ifdef EIGEN_VECTORIZE_SSE4_1
836
+ return _mm_blend_ps(a,pset1<Packet4f>(b),1);
837
+ #else
838
+ return _mm_move_ss(a, _mm_load_ss(&b));
839
+ #endif
840
+ }
841
+
842
+ template<> EIGEN_STRONG_INLINE Packet2d pinsertfirst(const Packet2d& a, double b)
843
+ {
844
+ #ifdef EIGEN_VECTORIZE_SSE4_1
845
+ return _mm_blend_pd(a,pset1<Packet2d>(b),1);
846
+ #else
847
+ return _mm_move_sd(a, _mm_load_sd(&b));
848
+ #endif
849
+ }
850
+
851
+ template<> EIGEN_STRONG_INLINE Packet4f pinsertlast(const Packet4f& a, float b)
852
+ {
853
+ #ifdef EIGEN_VECTORIZE_SSE4_1
854
+ return _mm_blend_ps(a,pset1<Packet4f>(b),(1<<3));
855
+ #else
856
+ const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x0,0x0,0x0,0xFFFFFFFF));
857
+ return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, pset1<Packet4f>(b)));
858
+ #endif
859
+ }
860
+
861
+ template<> EIGEN_STRONG_INLINE Packet2d pinsertlast(const Packet2d& a, double b)
862
+ {
863
+ #ifdef EIGEN_VECTORIZE_SSE4_1
864
+ return _mm_blend_pd(a,pset1<Packet2d>(b),(1<<1));
865
+ #else
866
+ const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x0,0xFFFFFFFF,0xFFFFFFFF));
867
+ return _mm_or_pd(_mm_andnot_pd(mask, a), _mm_and_pd(mask, pset1<Packet2d>(b)));
868
+ #endif
869
+ }
870
+
871
// Scalar path for pmadd with FMA to ensure consistency with vectorized path.
#ifdef __FMA__
// Fused multiply-add a*b+c computed with a single rounding step, so scalar
// remainders match results produced by the SIMD FMA code paths.
template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
  return ::fmaf(a,b,c);
}
template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
  return ::fma(a,b,c);
}
#endif
+ } // end namespace internal
882
+
883
+ } // end namespace Eigen
884
+
885
+ #if EIGEN_COMP_PGI
886
+ // PGI++ does not define the following intrinsics in C++ mode.
887
+ static inline __m128 _mm_castpd_ps (__m128d x) { return reinterpret_cast<__m128&>(x); }
888
+ static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
889
+ static inline __m128d _mm_castps_pd (__m128 x) { return reinterpret_cast<__m128d&>(x); }
890
+ static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); }
891
+ static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); }
892
+ static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
893
+ #endif
894
+
895
+ #endif // EIGEN_PACKET_MATH_SSE_H