tomoto 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (420) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +123 -0
  5. data/ext/tomoto/ext.cpp +245 -0
  6. data/ext/tomoto/extconf.rb +28 -0
  7. data/lib/tomoto.rb +12 -0
  8. data/lib/tomoto/ct.rb +11 -0
  9. data/lib/tomoto/hdp.rb +11 -0
  10. data/lib/tomoto/lda.rb +67 -0
  11. data/lib/tomoto/version.rb +3 -0
  12. data/vendor/EigenRand/EigenRand/Core.h +1139 -0
  13. data/vendor/EigenRand/EigenRand/Dists/Basic.h +111 -0
  14. data/vendor/EigenRand/EigenRand/Dists/Discrete.h +877 -0
  15. data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +108 -0
  16. data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +626 -0
  17. data/vendor/EigenRand/EigenRand/EigenRand +19 -0
  18. data/vendor/EigenRand/EigenRand/Macro.h +24 -0
  19. data/vendor/EigenRand/EigenRand/MorePacketMath.h +978 -0
  20. data/vendor/EigenRand/EigenRand/PacketFilter.h +286 -0
  21. data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +624 -0
  22. data/vendor/EigenRand/EigenRand/RandUtils.h +413 -0
  23. data/vendor/EigenRand/EigenRand/doc.h +220 -0
  24. data/vendor/EigenRand/LICENSE +21 -0
  25. data/vendor/EigenRand/README.md +288 -0
  26. data/vendor/eigen/COPYING.BSD +26 -0
  27. data/vendor/eigen/COPYING.GPL +674 -0
  28. data/vendor/eigen/COPYING.LGPL +502 -0
  29. data/vendor/eigen/COPYING.MINPACK +52 -0
  30. data/vendor/eigen/COPYING.MPL2 +373 -0
  31. data/vendor/eigen/COPYING.README +18 -0
  32. data/vendor/eigen/Eigen/CMakeLists.txt +19 -0
  33. data/vendor/eigen/Eigen/Cholesky +46 -0
  34. data/vendor/eigen/Eigen/CholmodSupport +48 -0
  35. data/vendor/eigen/Eigen/Core +537 -0
  36. data/vendor/eigen/Eigen/Dense +7 -0
  37. data/vendor/eigen/Eigen/Eigen +2 -0
  38. data/vendor/eigen/Eigen/Eigenvalues +61 -0
  39. data/vendor/eigen/Eigen/Geometry +62 -0
  40. data/vendor/eigen/Eigen/Householder +30 -0
  41. data/vendor/eigen/Eigen/IterativeLinearSolvers +48 -0
  42. data/vendor/eigen/Eigen/Jacobi +33 -0
  43. data/vendor/eigen/Eigen/LU +50 -0
  44. data/vendor/eigen/Eigen/MetisSupport +35 -0
  45. data/vendor/eigen/Eigen/OrderingMethods +73 -0
  46. data/vendor/eigen/Eigen/PaStiXSupport +48 -0
  47. data/vendor/eigen/Eigen/PardisoSupport +35 -0
  48. data/vendor/eigen/Eigen/QR +51 -0
  49. data/vendor/eigen/Eigen/QtAlignedMalloc +40 -0
  50. data/vendor/eigen/Eigen/SPQRSupport +34 -0
  51. data/vendor/eigen/Eigen/SVD +51 -0
  52. data/vendor/eigen/Eigen/Sparse +36 -0
  53. data/vendor/eigen/Eigen/SparseCholesky +45 -0
  54. data/vendor/eigen/Eigen/SparseCore +69 -0
  55. data/vendor/eigen/Eigen/SparseLU +46 -0
  56. data/vendor/eigen/Eigen/SparseQR +37 -0
  57. data/vendor/eigen/Eigen/StdDeque +27 -0
  58. data/vendor/eigen/Eigen/StdList +26 -0
  59. data/vendor/eigen/Eigen/StdVector +27 -0
  60. data/vendor/eigen/Eigen/SuperLUSupport +64 -0
  61. data/vendor/eigen/Eigen/UmfPackSupport +40 -0
  62. data/vendor/eigen/Eigen/src/Cholesky/LDLT.h +673 -0
  63. data/vendor/eigen/Eigen/src/Cholesky/LLT.h +542 -0
  64. data/vendor/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +99 -0
  65. data/vendor/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +639 -0
  66. data/vendor/eigen/Eigen/src/Core/Array.h +329 -0
  67. data/vendor/eigen/Eigen/src/Core/ArrayBase.h +226 -0
  68. data/vendor/eigen/Eigen/src/Core/ArrayWrapper.h +209 -0
  69. data/vendor/eigen/Eigen/src/Core/Assign.h +90 -0
  70. data/vendor/eigen/Eigen/src/Core/AssignEvaluator.h +935 -0
  71. data/vendor/eigen/Eigen/src/Core/Assign_MKL.h +178 -0
  72. data/vendor/eigen/Eigen/src/Core/BandMatrix.h +353 -0
  73. data/vendor/eigen/Eigen/src/Core/Block.h +452 -0
  74. data/vendor/eigen/Eigen/src/Core/BooleanRedux.h +164 -0
  75. data/vendor/eigen/Eigen/src/Core/CommaInitializer.h +160 -0
  76. data/vendor/eigen/Eigen/src/Core/ConditionEstimator.h +175 -0
  77. data/vendor/eigen/Eigen/src/Core/CoreEvaluators.h +1688 -0
  78. data/vendor/eigen/Eigen/src/Core/CoreIterators.h +127 -0
  79. data/vendor/eigen/Eigen/src/Core/CwiseBinaryOp.h +184 -0
  80. data/vendor/eigen/Eigen/src/Core/CwiseNullaryOp.h +866 -0
  81. data/vendor/eigen/Eigen/src/Core/CwiseTernaryOp.h +197 -0
  82. data/vendor/eigen/Eigen/src/Core/CwiseUnaryOp.h +103 -0
  83. data/vendor/eigen/Eigen/src/Core/CwiseUnaryView.h +128 -0
  84. data/vendor/eigen/Eigen/src/Core/DenseBase.h +611 -0
  85. data/vendor/eigen/Eigen/src/Core/DenseCoeffsBase.h +681 -0
  86. data/vendor/eigen/Eigen/src/Core/DenseStorage.h +570 -0
  87. data/vendor/eigen/Eigen/src/Core/Diagonal.h +260 -0
  88. data/vendor/eigen/Eigen/src/Core/DiagonalMatrix.h +343 -0
  89. data/vendor/eigen/Eigen/src/Core/DiagonalProduct.h +28 -0
  90. data/vendor/eigen/Eigen/src/Core/Dot.h +318 -0
  91. data/vendor/eigen/Eigen/src/Core/EigenBase.h +159 -0
  92. data/vendor/eigen/Eigen/src/Core/ForceAlignedAccess.h +146 -0
  93. data/vendor/eigen/Eigen/src/Core/Fuzzy.h +155 -0
  94. data/vendor/eigen/Eigen/src/Core/GeneralProduct.h +455 -0
  95. data/vendor/eigen/Eigen/src/Core/GenericPacketMath.h +593 -0
  96. data/vendor/eigen/Eigen/src/Core/GlobalFunctions.h +187 -0
  97. data/vendor/eigen/Eigen/src/Core/IO.h +225 -0
  98. data/vendor/eigen/Eigen/src/Core/Inverse.h +118 -0
  99. data/vendor/eigen/Eigen/src/Core/Map.h +171 -0
  100. data/vendor/eigen/Eigen/src/Core/MapBase.h +303 -0
  101. data/vendor/eigen/Eigen/src/Core/MathFunctions.h +1415 -0
  102. data/vendor/eigen/Eigen/src/Core/MathFunctionsImpl.h +101 -0
  103. data/vendor/eigen/Eigen/src/Core/Matrix.h +459 -0
  104. data/vendor/eigen/Eigen/src/Core/MatrixBase.h +529 -0
  105. data/vendor/eigen/Eigen/src/Core/NestByValue.h +110 -0
  106. data/vendor/eigen/Eigen/src/Core/NoAlias.h +108 -0
  107. data/vendor/eigen/Eigen/src/Core/NumTraits.h +248 -0
  108. data/vendor/eigen/Eigen/src/Core/PermutationMatrix.h +633 -0
  109. data/vendor/eigen/Eigen/src/Core/PlainObjectBase.h +1035 -0
  110. data/vendor/eigen/Eigen/src/Core/Product.h +186 -0
  111. data/vendor/eigen/Eigen/src/Core/ProductEvaluators.h +1112 -0
  112. data/vendor/eigen/Eigen/src/Core/Random.h +182 -0
  113. data/vendor/eigen/Eigen/src/Core/Redux.h +505 -0
  114. data/vendor/eigen/Eigen/src/Core/Ref.h +283 -0
  115. data/vendor/eigen/Eigen/src/Core/Replicate.h +142 -0
  116. data/vendor/eigen/Eigen/src/Core/ReturnByValue.h +117 -0
  117. data/vendor/eigen/Eigen/src/Core/Reverse.h +211 -0
  118. data/vendor/eigen/Eigen/src/Core/Select.h +162 -0
  119. data/vendor/eigen/Eigen/src/Core/SelfAdjointView.h +352 -0
  120. data/vendor/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +47 -0
  121. data/vendor/eigen/Eigen/src/Core/Solve.h +188 -0
  122. data/vendor/eigen/Eigen/src/Core/SolveTriangular.h +235 -0
  123. data/vendor/eigen/Eigen/src/Core/SolverBase.h +130 -0
  124. data/vendor/eigen/Eigen/src/Core/StableNorm.h +221 -0
  125. data/vendor/eigen/Eigen/src/Core/Stride.h +111 -0
  126. data/vendor/eigen/Eigen/src/Core/Swap.h +67 -0
  127. data/vendor/eigen/Eigen/src/Core/Transpose.h +403 -0
  128. data/vendor/eigen/Eigen/src/Core/Transpositions.h +407 -0
  129. data/vendor/eigen/Eigen/src/Core/TriangularMatrix.h +983 -0
  130. data/vendor/eigen/Eigen/src/Core/VectorBlock.h +96 -0
  131. data/vendor/eigen/Eigen/src/Core/VectorwiseOp.h +695 -0
  132. data/vendor/eigen/Eigen/src/Core/Visitor.h +273 -0
  133. data/vendor/eigen/Eigen/src/Core/arch/AVX/Complex.h +451 -0
  134. data/vendor/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +439 -0
  135. data/vendor/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +637 -0
  136. data/vendor/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +51 -0
  137. data/vendor/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +391 -0
  138. data/vendor/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1316 -0
  139. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +430 -0
  140. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +322 -0
  141. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +1061 -0
  142. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Complex.h +103 -0
  143. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Half.h +674 -0
  144. data/vendor/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +91 -0
  145. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +333 -0
  146. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +1124 -0
  147. data/vendor/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +212 -0
  148. data/vendor/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +29 -0
  149. data/vendor/eigen/Eigen/src/Core/arch/Default/Settings.h +49 -0
  150. data/vendor/eigen/Eigen/src/Core/arch/NEON/Complex.h +490 -0
  151. data/vendor/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +91 -0
  152. data/vendor/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +760 -0
  153. data/vendor/eigen/Eigen/src/Core/arch/SSE/Complex.h +471 -0
  154. data/vendor/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +562 -0
  155. data/vendor/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +895 -0
  156. data/vendor/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +77 -0
  157. data/vendor/eigen/Eigen/src/Core/arch/ZVector/Complex.h +397 -0
  158. data/vendor/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +137 -0
  159. data/vendor/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +945 -0
  160. data/vendor/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +168 -0
  161. data/vendor/eigen/Eigen/src/Core/functors/BinaryFunctors.h +475 -0
  162. data/vendor/eigen/Eigen/src/Core/functors/NullaryFunctors.h +188 -0
  163. data/vendor/eigen/Eigen/src/Core/functors/StlFunctors.h +136 -0
  164. data/vendor/eigen/Eigen/src/Core/functors/TernaryFunctors.h +25 -0
  165. data/vendor/eigen/Eigen/src/Core/functors/UnaryFunctors.h +792 -0
  166. data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2156 -0
  167. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +492 -0
  168. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +311 -0
  169. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +145 -0
  170. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +122 -0
  171. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +619 -0
  172. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +136 -0
  173. data/vendor/eigen/Eigen/src/Core/products/Parallelizer.h +163 -0
  174. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +521 -0
  175. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +287 -0
  176. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +260 -0
  177. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +118 -0
  178. data/vendor/eigen/Eigen/src/Core/products/SelfadjointProduct.h +133 -0
  179. data/vendor/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +93 -0
  180. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +466 -0
  181. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +315 -0
  182. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +350 -0
  183. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +255 -0
  184. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +335 -0
  185. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +163 -0
  186. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverVector.h +145 -0
  187. data/vendor/eigen/Eigen/src/Core/util/BlasUtil.h +398 -0
  188. data/vendor/eigen/Eigen/src/Core/util/Constants.h +547 -0
  189. data/vendor/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +83 -0
  190. data/vendor/eigen/Eigen/src/Core/util/ForwardDeclarations.h +302 -0
  191. data/vendor/eigen/Eigen/src/Core/util/MKL_support.h +130 -0
  192. data/vendor/eigen/Eigen/src/Core/util/Macros.h +1001 -0
  193. data/vendor/eigen/Eigen/src/Core/util/Memory.h +993 -0
  194. data/vendor/eigen/Eigen/src/Core/util/Meta.h +534 -0
  195. data/vendor/eigen/Eigen/src/Core/util/NonMPL2.h +3 -0
  196. data/vendor/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +27 -0
  197. data/vendor/eigen/Eigen/src/Core/util/StaticAssert.h +218 -0
  198. data/vendor/eigen/Eigen/src/Core/util/XprHelper.h +821 -0
  199. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +346 -0
  200. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +459 -0
  201. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +91 -0
  202. data/vendor/eigen/Eigen/src/Eigenvalues/EigenSolver.h +622 -0
  203. data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +418 -0
  204. data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +226 -0
  205. data/vendor/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +374 -0
  206. data/vendor/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +158 -0
  207. data/vendor/eigen/Eigen/src/Eigenvalues/RealQZ.h +654 -0
  208. data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur.h +546 -0
  209. data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +77 -0
  210. data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +870 -0
  211. data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +87 -0
  212. data/vendor/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +556 -0
  213. data/vendor/eigen/Eigen/src/Geometry/AlignedBox.h +392 -0
  214. data/vendor/eigen/Eigen/src/Geometry/AngleAxis.h +247 -0
  215. data/vendor/eigen/Eigen/src/Geometry/EulerAngles.h +114 -0
  216. data/vendor/eigen/Eigen/src/Geometry/Homogeneous.h +497 -0
  217. data/vendor/eigen/Eigen/src/Geometry/Hyperplane.h +282 -0
  218. data/vendor/eigen/Eigen/src/Geometry/OrthoMethods.h +234 -0
  219. data/vendor/eigen/Eigen/src/Geometry/ParametrizedLine.h +195 -0
  220. data/vendor/eigen/Eigen/src/Geometry/Quaternion.h +814 -0
  221. data/vendor/eigen/Eigen/src/Geometry/Rotation2D.h +199 -0
  222. data/vendor/eigen/Eigen/src/Geometry/RotationBase.h +206 -0
  223. data/vendor/eigen/Eigen/src/Geometry/Scaling.h +170 -0
  224. data/vendor/eigen/Eigen/src/Geometry/Transform.h +1542 -0
  225. data/vendor/eigen/Eigen/src/Geometry/Translation.h +208 -0
  226. data/vendor/eigen/Eigen/src/Geometry/Umeyama.h +166 -0
  227. data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +161 -0
  228. data/vendor/eigen/Eigen/src/Householder/BlockHouseholder.h +103 -0
  229. data/vendor/eigen/Eigen/src/Householder/Householder.h +172 -0
  230. data/vendor/eigen/Eigen/src/Householder/HouseholderSequence.h +470 -0
  231. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +226 -0
  232. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +228 -0
  233. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +246 -0
  234. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +400 -0
  235. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +462 -0
  236. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +394 -0
  237. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +216 -0
  238. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +115 -0
  239. data/vendor/eigen/Eigen/src/Jacobi/Jacobi.h +462 -0
  240. data/vendor/eigen/Eigen/src/LU/Determinant.h +101 -0
  241. data/vendor/eigen/Eigen/src/LU/FullPivLU.h +891 -0
  242. data/vendor/eigen/Eigen/src/LU/InverseImpl.h +415 -0
  243. data/vendor/eigen/Eigen/src/LU/PartialPivLU.h +611 -0
  244. data/vendor/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +83 -0
  245. data/vendor/eigen/Eigen/src/LU/arch/Inverse_SSE.h +338 -0
  246. data/vendor/eigen/Eigen/src/MetisSupport/MetisSupport.h +137 -0
  247. data/vendor/eigen/Eigen/src/OrderingMethods/Amd.h +445 -0
  248. data/vendor/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +1843 -0
  249. data/vendor/eigen/Eigen/src/OrderingMethods/Ordering.h +157 -0
  250. data/vendor/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +678 -0
  251. data/vendor/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +543 -0
  252. data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR.h +653 -0
  253. data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +97 -0
  254. data/vendor/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +562 -0
  255. data/vendor/eigen/Eigen/src/QR/FullPivHouseholderQR.h +676 -0
  256. data/vendor/eigen/Eigen/src/QR/HouseholderQR.h +409 -0
  257. data/vendor/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +68 -0
  258. data/vendor/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +313 -0
  259. data/vendor/eigen/Eigen/src/SVD/BDCSVD.h +1246 -0
  260. data/vendor/eigen/Eigen/src/SVD/JacobiSVD.h +804 -0
  261. data/vendor/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +91 -0
  262. data/vendor/eigen/Eigen/src/SVD/SVDBase.h +315 -0
  263. data/vendor/eigen/Eigen/src/SVD/UpperBidiagonalization.h +414 -0
  264. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +689 -0
  265. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +199 -0
  266. data/vendor/eigen/Eigen/src/SparseCore/AmbiVector.h +377 -0
  267. data/vendor/eigen/Eigen/src/SparseCore/CompressedStorage.h +258 -0
  268. data/vendor/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +352 -0
  269. data/vendor/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +67 -0
  270. data/vendor/eigen/Eigen/src/SparseCore/SparseAssign.h +216 -0
  271. data/vendor/eigen/Eigen/src/SparseCore/SparseBlock.h +603 -0
  272. data/vendor/eigen/Eigen/src/SparseCore/SparseColEtree.h +206 -0
  273. data/vendor/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +341 -0
  274. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +726 -0
  275. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +148 -0
  276. data/vendor/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +320 -0
  277. data/vendor/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +138 -0
  278. data/vendor/eigen/Eigen/src/SparseCore/SparseDot.h +98 -0
  279. data/vendor/eigen/Eigen/src/SparseCore/SparseFuzzy.h +29 -0
  280. data/vendor/eigen/Eigen/src/SparseCore/SparseMap.h +305 -0
  281. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrix.h +1403 -0
  282. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +405 -0
  283. data/vendor/eigen/Eigen/src/SparseCore/SparsePermutation.h +178 -0
  284. data/vendor/eigen/Eigen/src/SparseCore/SparseProduct.h +169 -0
  285. data/vendor/eigen/Eigen/src/SparseCore/SparseRedux.h +49 -0
  286. data/vendor/eigen/Eigen/src/SparseCore/SparseRef.h +397 -0
  287. data/vendor/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +656 -0
  288. data/vendor/eigen/Eigen/src/SparseCore/SparseSolverBase.h +124 -0
  289. data/vendor/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +198 -0
  290. data/vendor/eigen/Eigen/src/SparseCore/SparseTranspose.h +92 -0
  291. data/vendor/eigen/Eigen/src/SparseCore/SparseTriangularView.h +189 -0
  292. data/vendor/eigen/Eigen/src/SparseCore/SparseUtil.h +178 -0
  293. data/vendor/eigen/Eigen/src/SparseCore/SparseVector.h +478 -0
  294. data/vendor/eigen/Eigen/src/SparseCore/SparseView.h +253 -0
  295. data/vendor/eigen/Eigen/src/SparseCore/TriangularSolver.h +315 -0
  296. data/vendor/eigen/Eigen/src/SparseLU/SparseLU.h +773 -0
  297. data/vendor/eigen/Eigen/src/SparseLU/SparseLUImpl.h +66 -0
  298. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +226 -0
  299. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +110 -0
  300. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +301 -0
  301. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +80 -0
  302. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +181 -0
  303. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +179 -0
  304. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +107 -0
  305. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +280 -0
  306. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +126 -0
  307. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +130 -0
  308. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +223 -0
  309. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +258 -0
  310. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +137 -0
  311. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +136 -0
  312. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +83 -0
  313. data/vendor/eigen/Eigen/src/SparseQR/SparseQR.h +745 -0
  314. data/vendor/eigen/Eigen/src/StlSupport/StdDeque.h +126 -0
  315. data/vendor/eigen/Eigen/src/StlSupport/StdList.h +106 -0
  316. data/vendor/eigen/Eigen/src/StlSupport/StdVector.h +131 -0
  317. data/vendor/eigen/Eigen/src/StlSupport/details.h +84 -0
  318. data/vendor/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +1027 -0
  319. data/vendor/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +506 -0
  320. data/vendor/eigen/Eigen/src/misc/Image.h +82 -0
  321. data/vendor/eigen/Eigen/src/misc/Kernel.h +79 -0
  322. data/vendor/eigen/Eigen/src/misc/RealSvd2x2.h +55 -0
  323. data/vendor/eigen/Eigen/src/misc/blas.h +440 -0
  324. data/vendor/eigen/Eigen/src/misc/lapack.h +152 -0
  325. data/vendor/eigen/Eigen/src/misc/lapacke.h +16291 -0
  326. data/vendor/eigen/Eigen/src/misc/lapacke_mangling.h +17 -0
  327. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +332 -0
  328. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +552 -0
  329. data/vendor/eigen/Eigen/src/plugins/BlockMethods.h +1058 -0
  330. data/vendor/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +115 -0
  331. data/vendor/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +163 -0
  332. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +152 -0
  333. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +85 -0
  334. data/vendor/eigen/README.md +3 -0
  335. data/vendor/eigen/bench/README.txt +55 -0
  336. data/vendor/eigen/bench/btl/COPYING +340 -0
  337. data/vendor/eigen/bench/btl/README +154 -0
  338. data/vendor/eigen/bench/tensors/README +21 -0
  339. data/vendor/eigen/blas/README.txt +6 -0
  340. data/vendor/eigen/demos/mandelbrot/README +10 -0
  341. data/vendor/eigen/demos/mix_eigen_and_c/README +9 -0
  342. data/vendor/eigen/demos/opengl/README +13 -0
  343. data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +1760 -0
  344. data/vendor/eigen/unsupported/README.txt +50 -0
  345. data/vendor/tomotopy/LICENSE +21 -0
  346. data/vendor/tomotopy/README.kr.rst +375 -0
  347. data/vendor/tomotopy/README.rst +382 -0
  348. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +362 -0
  349. data/vendor/tomotopy/src/Labeling/FoRelevance.h +88 -0
  350. data/vendor/tomotopy/src/Labeling/Labeler.h +50 -0
  351. data/vendor/tomotopy/src/TopicModel/CT.h +37 -0
  352. data/vendor/tomotopy/src/TopicModel/CTModel.cpp +13 -0
  353. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +293 -0
  354. data/vendor/tomotopy/src/TopicModel/DMR.h +51 -0
  355. data/vendor/tomotopy/src/TopicModel/DMRModel.cpp +13 -0
  356. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +374 -0
  357. data/vendor/tomotopy/src/TopicModel/DT.h +65 -0
  358. data/vendor/tomotopy/src/TopicModel/DTM.h +22 -0
  359. data/vendor/tomotopy/src/TopicModel/DTModel.cpp +15 -0
  360. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +572 -0
  361. data/vendor/tomotopy/src/TopicModel/GDMR.h +37 -0
  362. data/vendor/tomotopy/src/TopicModel/GDMRModel.cpp +14 -0
  363. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +485 -0
  364. data/vendor/tomotopy/src/TopicModel/HDP.h +74 -0
  365. data/vendor/tomotopy/src/TopicModel/HDPModel.cpp +13 -0
  366. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +592 -0
  367. data/vendor/tomotopy/src/TopicModel/HLDA.h +40 -0
  368. data/vendor/tomotopy/src/TopicModel/HLDAModel.cpp +13 -0
  369. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +681 -0
  370. data/vendor/tomotopy/src/TopicModel/HPA.h +27 -0
  371. data/vendor/tomotopy/src/TopicModel/HPAModel.cpp +21 -0
  372. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +588 -0
  373. data/vendor/tomotopy/src/TopicModel/LDA.h +144 -0
  374. data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +442 -0
  375. data/vendor/tomotopy/src/TopicModel/LDAModel.cpp +13 -0
  376. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +1058 -0
  377. data/vendor/tomotopy/src/TopicModel/LLDA.h +45 -0
  378. data/vendor/tomotopy/src/TopicModel/LLDAModel.cpp +13 -0
  379. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +203 -0
  380. data/vendor/tomotopy/src/TopicModel/MGLDA.h +63 -0
  381. data/vendor/tomotopy/src/TopicModel/MGLDAModel.cpp +17 -0
  382. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +558 -0
  383. data/vendor/tomotopy/src/TopicModel/PA.h +43 -0
  384. data/vendor/tomotopy/src/TopicModel/PAModel.cpp +13 -0
  385. data/vendor/tomotopy/src/TopicModel/PAModel.hpp +467 -0
  386. data/vendor/tomotopy/src/TopicModel/PLDA.h +17 -0
  387. data/vendor/tomotopy/src/TopicModel/PLDAModel.cpp +13 -0
  388. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +214 -0
  389. data/vendor/tomotopy/src/TopicModel/SLDA.h +54 -0
  390. data/vendor/tomotopy/src/TopicModel/SLDAModel.cpp +17 -0
  391. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +456 -0
  392. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +692 -0
  393. data/vendor/tomotopy/src/Utils/AliasMethod.hpp +169 -0
  394. data/vendor/tomotopy/src/Utils/Dictionary.h +80 -0
  395. data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +181 -0
  396. data/vendor/tomotopy/src/Utils/LBFGS.h +202 -0
  397. data/vendor/tomotopy/src/Utils/LBFGS/LineSearchBacktracking.h +120 -0
  398. data/vendor/tomotopy/src/Utils/LBFGS/LineSearchBracketing.h +122 -0
  399. data/vendor/tomotopy/src/Utils/LBFGS/Param.h +213 -0
  400. data/vendor/tomotopy/src/Utils/LUT.hpp +82 -0
  401. data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +69 -0
  402. data/vendor/tomotopy/src/Utils/PolyaGamma.hpp +200 -0
  403. data/vendor/tomotopy/src/Utils/PolyaGammaHybrid.hpp +672 -0
  404. data/vendor/tomotopy/src/Utils/ThreadPool.hpp +150 -0
  405. data/vendor/tomotopy/src/Utils/Trie.hpp +220 -0
  406. data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +94 -0
  407. data/vendor/tomotopy/src/Utils/Utils.hpp +337 -0
  408. data/vendor/tomotopy/src/Utils/avx_gamma.h +46 -0
  409. data/vendor/tomotopy/src/Utils/avx_mathfun.h +736 -0
  410. data/vendor/tomotopy/src/Utils/exception.h +28 -0
  411. data/vendor/tomotopy/src/Utils/math.h +281 -0
  412. data/vendor/tomotopy/src/Utils/rtnorm.hpp +2690 -0
  413. data/vendor/tomotopy/src/Utils/sample.hpp +192 -0
  414. data/vendor/tomotopy/src/Utils/serializer.hpp +695 -0
  415. data/vendor/tomotopy/src/Utils/slp.hpp +131 -0
  416. data/vendor/tomotopy/src/Utils/sse_gamma.h +48 -0
  417. data/vendor/tomotopy/src/Utils/sse_mathfun.h +710 -0
  418. data/vendor/tomotopy/src/Utils/text.hpp +49 -0
  419. data/vendor/tomotopy/src/Utils/tvector.hpp +543 -0
  420. metadata +531 -0
@@ -0,0 +1,122 @@
1
+ /*
2
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without modification,
5
+ are permitted provided that the following conditions are met:
6
+
7
+ * Redistributions of source code must retain the above copyright notice, this
8
+ list of conditions and the following disclaimer.
9
+ * Redistributions in binary form must reproduce the above copyright notice,
10
+ this list of conditions and the following disclaimer in the documentation
11
+ and/or other materials provided with the distribution.
12
+ * Neither the name of Intel Corporation nor the names of its contributors may
13
+ be used to endorse or promote products derived from this software without
14
+ specific prior written permission.
15
+
16
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
23
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ ********************************************************************************
28
+ * Content : Eigen bindings to BLAS F77
29
+ * General matrix-matrix product functionality based on ?GEMM.
30
+ ********************************************************************************
31
+ */
32
+
33
+ #ifndef EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
34
+ #define EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
35
+
36
+ namespace Eigen {
37
+
38
+ namespace internal {
39
+
40
+ /**********************************************************************
41
+ * This file implements general matrix-matrix multiplication using BLAS
42
+ * gemm function via partial specialization of
43
+ * general_matrix_matrix_product::run(..) method for float, double,
44
+ * std::complex<float> and std::complex<double> types
45
+ **********************************************************************/
46
+
47
+ // gemm specialization
48
+
49
+ #define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASFUNC) \
50
+ template< \
51
+ typename Index, \
52
+ int LhsStorageOrder, bool ConjugateLhs, \
53
+ int RhsStorageOrder, bool ConjugateRhs> \
54
+ struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor> \
55
+ { \
56
+ typedef gebp_traits<EIGTYPE,EIGTYPE> Traits; \
57
+ \
58
+ static void run(Index rows, Index cols, Index depth, \
59
+ const EIGTYPE* _lhs, Index lhsStride, \
60
+ const EIGTYPE* _rhs, Index rhsStride, \
61
+ EIGTYPE* res, Index resStride, \
62
+ EIGTYPE alpha, \
63
+ level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, \
64
+ GemmParallelInfo<Index>* /*info = 0*/) \
65
+ { \
66
+ using std::conj; \
67
+ \
68
+ char transa, transb; \
69
+ BlasIndex m, n, k, lda, ldb, ldc; \
70
+ const EIGTYPE *a, *b; \
71
+ EIGTYPE beta(1); \
72
+ MatrixX##EIGPREFIX a_tmp, b_tmp; \
73
+ \
74
+ /* Set transpose options */ \
75
+ transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
76
+ transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
77
+ \
78
+ /* Set m, n, k */ \
79
+ m = convert_index<BlasIndex>(rows); \
80
+ n = convert_index<BlasIndex>(cols); \
81
+ k = convert_index<BlasIndex>(depth); \
82
+ \
83
+ /* Set lda, ldb, ldc */ \
84
+ lda = convert_index<BlasIndex>(lhsStride); \
85
+ ldb = convert_index<BlasIndex>(rhsStride); \
86
+ ldc = convert_index<BlasIndex>(resStride); \
87
+ \
88
+ /* Set a, b, c */ \
89
+ if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \
90
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \
91
+ a_tmp = lhs.conjugate(); \
92
+ a = a_tmp.data(); \
93
+ lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
94
+ } else a = _lhs; \
95
+ \
96
+ if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \
97
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \
98
+ b_tmp = rhs.conjugate(); \
99
+ b = b_tmp.data(); \
100
+ ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
101
+ } else b = _rhs; \
102
+ \
103
+ BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
104
+ }};
105
+
106
+ #ifdef EIGEN_USE_MKL
107
+ GEMM_SPECIALIZATION(double, d, double, dgemm)
108
+ GEMM_SPECIALIZATION(float, f, float, sgemm)
109
+ GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm)
110
+ GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, cgemm)
111
+ #else
112
+ GEMM_SPECIALIZATION(double, d, double, dgemm_)
113
+ GEMM_SPECIALIZATION(float, f, float, sgemm_)
114
+ GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_)
115
+ GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_)
116
+ #endif
117
+
118
+ } // end namespace internal
119
+
120
+ } // end namespace Eigen
121
+
122
+ #endif // EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
@@ -0,0 +1,619 @@
1
+ // This file is part of Eigen, a lightweight C++ template library
2
+ // for linear algebra.
3
+ //
4
+ // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5
+ //
6
+ // This Source Code Form is subject to the terms of the Mozilla
7
+ // Public License v. 2.0. If a copy of the MPL was not distributed
8
+ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
+
10
+ #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
11
+ #define EIGEN_GENERAL_MATRIX_VECTOR_H
12
+
13
+ namespace Eigen {
14
+
15
+ namespace internal {
16
+
17
+ /* Optimized col-major matrix * vector product:
18
+ * This algorithm processes 4 columns at once, which allows both to reduce
19
+ * the number of load/stores of the result by a factor 4 and to reduce
20
+ * the instruction dependency. Moreover, we know that all bands have the
21
+ * same alignment pattern.
22
+ *
23
+ * Mixing type logic: C += alpha * A * B
24
+ * | A | B |alpha| comments
25
+ * |real |cplx |cplx | no vectorization
26
+ * |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
27
+ * |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
28
+ * |cplx |real |real | optimal case, vectorization possible via real-cplx mul
29
+ *
30
+ * Accesses to the matrix coefficients follow the following logic:
31
+ *
32
+ * - if all columns have the same alignment then
33
+ * - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
34
+ * - otherwise perform unaligned loads only (-> NoneAligned case)
35
+ * - otherwise
36
+ * - if even columns have the same alignment then
37
+ * // odd columns are guaranteed to have the same alignment too
38
+ * - if even or odd columns have the same alignment as the result, then
39
+ * // for a register size of 2 scalars, this is guaranteed to be the case (e.g., SSE with double)
40
+ * - perform half aligned and half unaligned loads (-> EvenAligned case)
41
+ * - otherwise perform unaligned loads only (-> NoneAligned case)
42
+ * - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
43
+ * - one over 4 consecutive columns is guaranteed to be aligned with the result vector,
44
+ * perform simple aligned loads for this column and aligned loads plus re-alignment for the other. (-> FirstAligned case)
45
+ * // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
46
+ * - otherwise,
47
+ * // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
48
+ * // we currently fall back to the NoneAligned case
49
+ *
50
+ * The same reasoning applies to the transposed case.
51
+ *
52
+ * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
53
+ * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
54
+ * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow
55
+ * compared to unaligned loads on a 4 byte boundary.
56
+ *
57
+ */
58
// Specialization of the GEMV kernel for a column-major lhs.
// The actual computation (res += alpha * lhs * rhs, see the out-of-class
// definition of run() below) processes four lhs columns at a time.
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
  // Result scalar follows Eigen's mixed-type promotion rules (e.g. real*complex -> complex).
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

enum {
  // Explicit SIMD is only enabled when both sides are vectorizable AND their
  // packets hold the same number of scalars; otherwise all *PacketSize fall back to 1.
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
};

typedef typename packet_traits<LhsScalar>::type _LhsPacket;
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
typedef typename packet_traits<ResScalar>::type _ResPacket;

// When not vectorizable, "packets" degenerate to plain scalars so the same
// code path compiles for both cases.
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

// Performs res += alpha * lhs * rhs (column-major lhs).
// Note: alpha has the rhs scalar type here — see the mixing-type table in the
// comment at the top of this file.  resIncr must be 1 (asserted in the definition).
EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsMapper& lhs,
  const RhsMapper& rhs,
  ResScalar* res, Index resIncr,
  RhsScalar alpha);
};
86
+
87
// Column-major GEMV kernel: res += alpha * lhs * rhs, processing 4 columns of
// lhs per iteration.  The alignment-pattern machinery (AllAligned / EvenAligned /
// FirstAligned / NoneAligned) is described in the comment at the top of this file.
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsMapper& lhs,
  const RhsMapper& rhs,
  ResScalar* res, Index resIncr,
  RhsScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(resIncr);
  eigen_internal_assert(resIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif
  // One packet's worth of the accumulation:
  //   res[j..j+ResPacketSize) += lhs0*ptmp0 + lhs1*ptmp1 + lhs2*ptmp2 + lhs3*ptmp3
  // where AlignmentX selects aligned vs unaligned loads per column.
  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
    pstore(&res[j], \
      padd(pload<ResPacket>(&res[j]), \
        padd( \
          padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j), ptmp0), \
               pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j), ptmp1)), \
          padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j), ptmp2), \
               pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j), ptmp3)) )))

  typedef typename LhsMapper::VectorMapper LhsScalars;

  // cj/pcj fold the requested conjugations into scalar and packet multiplies.
  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  if(ConjugateRhs)
    alpha = numext::conj(alpha);

  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
  const Index columnsAtOnce = 4;
  const Index peels = 2;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index ResPacketAlignedMask = ResPacketSize-1;
//  const Index PeelAlignedMask = ResPacketSize*peels-1;
  const Index size = rows;

  const Index lhsStride = lhs.stride();

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type.
  Index alignedStart = internal::first_default_aligned(res,size);
  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  // Misalignment (in scalars) added by stepping one column of the lhs.
  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = lhs.firstAligned(size);

  // find how many columns do we have to skip to be aligned with the result (if possible)
  Index skipColumns = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
    alignmentPattern = NoneAligned;
  }
  else if(LhsPacketSize > 4)
  {
    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
    // Currently, it seems to be better to perform unaligned loads anyway
    alignmentPattern = NoneAligned;
  }
  else if (LhsPacketSize>1)
  {
  //    eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);

    while (skipColumns<LhsPacketSize &&
          alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
      ++skipColumns;
    if (skipColumns==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any column
      alignmentPattern = NoneAligned;
      skipColumns = 0;
    }
    else
    {
      skipColumns = (std::min)(skipColumns,cols);
      // note that the skipped columns are processed later.
    }

    /*    eigen_internal_assert(  (alignmentPattern==NoneAligned)
                      || (skipColumns + columnsAtOnce >= cols)
                      || LhsPacketSize > size
                      || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = size;
    alignmentPattern = AllAligned;
  }

  // In the FirstAligned/alignmentStep==1 case, columns 1 and 3 are swapped so
  // that the palign<1>/palign<2>/palign<3> shifts in the peeled loop below
  // match each mapped column's misalignment.
  const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
  const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;

  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
  {
    // Broadcast alpha*rhs for the 4 current columns.
    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
              ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
              ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
              ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));

    // this helps a lot generating better binary code
    const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0),   lhs1 = lhs.getVectorMapper(0, i+offset1),
                     lhs2 = lhs.getVectorMapper(0, i+2),   lhs3 = lhs.getVectorMapper(0, i+offset3);

    if (Vectorizable)
    {
      /* explicit vectorization */
      // process initial unaligned coeffs
      for (Index j=0; j<alignedStart; ++j)
      {
        res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
        res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
        res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
        res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if(peels>1)
            {
              // Peeled-by-2 loop: unaligned loads of lhs1/lhs2/lhs3 are emulated
              // by two aligned loads combined with palign<k> shifts.
              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
              ResPacket T0, T1;

              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);

              for (; j<peeledSize; j+=peels*ResPacketSize)
              {
                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);

                A00 = lhs0.template load<LhsPacket, Aligned>(j);
                A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
                T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
                T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));

                T0  = pcj.pmadd(A01, ptmp1, T0);
                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
                T0  = pcj.pmadd(A02, ptmp2, T0);
                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
                T0  = pcj.pmadd(A03, ptmp3, T0);
                pstore(&res[j],T0);
                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);
                T1  = pcj.pmadd(A11, ptmp1, T1);
                T1  = pcj.pmadd(A12, ptmp2, T1);
                T1  = pcj.pmadd(A13, ptmp3, T1);
                pstore(&res[j+ResPacketSize],T1);
              }
            }
            // Non-peeled tail of the aligned region.
            for (; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
            break;
        }
      }
    } // end explicit vectorization

    /* process remaining coeffs (or all if there is no explicit vectorization) */
    for (Index j=alignedSize; j<size; ++j)
    {
      res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
      res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
      res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
      res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
    }
  }

  // process remaining first and last columns (at most columnsAtOnce-1)
  // First pass handles [columnBound, cols); if columns were skipped for alignment,
  // a second pass handles [0, skipColumns).  The do/while runs at most twice.
  Index end = cols;
  Index start = columnBound;
  do
  {
    for (Index k=start; k<end; ++k)
    {
      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
      const LhsScalars lhs0 = lhs.getVectorMapper(0, k);

      if (Vectorizable)
      {
        /* explicit vectorization */
        // process first unaligned result's coeffs
        for (Index j=0; j<alignedStart; ++j)
          res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
        // process aligned result's coeffs
        if (lhs0.template aligned<LhsPacket>(alignedStart))
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
        else
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
      }

      // process remaining scalars (or all if no explicit vectorization)
      for (Index i=alignedSize; i<size; ++i)
        res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
    }
    if (skipColumns)
    {
      start = 0;
      end = skipColumns;
      skipColumns = 0;
    }
    else
      break;
  } while(Vectorizable);
  #undef _EIGEN_ACCUMULATE_PACKETS
}
323
+
324
+ /* Optimized row-major matrix * vector product:
325
+ * This algorithm processes 4 rows at once, which allows both to reduce
326
+ * the number of load/stores of the result by a factor 4 and to reduce
327
+ * the instruction dependency. Moreover, we know that all bands have the
328
+ * same alignment pattern.
329
+ *
330
+ * Mixing type logic:
331
+ * - alpha is always a complex (or converted to a complex)
332
+ * - no vectorization
333
+ */
334
// Specialization of the GEMV kernel for a row-major lhs.
// The computation (res += alpha * lhs * rhs, see the out-of-class definition of
// run() below) accumulates dot products for four lhs rows at a time.
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
  // Result scalar follows Eigen's mixed-type promotion rules (e.g. real*complex -> complex).
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

enum {
  // Explicit SIMD is only enabled when both sides are vectorizable AND their
  // packets hold the same number of scalars; otherwise all *PacketSize fall back to 1.
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
};

typedef typename packet_traits<LhsScalar>::type _LhsPacket;
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
typedef typename packet_traits<ResScalar>::type _ResPacket;

// When not vectorizable, "packets" degenerate to plain scalars so the same
// code path compiles for both cases.
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

// Performs res += alpha * lhs * rhs (row-major lhs).
// Note: unlike the column-major variant, alpha has the result scalar type here —
// see the mixing-type comment just above this struct.  rhs must have unit stride
// (asserted in the definition); resIncr is honoured.
EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsMapper& lhs,
  const RhsMapper& rhs,
  ResScalar* res, Index resIncr,
  ResScalar alpha);
};
362
+
363
// Row-major GEMV kernel: res += alpha * lhs * rhs, accumulating dot products
// for 4 rows of lhs per iteration.  The alignment-pattern machinery mirrors the
// column-major kernel above; here it is the rhs vector that is traversed packet-wise.
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsMapper& lhs,
  const RhsMapper& rhs,
  ResScalar* res, Index resIncr,
  ResScalar alpha)
{
  eigen_internal_assert(rhs.stride()==1);

  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif

  // One packet's worth of the 4 simultaneous dot products:
  //   ptmpN += lhsN[j..j+PacketSize) * rhs[j..j+PacketSize)
  // where AlignmentX selects aligned vs unaligned loads per row.
  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
    RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); \
    ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
    ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
    ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
    ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }

  // cj/pcj fold the requested conjugations into scalar and packet multiplies.
  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;

  typedef typename LhsMapper::VectorMapper LhsScalars;

  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
  const Index rowsAtOnce = 4;
  const Index peels = 2;
  const Index RhsPacketAlignedMask = RhsPacketSize-1;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index depth = cols;
  const Index lhsStride = lhs.stride();

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type
  // if that's not the case then vectorization is discarded, see below.
  Index alignedStart = rhs.firstAligned(depth);
  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  // Misalignment (in scalars) added by stepping one row of the lhs.
  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = lhs.firstAligned(depth);
  const Index rhsAlignmentOffset = rhs.firstAligned(rows);

  // find how many rows do we have to skip to be aligned with rhs (if possible)
  Index skipRows = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) ||
      (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) ||
      (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) )
  {
    alignedSize = 0;
    alignedStart = 0;
    alignmentPattern = NoneAligned;
  }
  else if(LhsPacketSize > 4)
  {
    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
    alignmentPattern = NoneAligned;
  }
  else if (LhsPacketSize>1)
  {
  //    eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0  || depth<LhsPacketSize);

    while (skipRows<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
      ++skipRows;
    if (skipRows==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any row
      alignmentPattern = NoneAligned;
      skipRows = 0;
    }
    else
    {
      skipRows = (std::min)(skipRows,Index(rows));
      // note that the skipped rows are processed later.
    }
    /*    eigen_internal_assert(  alignmentPattern==NoneAligned
                      || LhsPacketSize==1
                      || (skipRows + rowsAtOnce >= rows)
                      || LhsPacketSize > depth
                      || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = depth;
    alignmentPattern = AllAligned;
  }

  // In the FirstAligned/alignmentStep==1 case, rows 1 and 3 are swapped so that
  // the palign<1>/palign<2>/palign<3> shifts in the peeled loop below match
  // each mapped row's misalignment.
  const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
  const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;

  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
  {
    // FIXME: what is the purpose of this EIGEN_ALIGN_DEFAULT ??
    EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

    // this helps the compiler generating good binary code
    const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0),    lhs1 = lhs.getVectorMapper(i+offset1, 0),
                     lhs2 = lhs.getVectorMapper(i+2, 0),    lhs3 = lhs.getVectorMapper(i+offset3, 0);

    if (Vectorizable)
    {
      /* explicit vectorization */
      ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
                ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));

      // process initial unaligned coeffs
      // FIXME this loop gets vectorized by the compiler !
      for (Index j=0; j<alignedStart; ++j)
      {
        RhsScalar b = rhs(j, 0);
        tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
        tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if (peels>1)
            {
              /* Here we process 4 rows with two peeled iterations to hide
               * the overhead of unaligned loads. Moreover unaligned loads are handled
               * using special shift/move operations between the two aligned packets
               * overlapping the desired unaligned packet. This is *much* more efficient
               * than basic unaligned loads.
               */
              LhsPacket A01, A02, A03, A11, A12, A13;
              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);

              for (; j<peeledSize; j+=peels*RhsPacketSize)
              {
                RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);
                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);

                ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0);
                ptmp1 = pcj.pmadd(A01, b, ptmp1);
                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
                ptmp2 = pcj.pmadd(A02, b, ptmp2);
                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
                ptmp3 = pcj.pmadd(A03, b, ptmp3);
                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);

                b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0);
                ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0);
                ptmp1 = pcj.pmadd(A11, b, ptmp1);
                ptmp2 = pcj.pmadd(A12, b, ptmp2);
                ptmp3 = pcj.pmadd(A13, b, ptmp3);
              }
            }
            // Non-peeled tail of the aligned region.
            for (; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
            break;
        }
        // Horizontal reduction of the packet accumulators into the scalar sums.
        tmp0 += predux(ptmp0);
        tmp1 += predux(ptmp1);
        tmp2 += predux(ptmp2);
        tmp3 += predux(ptmp3);
      }
    } // end explicit vectorization

    // process remaining coeffs (or all if no explicit vectorization)
    // FIXME this loop gets vectorized by the compiler !
    for (Index j=alignedSize; j<depth; ++j)
    {
      RhsScalar b = rhs(j, 0);
      tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
      tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
    }
    res[i*resIncr]           += alpha*tmp0;
    res[(i+offset1)*resIncr] += alpha*tmp1;
    res[(i+2)*resIncr]       += alpha*tmp2;
    res[(i+offset3)*resIncr] += alpha*tmp3;
  }

  // process remaining first and last rows (at most rowsAtOnce-1)
  // First pass handles [rowBound, rows); if rows were skipped for alignment,
  // a second pass handles [0, skipRows).  The do/while runs at most twice.
  Index end = rows;
  Index start = rowBound;
  do
  {
    for (Index i=start; i<end; ++i)
    {
      EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
      const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
      // process first unaligned result's coeffs
      // FIXME this loop gets vectorized by the compiler !
      for (Index j=0; j<alignedStart; ++j)
        tmp0 += cj.pmul(lhs0(j), rhs(j, 0));

      if (alignedSize>alignedStart)
      {
        // process aligned rhs coeffs
        if (lhs0.template aligned<LhsPacket>(alignedStart))
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
        else
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
        tmp0 += predux(ptmp0);
      }

      // process remaining scalars
      // FIXME this loop gets vectorized by the compiler !
      for (Index j=alignedSize; j<depth; ++j)
        tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
      res[i*resIncr] += alpha*tmp0;
    }
    if (skipRows)
    {
      start = 0;
      end = skipRows;
      skipRows = 0;
    }
    else
      break;
  } while(Vectorizable);

  #undef _EIGEN_ACCUMULATE_PACKETS
}
614
+
615
+ } // end namespace internal
616
+
617
+ } // end namespace Eigen
618
+
619
+ #endif // EIGEN_GENERAL_MATRIX_VECTOR_H