tomoto 0.1.0

Files changed (420)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +123 -0
  5. data/ext/tomoto/ext.cpp +245 -0
  6. data/ext/tomoto/extconf.rb +28 -0
  7. data/lib/tomoto.rb +12 -0
  8. data/lib/tomoto/ct.rb +11 -0
  9. data/lib/tomoto/hdp.rb +11 -0
  10. data/lib/tomoto/lda.rb +67 -0
  11. data/lib/tomoto/version.rb +3 -0
  12. data/vendor/EigenRand/EigenRand/Core.h +1139 -0
  13. data/vendor/EigenRand/EigenRand/Dists/Basic.h +111 -0
  14. data/vendor/EigenRand/EigenRand/Dists/Discrete.h +877 -0
  15. data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +108 -0
  16. data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +626 -0
  17. data/vendor/EigenRand/EigenRand/EigenRand +19 -0
  18. data/vendor/EigenRand/EigenRand/Macro.h +24 -0
  19. data/vendor/EigenRand/EigenRand/MorePacketMath.h +978 -0
  20. data/vendor/EigenRand/EigenRand/PacketFilter.h +286 -0
  21. data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +624 -0
  22. data/vendor/EigenRand/EigenRand/RandUtils.h +413 -0
  23. data/vendor/EigenRand/EigenRand/doc.h +220 -0
  24. data/vendor/EigenRand/LICENSE +21 -0
  25. data/vendor/EigenRand/README.md +288 -0
  26. data/vendor/eigen/COPYING.BSD +26 -0
  27. data/vendor/eigen/COPYING.GPL +674 -0
  28. data/vendor/eigen/COPYING.LGPL +502 -0
  29. data/vendor/eigen/COPYING.MINPACK +52 -0
  30. data/vendor/eigen/COPYING.MPL2 +373 -0
  31. data/vendor/eigen/COPYING.README +18 -0
  32. data/vendor/eigen/Eigen/CMakeLists.txt +19 -0
  33. data/vendor/eigen/Eigen/Cholesky +46 -0
  34. data/vendor/eigen/Eigen/CholmodSupport +48 -0
  35. data/vendor/eigen/Eigen/Core +537 -0
  36. data/vendor/eigen/Eigen/Dense +7 -0
  37. data/vendor/eigen/Eigen/Eigen +2 -0
  38. data/vendor/eigen/Eigen/Eigenvalues +61 -0
  39. data/vendor/eigen/Eigen/Geometry +62 -0
  40. data/vendor/eigen/Eigen/Householder +30 -0
  41. data/vendor/eigen/Eigen/IterativeLinearSolvers +48 -0
  42. data/vendor/eigen/Eigen/Jacobi +33 -0
  43. data/vendor/eigen/Eigen/LU +50 -0
  44. data/vendor/eigen/Eigen/MetisSupport +35 -0
  45. data/vendor/eigen/Eigen/OrderingMethods +73 -0
  46. data/vendor/eigen/Eigen/PaStiXSupport +48 -0
  47. data/vendor/eigen/Eigen/PardisoSupport +35 -0
  48. data/vendor/eigen/Eigen/QR +51 -0
  49. data/vendor/eigen/Eigen/QtAlignedMalloc +40 -0
  50. data/vendor/eigen/Eigen/SPQRSupport +34 -0
  51. data/vendor/eigen/Eigen/SVD +51 -0
  52. data/vendor/eigen/Eigen/Sparse +36 -0
  53. data/vendor/eigen/Eigen/SparseCholesky +45 -0
  54. data/vendor/eigen/Eigen/SparseCore +69 -0
  55. data/vendor/eigen/Eigen/SparseLU +46 -0
  56. data/vendor/eigen/Eigen/SparseQR +37 -0
  57. data/vendor/eigen/Eigen/StdDeque +27 -0
  58. data/vendor/eigen/Eigen/StdList +26 -0
  59. data/vendor/eigen/Eigen/StdVector +27 -0
  60. data/vendor/eigen/Eigen/SuperLUSupport +64 -0
  61. data/vendor/eigen/Eigen/UmfPackSupport +40 -0
  62. data/vendor/eigen/Eigen/src/Cholesky/LDLT.h +673 -0
  63. data/vendor/eigen/Eigen/src/Cholesky/LLT.h +542 -0
  64. data/vendor/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +99 -0
  65. data/vendor/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +639 -0
  66. data/vendor/eigen/Eigen/src/Core/Array.h +329 -0
  67. data/vendor/eigen/Eigen/src/Core/ArrayBase.h +226 -0
  68. data/vendor/eigen/Eigen/src/Core/ArrayWrapper.h +209 -0
  69. data/vendor/eigen/Eigen/src/Core/Assign.h +90 -0
  70. data/vendor/eigen/Eigen/src/Core/AssignEvaluator.h +935 -0
  71. data/vendor/eigen/Eigen/src/Core/Assign_MKL.h +178 -0
  72. data/vendor/eigen/Eigen/src/Core/BandMatrix.h +353 -0
  73. data/vendor/eigen/Eigen/src/Core/Block.h +452 -0
  74. data/vendor/eigen/Eigen/src/Core/BooleanRedux.h +164 -0
  75. data/vendor/eigen/Eigen/src/Core/CommaInitializer.h +160 -0
  76. data/vendor/eigen/Eigen/src/Core/ConditionEstimator.h +175 -0
  77. data/vendor/eigen/Eigen/src/Core/CoreEvaluators.h +1688 -0
  78. data/vendor/eigen/Eigen/src/Core/CoreIterators.h +127 -0
  79. data/vendor/eigen/Eigen/src/Core/CwiseBinaryOp.h +184 -0
  80. data/vendor/eigen/Eigen/src/Core/CwiseNullaryOp.h +866 -0
  81. data/vendor/eigen/Eigen/src/Core/CwiseTernaryOp.h +197 -0
  82. data/vendor/eigen/Eigen/src/Core/CwiseUnaryOp.h +103 -0
  83. data/vendor/eigen/Eigen/src/Core/CwiseUnaryView.h +128 -0
  84. data/vendor/eigen/Eigen/src/Core/DenseBase.h +611 -0
  85. data/vendor/eigen/Eigen/src/Core/DenseCoeffsBase.h +681 -0
  86. data/vendor/eigen/Eigen/src/Core/DenseStorage.h +570 -0
  87. data/vendor/eigen/Eigen/src/Core/Diagonal.h +260 -0
  88. data/vendor/eigen/Eigen/src/Core/DiagonalMatrix.h +343 -0
  89. data/vendor/eigen/Eigen/src/Core/DiagonalProduct.h +28 -0
  90. data/vendor/eigen/Eigen/src/Core/Dot.h +318 -0
  91. data/vendor/eigen/Eigen/src/Core/EigenBase.h +159 -0
  92. data/vendor/eigen/Eigen/src/Core/ForceAlignedAccess.h +146 -0
  93. data/vendor/eigen/Eigen/src/Core/Fuzzy.h +155 -0
  94. data/vendor/eigen/Eigen/src/Core/GeneralProduct.h +455 -0
  95. data/vendor/eigen/Eigen/src/Core/GenericPacketMath.h +593 -0
  96. data/vendor/eigen/Eigen/src/Core/GlobalFunctions.h +187 -0
  97. data/vendor/eigen/Eigen/src/Core/IO.h +225 -0
  98. data/vendor/eigen/Eigen/src/Core/Inverse.h +118 -0
  99. data/vendor/eigen/Eigen/src/Core/Map.h +171 -0
  100. data/vendor/eigen/Eigen/src/Core/MapBase.h +303 -0
  101. data/vendor/eigen/Eigen/src/Core/MathFunctions.h +1415 -0
  102. data/vendor/eigen/Eigen/src/Core/MathFunctionsImpl.h +101 -0
  103. data/vendor/eigen/Eigen/src/Core/Matrix.h +459 -0
  104. data/vendor/eigen/Eigen/src/Core/MatrixBase.h +529 -0
  105. data/vendor/eigen/Eigen/src/Core/NestByValue.h +110 -0
  106. data/vendor/eigen/Eigen/src/Core/NoAlias.h +108 -0
  107. data/vendor/eigen/Eigen/src/Core/NumTraits.h +248 -0
  108. data/vendor/eigen/Eigen/src/Core/PermutationMatrix.h +633 -0
  109. data/vendor/eigen/Eigen/src/Core/PlainObjectBase.h +1035 -0
  110. data/vendor/eigen/Eigen/src/Core/Product.h +186 -0
  111. data/vendor/eigen/Eigen/src/Core/ProductEvaluators.h +1112 -0
  112. data/vendor/eigen/Eigen/src/Core/Random.h +182 -0
  113. data/vendor/eigen/Eigen/src/Core/Redux.h +505 -0
  114. data/vendor/eigen/Eigen/src/Core/Ref.h +283 -0
  115. data/vendor/eigen/Eigen/src/Core/Replicate.h +142 -0
  116. data/vendor/eigen/Eigen/src/Core/ReturnByValue.h +117 -0
  117. data/vendor/eigen/Eigen/src/Core/Reverse.h +211 -0
  118. data/vendor/eigen/Eigen/src/Core/Select.h +162 -0
  119. data/vendor/eigen/Eigen/src/Core/SelfAdjointView.h +352 -0
  120. data/vendor/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +47 -0
  121. data/vendor/eigen/Eigen/src/Core/Solve.h +188 -0
  122. data/vendor/eigen/Eigen/src/Core/SolveTriangular.h +235 -0
  123. data/vendor/eigen/Eigen/src/Core/SolverBase.h +130 -0
  124. data/vendor/eigen/Eigen/src/Core/StableNorm.h +221 -0
  125. data/vendor/eigen/Eigen/src/Core/Stride.h +111 -0
  126. data/vendor/eigen/Eigen/src/Core/Swap.h +67 -0
  127. data/vendor/eigen/Eigen/src/Core/Transpose.h +403 -0
  128. data/vendor/eigen/Eigen/src/Core/Transpositions.h +407 -0
  129. data/vendor/eigen/Eigen/src/Core/TriangularMatrix.h +983 -0
  130. data/vendor/eigen/Eigen/src/Core/VectorBlock.h +96 -0
  131. data/vendor/eigen/Eigen/src/Core/VectorwiseOp.h +695 -0
  132. data/vendor/eigen/Eigen/src/Core/Visitor.h +273 -0
  133. data/vendor/eigen/Eigen/src/Core/arch/AVX/Complex.h +451 -0
  134. data/vendor/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +439 -0
  135. data/vendor/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +637 -0
  136. data/vendor/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +51 -0
  137. data/vendor/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +391 -0
  138. data/vendor/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1316 -0
  139. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +430 -0
  140. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +322 -0
  141. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +1061 -0
  142. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Complex.h +103 -0
  143. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Half.h +674 -0
  144. data/vendor/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +91 -0
  145. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +333 -0
  146. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +1124 -0
  147. data/vendor/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +212 -0
  148. data/vendor/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +29 -0
  149. data/vendor/eigen/Eigen/src/Core/arch/Default/Settings.h +49 -0
  150. data/vendor/eigen/Eigen/src/Core/arch/NEON/Complex.h +490 -0
  151. data/vendor/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +91 -0
  152. data/vendor/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +760 -0
  153. data/vendor/eigen/Eigen/src/Core/arch/SSE/Complex.h +471 -0
  154. data/vendor/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +562 -0
  155. data/vendor/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +895 -0
  156. data/vendor/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +77 -0
  157. data/vendor/eigen/Eigen/src/Core/arch/ZVector/Complex.h +397 -0
  158. data/vendor/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +137 -0
  159. data/vendor/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +945 -0
  160. data/vendor/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +168 -0
  161. data/vendor/eigen/Eigen/src/Core/functors/BinaryFunctors.h +475 -0
  162. data/vendor/eigen/Eigen/src/Core/functors/NullaryFunctors.h +188 -0
  163. data/vendor/eigen/Eigen/src/Core/functors/StlFunctors.h +136 -0
  164. data/vendor/eigen/Eigen/src/Core/functors/TernaryFunctors.h +25 -0
  165. data/vendor/eigen/Eigen/src/Core/functors/UnaryFunctors.h +792 -0
  166. data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2156 -0
  167. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +492 -0
  168. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +311 -0
  169. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +145 -0
  170. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +122 -0
  171. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +619 -0
  172. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +136 -0
  173. data/vendor/eigen/Eigen/src/Core/products/Parallelizer.h +163 -0
  174. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +521 -0
  175. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +287 -0
  176. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +260 -0
  177. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +118 -0
  178. data/vendor/eigen/Eigen/src/Core/products/SelfadjointProduct.h +133 -0
  179. data/vendor/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +93 -0
  180. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +466 -0
  181. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +315 -0
  182. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +350 -0
  183. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +255 -0
  184. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +335 -0
  185. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +163 -0
  186. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverVector.h +145 -0
  187. data/vendor/eigen/Eigen/src/Core/util/BlasUtil.h +398 -0
  188. data/vendor/eigen/Eigen/src/Core/util/Constants.h +547 -0
  189. data/vendor/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +83 -0
  190. data/vendor/eigen/Eigen/src/Core/util/ForwardDeclarations.h +302 -0
  191. data/vendor/eigen/Eigen/src/Core/util/MKL_support.h +130 -0
  192. data/vendor/eigen/Eigen/src/Core/util/Macros.h +1001 -0
  193. data/vendor/eigen/Eigen/src/Core/util/Memory.h +993 -0
  194. data/vendor/eigen/Eigen/src/Core/util/Meta.h +534 -0
  195. data/vendor/eigen/Eigen/src/Core/util/NonMPL2.h +3 -0
  196. data/vendor/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +27 -0
  197. data/vendor/eigen/Eigen/src/Core/util/StaticAssert.h +218 -0
  198. data/vendor/eigen/Eigen/src/Core/util/XprHelper.h +821 -0
  199. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +346 -0
  200. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +459 -0
  201. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +91 -0
  202. data/vendor/eigen/Eigen/src/Eigenvalues/EigenSolver.h +622 -0
  203. data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +418 -0
  204. data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +226 -0
  205. data/vendor/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +374 -0
  206. data/vendor/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +158 -0
  207. data/vendor/eigen/Eigen/src/Eigenvalues/RealQZ.h +654 -0
  208. data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur.h +546 -0
  209. data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +77 -0
  210. data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +870 -0
  211. data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +87 -0
  212. data/vendor/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +556 -0
  213. data/vendor/eigen/Eigen/src/Geometry/AlignedBox.h +392 -0
  214. data/vendor/eigen/Eigen/src/Geometry/AngleAxis.h +247 -0
  215. data/vendor/eigen/Eigen/src/Geometry/EulerAngles.h +114 -0
  216. data/vendor/eigen/Eigen/src/Geometry/Homogeneous.h +497 -0
  217. data/vendor/eigen/Eigen/src/Geometry/Hyperplane.h +282 -0
  218. data/vendor/eigen/Eigen/src/Geometry/OrthoMethods.h +234 -0
  219. data/vendor/eigen/Eigen/src/Geometry/ParametrizedLine.h +195 -0
  220. data/vendor/eigen/Eigen/src/Geometry/Quaternion.h +814 -0
  221. data/vendor/eigen/Eigen/src/Geometry/Rotation2D.h +199 -0
  222. data/vendor/eigen/Eigen/src/Geometry/RotationBase.h +206 -0
  223. data/vendor/eigen/Eigen/src/Geometry/Scaling.h +170 -0
  224. data/vendor/eigen/Eigen/src/Geometry/Transform.h +1542 -0
  225. data/vendor/eigen/Eigen/src/Geometry/Translation.h +208 -0
  226. data/vendor/eigen/Eigen/src/Geometry/Umeyama.h +166 -0
  227. data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +161 -0
  228. data/vendor/eigen/Eigen/src/Householder/BlockHouseholder.h +103 -0
  229. data/vendor/eigen/Eigen/src/Householder/Householder.h +172 -0
  230. data/vendor/eigen/Eigen/src/Householder/HouseholderSequence.h +470 -0
  231. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +226 -0
  232. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +228 -0
  233. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +246 -0
  234. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +400 -0
  235. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +462 -0
  236. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +394 -0
  237. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +216 -0
  238. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +115 -0
  239. data/vendor/eigen/Eigen/src/Jacobi/Jacobi.h +462 -0
  240. data/vendor/eigen/Eigen/src/LU/Determinant.h +101 -0
  241. data/vendor/eigen/Eigen/src/LU/FullPivLU.h +891 -0
  242. data/vendor/eigen/Eigen/src/LU/InverseImpl.h +415 -0
  243. data/vendor/eigen/Eigen/src/LU/PartialPivLU.h +611 -0
  244. data/vendor/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +83 -0
  245. data/vendor/eigen/Eigen/src/LU/arch/Inverse_SSE.h +338 -0
  246. data/vendor/eigen/Eigen/src/MetisSupport/MetisSupport.h +137 -0
  247. data/vendor/eigen/Eigen/src/OrderingMethods/Amd.h +445 -0
  248. data/vendor/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +1843 -0
  249. data/vendor/eigen/Eigen/src/OrderingMethods/Ordering.h +157 -0
  250. data/vendor/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +678 -0
  251. data/vendor/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +543 -0
  252. data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR.h +653 -0
  253. data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +97 -0
  254. data/vendor/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +562 -0
  255. data/vendor/eigen/Eigen/src/QR/FullPivHouseholderQR.h +676 -0
  256. data/vendor/eigen/Eigen/src/QR/HouseholderQR.h +409 -0
  257. data/vendor/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +68 -0
  258. data/vendor/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +313 -0
  259. data/vendor/eigen/Eigen/src/SVD/BDCSVD.h +1246 -0
  260. data/vendor/eigen/Eigen/src/SVD/JacobiSVD.h +804 -0
  261. data/vendor/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +91 -0
  262. data/vendor/eigen/Eigen/src/SVD/SVDBase.h +315 -0
  263. data/vendor/eigen/Eigen/src/SVD/UpperBidiagonalization.h +414 -0
  264. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +689 -0
  265. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +199 -0
  266. data/vendor/eigen/Eigen/src/SparseCore/AmbiVector.h +377 -0
  267. data/vendor/eigen/Eigen/src/SparseCore/CompressedStorage.h +258 -0
  268. data/vendor/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +352 -0
  269. data/vendor/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +67 -0
  270. data/vendor/eigen/Eigen/src/SparseCore/SparseAssign.h +216 -0
  271. data/vendor/eigen/Eigen/src/SparseCore/SparseBlock.h +603 -0
  272. data/vendor/eigen/Eigen/src/SparseCore/SparseColEtree.h +206 -0
  273. data/vendor/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +341 -0
  274. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +726 -0
  275. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +148 -0
  276. data/vendor/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +320 -0
  277. data/vendor/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +138 -0
  278. data/vendor/eigen/Eigen/src/SparseCore/SparseDot.h +98 -0
  279. data/vendor/eigen/Eigen/src/SparseCore/SparseFuzzy.h +29 -0
  280. data/vendor/eigen/Eigen/src/SparseCore/SparseMap.h +305 -0
  281. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrix.h +1403 -0
  282. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +405 -0
  283. data/vendor/eigen/Eigen/src/SparseCore/SparsePermutation.h +178 -0
  284. data/vendor/eigen/Eigen/src/SparseCore/SparseProduct.h +169 -0
  285. data/vendor/eigen/Eigen/src/SparseCore/SparseRedux.h +49 -0
  286. data/vendor/eigen/Eigen/src/SparseCore/SparseRef.h +397 -0
  287. data/vendor/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +656 -0
  288. data/vendor/eigen/Eigen/src/SparseCore/SparseSolverBase.h +124 -0
  289. data/vendor/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +198 -0
  290. data/vendor/eigen/Eigen/src/SparseCore/SparseTranspose.h +92 -0
  291. data/vendor/eigen/Eigen/src/SparseCore/SparseTriangularView.h +189 -0
  292. data/vendor/eigen/Eigen/src/SparseCore/SparseUtil.h +178 -0
  293. data/vendor/eigen/Eigen/src/SparseCore/SparseVector.h +478 -0
  294. data/vendor/eigen/Eigen/src/SparseCore/SparseView.h +253 -0
  295. data/vendor/eigen/Eigen/src/SparseCore/TriangularSolver.h +315 -0
  296. data/vendor/eigen/Eigen/src/SparseLU/SparseLU.h +773 -0
  297. data/vendor/eigen/Eigen/src/SparseLU/SparseLUImpl.h +66 -0
  298. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +226 -0
  299. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +110 -0
  300. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +301 -0
  301. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +80 -0
  302. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +181 -0
  303. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +179 -0
  304. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +107 -0
  305. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +280 -0
  306. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +126 -0
  307. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +130 -0
  308. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +223 -0
  309. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +258 -0
  310. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +137 -0
  311. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +136 -0
  312. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +83 -0
  313. data/vendor/eigen/Eigen/src/SparseQR/SparseQR.h +745 -0
  314. data/vendor/eigen/Eigen/src/StlSupport/StdDeque.h +126 -0
  315. data/vendor/eigen/Eigen/src/StlSupport/StdList.h +106 -0
  316. data/vendor/eigen/Eigen/src/StlSupport/StdVector.h +131 -0
  317. data/vendor/eigen/Eigen/src/StlSupport/details.h +84 -0
  318. data/vendor/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +1027 -0
  319. data/vendor/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +506 -0
  320. data/vendor/eigen/Eigen/src/misc/Image.h +82 -0
  321. data/vendor/eigen/Eigen/src/misc/Kernel.h +79 -0
  322. data/vendor/eigen/Eigen/src/misc/RealSvd2x2.h +55 -0
  323. data/vendor/eigen/Eigen/src/misc/blas.h +440 -0
  324. data/vendor/eigen/Eigen/src/misc/lapack.h +152 -0
  325. data/vendor/eigen/Eigen/src/misc/lapacke.h +16291 -0
  326. data/vendor/eigen/Eigen/src/misc/lapacke_mangling.h +17 -0
  327. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +332 -0
  328. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +552 -0
  329. data/vendor/eigen/Eigen/src/plugins/BlockMethods.h +1058 -0
  330. data/vendor/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +115 -0
  331. data/vendor/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +163 -0
  332. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +152 -0
  333. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +85 -0
  334. data/vendor/eigen/README.md +3 -0
  335. data/vendor/eigen/bench/README.txt +55 -0
  336. data/vendor/eigen/bench/btl/COPYING +340 -0
  337. data/vendor/eigen/bench/btl/README +154 -0
  338. data/vendor/eigen/bench/tensors/README +21 -0
  339. data/vendor/eigen/blas/README.txt +6 -0
  340. data/vendor/eigen/demos/mandelbrot/README +10 -0
  341. data/vendor/eigen/demos/mix_eigen_and_c/README +9 -0
  342. data/vendor/eigen/demos/opengl/README +13 -0
  343. data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +1760 -0
  344. data/vendor/eigen/unsupported/README.txt +50 -0
  345. data/vendor/tomotopy/LICENSE +21 -0
  346. data/vendor/tomotopy/README.kr.rst +375 -0
  347. data/vendor/tomotopy/README.rst +382 -0
  348. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +362 -0
  349. data/vendor/tomotopy/src/Labeling/FoRelevance.h +88 -0
  350. data/vendor/tomotopy/src/Labeling/Labeler.h +50 -0
  351. data/vendor/tomotopy/src/TopicModel/CT.h +37 -0
  352. data/vendor/tomotopy/src/TopicModel/CTModel.cpp +13 -0
  353. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +293 -0
  354. data/vendor/tomotopy/src/TopicModel/DMR.h +51 -0
  355. data/vendor/tomotopy/src/TopicModel/DMRModel.cpp +13 -0
  356. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +374 -0
  357. data/vendor/tomotopy/src/TopicModel/DT.h +65 -0
  358. data/vendor/tomotopy/src/TopicModel/DTM.h +22 -0
  359. data/vendor/tomotopy/src/TopicModel/DTModel.cpp +15 -0
  360. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +572 -0
  361. data/vendor/tomotopy/src/TopicModel/GDMR.h +37 -0
  362. data/vendor/tomotopy/src/TopicModel/GDMRModel.cpp +14 -0
  363. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +485 -0
  364. data/vendor/tomotopy/src/TopicModel/HDP.h +74 -0
  365. data/vendor/tomotopy/src/TopicModel/HDPModel.cpp +13 -0
  366. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +592 -0
  367. data/vendor/tomotopy/src/TopicModel/HLDA.h +40 -0
  368. data/vendor/tomotopy/src/TopicModel/HLDAModel.cpp +13 -0
  369. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +681 -0
  370. data/vendor/tomotopy/src/TopicModel/HPA.h +27 -0
  371. data/vendor/tomotopy/src/TopicModel/HPAModel.cpp +21 -0
  372. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +588 -0
  373. data/vendor/tomotopy/src/TopicModel/LDA.h +144 -0
  374. data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +442 -0
  375. data/vendor/tomotopy/src/TopicModel/LDAModel.cpp +13 -0
  376. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +1058 -0
  377. data/vendor/tomotopy/src/TopicModel/LLDA.h +45 -0
  378. data/vendor/tomotopy/src/TopicModel/LLDAModel.cpp +13 -0
  379. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +203 -0
  380. data/vendor/tomotopy/src/TopicModel/MGLDA.h +63 -0
  381. data/vendor/tomotopy/src/TopicModel/MGLDAModel.cpp +17 -0
  382. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +558 -0
  383. data/vendor/tomotopy/src/TopicModel/PA.h +43 -0
  384. data/vendor/tomotopy/src/TopicModel/PAModel.cpp +13 -0
  385. data/vendor/tomotopy/src/TopicModel/PAModel.hpp +467 -0
  386. data/vendor/tomotopy/src/TopicModel/PLDA.h +17 -0
  387. data/vendor/tomotopy/src/TopicModel/PLDAModel.cpp +13 -0
  388. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +214 -0
  389. data/vendor/tomotopy/src/TopicModel/SLDA.h +54 -0
  390. data/vendor/tomotopy/src/TopicModel/SLDAModel.cpp +17 -0
  391. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +456 -0
  392. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +692 -0
  393. data/vendor/tomotopy/src/Utils/AliasMethod.hpp +169 -0
  394. data/vendor/tomotopy/src/Utils/Dictionary.h +80 -0
  395. data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +181 -0
  396. data/vendor/tomotopy/src/Utils/LBFGS.h +202 -0
  397. data/vendor/tomotopy/src/Utils/LBFGS/LineSearchBacktracking.h +120 -0
  398. data/vendor/tomotopy/src/Utils/LBFGS/LineSearchBracketing.h +122 -0
  399. data/vendor/tomotopy/src/Utils/LBFGS/Param.h +213 -0
  400. data/vendor/tomotopy/src/Utils/LUT.hpp +82 -0
  401. data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +69 -0
  402. data/vendor/tomotopy/src/Utils/PolyaGamma.hpp +200 -0
  403. data/vendor/tomotopy/src/Utils/PolyaGammaHybrid.hpp +672 -0
  404. data/vendor/tomotopy/src/Utils/ThreadPool.hpp +150 -0
  405. data/vendor/tomotopy/src/Utils/Trie.hpp +220 -0
  406. data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +94 -0
  407. data/vendor/tomotopy/src/Utils/Utils.hpp +337 -0
  408. data/vendor/tomotopy/src/Utils/avx_gamma.h +46 -0
  409. data/vendor/tomotopy/src/Utils/avx_mathfun.h +736 -0
  410. data/vendor/tomotopy/src/Utils/exception.h +28 -0
  411. data/vendor/tomotopy/src/Utils/math.h +281 -0
  412. data/vendor/tomotopy/src/Utils/rtnorm.hpp +2690 -0
  413. data/vendor/tomotopy/src/Utils/sample.hpp +192 -0
  414. data/vendor/tomotopy/src/Utils/serializer.hpp +695 -0
  415. data/vendor/tomotopy/src/Utils/slp.hpp +131 -0
  416. data/vendor/tomotopy/src/Utils/sse_gamma.h +48 -0
  417. data/vendor/tomotopy/src/Utils/sse_mathfun.h +710 -0
  418. data/vendor/tomotopy/src/Utils/text.hpp +49 -0
  419. data/vendor/tomotopy/src/Utils/tvector.hpp +543 -0
  420. metadata +531 -0
@@ -0,0 +1,2156 @@
+ // This file is part of Eigen, a lightweight C++ template library
+ // for linear algebra.
+ //
+ // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+ //
+ // This Source Code Form is subject to the terms of the Mozilla
+ // Public License v. 2.0. If a copy of the MPL was not distributed
+ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+ #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
+ #define EIGEN_GENERAL_BLOCK_PANEL_H
+
+
+ namespace Eigen {
+
+ namespace internal {
+
+ template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
+ class gebp_traits;
+
+
+ /** \internal \returns b if a<=0, and returns a otherwise. */
+ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
+ {
+ return a<=0 ? b : a;
+ }
+
+ #if EIGEN_ARCH_i386_OR_x86_64
+ const std::ptrdiff_t defaultL1CacheSize = 32*1024;
+ const std::ptrdiff_t defaultL2CacheSize = 256*1024;
+ const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
+ #else
+ const std::ptrdiff_t defaultL1CacheSize = 16*1024;
+ const std::ptrdiff_t defaultL2CacheSize = 512*1024;
+ const std::ptrdiff_t defaultL3CacheSize = 512*1024;
+ #endif
+
+ /** \internal */
+ struct CacheSizes {
+ CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
+ int l1CacheSize, l2CacheSize, l3CacheSize;
+ queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
+ m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
+ m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
+ m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
+ }
+
+ std::ptrdiff_t m_l1;
+ std::ptrdiff_t m_l2;
+ std::ptrdiff_t m_l3;
+ };
+
+
+ /** \internal */
+ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
+ {
+ static CacheSizes m_cacheSizes;
+
+ if(action==SetAction)
+ {
+ // set the cpu cache size and cache all block sizes from a global cache size in bytes
+ eigen_internal_assert(l1!=0 && l2!=0);
+ m_cacheSizes.m_l1 = *l1;
+ m_cacheSizes.m_l2 = *l2;
+ m_cacheSizes.m_l3 = *l3;
+ }
+ else if(action==GetAction)
+ {
+ eigen_internal_assert(l1!=0 && l2!=0);
+ *l1 = m_cacheSizes.m_l1;
+ *l2 = m_cacheSizes.m_l2;
+ *l3 = m_cacheSizes.m_l3;
+ }
+ else
+ {
+ eigen_internal_assert(false);
+ }
+ }
+
+ /* Helper for computeProductBlockingSizes.
+ *
+ * Given an m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
+ * this function computes the blocking size parameters along the respective dimensions
+ * for matrix products and related algorithms. The blocking sizes depend on various
+ * parameters:
+ * - the L1 and L2 cache sizes,
+ * - the register level blocking sizes defined by gebp_traits,
+ * - the number of scalars that fit into a packet (when vectorization is enabled).
+ *
+ * \sa setCpuCacheSizes */
+
+ template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
+ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
+ {
+ typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+ // Explanations:
+ // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
+ // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed
+ // per mr x kc horizontal small panels where mr is the blocking size along the m dimension
+ // at the register level. This small horizontal panel has to stay within L1 cache.
+ std::ptrdiff_t l1, l2, l3;
+ manage_caching_sizes(GetAction, &l1, &l2, &l3);
+
+ if (num_threads > 1) {
+ typedef typename Traits::ResScalar ResScalar;
+ enum {
+ kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+ ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
+ kr = 8,
+ mr = Traits::mr,
+ nr = Traits::nr
+ };
+ // Increasing k gives us more time to prefetch the content of the "C"
+ // registers. However once the latency is hidden there is no point in
+ // increasing the value of k, so we'll cap it at 320 (value determined
+ // experimentally).
+ const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
+ if (k_cache < k) {
+ k = k_cache - (k_cache % kr);
+ eigen_internal_assert(k > 0);
+ }
+
+ const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+ const Index n_per_thread = numext::div_ceil(n, num_threads);
+ if (n_cache <= n_per_thread) {
+ // Don't exceed the capacity of the l2 cache.
+ eigen_internal_assert(n_cache >= static_cast<Index>(nr));
+ n = n_cache - (n_cache % nr);
+ eigen_internal_assert(n > 0);
+ } else {
+ n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
+ }
+
+ if (l3 > l2) {
+ // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
+ const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+ const Index m_per_thread = numext::div_ceil(m, num_threads);
+ if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
+ m = m_cache - (m_cache % mr);
+ eigen_internal_assert(m > 0);
+ } else {
+ m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
+ }
+ }
+ }
+ else {
+ // In unit tests we do not want to use extra large matrices,
+ // so we reduce the cache size to check the blocking strategy is not flawed
+ #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+ l1 = 9*1024;
+ l2 = 32*1024;
+ l3 = 512*1024;
+ #endif
+
+ // Early return for small problems because the computations below are time consuming.
+ // Perhaps it would make more sense to consider k*n*m??
+ // Note that for very tiny problems, this function should be bypassed anyway
+ // because we use the coefficient-based implementation for them.
+ if((numext::maxi)(k,(numext::maxi)(m,n))<48)
+ return;
+
+ typedef typename Traits::ResScalar ResScalar;
+ enum {
+ k_peeling = 8,
+ k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+ k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
+ };
+
+ // ---- 1st level of blocking on L1, yields kc ----
+
+ // Blocking on the third dimension (i.e., k) is chosen so that a horizontal panel
+ // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fit within L1 cache.
+ // We also include a register-level block of the result (mr x nr).
+ // (In an ideal world only the lhs panel would stay in L1)
+ // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
+ const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
+ const Index old_k = k;
+ if(k>max_kc)
+ {
+ // We are really blocking on the third dimension:
+ // -> reduce blocking size to make sure the last block is as large as possible
+ // while keeping the same number of sweeps over the result.
+ k = (k%max_kc)==0 ? max_kc
+ : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
+
+ eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
+ }
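To make the reduction above concrete (a worked example, not from the file): with k = 1000, max_kc = 320 and k_peeling = 8, we get k % max_kc = 40 and k/max_kc + 1 = 4, so kc = 320 - 8*((320 - 1 - 40)/(8*4)) = 320 - 8*8 = 256. The sweep count is unchanged (1000/256 = 1000/320 = 3 in integer arithmetic, matching the assertion), while the last k-block grows from 40 to 1000 - 3*256 = 232, keeping the final sweep efficient.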
189
+
190
+ // ---- 2nd level of blocking on max(L2,L3), yields nc ----
191
+
192
+ // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
193
+ // actual_l2 = max(l2, l3/nb_core_sharing_l3)
194
+ // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it)
195
+ // For instance, it corresponds to 6MB of L3 shared among 4 cores.
196
+ #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
197
+ const Index actual_l2 = l3;
198
+ #else
199
+ const Index actual_l2 = 1572864; // == 1.5 MB
200
+ #endif
201
+
202
+ // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2.
203
+ // The second half is implicitly reserved to access the result and lhs coefficients.
204
+ // When k<max_kc, then nc can arbitrarily growth. In practice, it seems to be fruitful
205
+ // to limit this growth: we bound nc to growth by a factor x1.5.
206
+ // However, if the entire lhs block fit within L1, then we are not going to block on the rows at all,
207
+ // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
208
+ Index max_nc;
209
+ const Index lhs_bytes = m * k * sizeof(LhsScalar);
210
+ const Index remaining_l1 = l1- k_sub - lhs_bytes;
211
+ if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
212
+ {
213
+ // L1 blocking
214
+ max_nc = remaining_l1 / (k*sizeof(RhsScalar));
215
+ }
216
+ else
217
+ {
218
+ // L2 blocking
219
+ max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
220
+ }
221
+ // WARNING Below, we assume that Traits::nr is a power of two.
222
+ Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
223
+ if(n>nc)
224
+ {
225
+ // We are really blocking over the columns:
226
+ // -> reduce blocking size to make sure the last block is as large as possible
227
+ // while keeping the same number of sweeps over the packed lhs.
228
+ // Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
229
+ n = (n%nc)==0 ? nc
230
+ : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
231
+ }
232
+ else if(old_k==k)
233
+ {
234
+ // So far, no blocking at all, i.e., kc==k, and nc==n.
235
+ // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
236
+ // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
237
+ Index problem_size = k*n*sizeof(LhsScalar);
238
+ Index actual_lm = actual_l2;
239
+ Index max_mc = m;
240
+ if(problem_size<=1024)
241
+ {
242
+ // problem is small enough to keep in L1
243
+ // Let's choose m such that lhs's block fit in 1/3 of L1
244
+ actual_lm = l1;
245
+ }
246
+ else if(l3!=0 && problem_size<=32768)
247
+ {
248
+ // we have both L2 and L3, and problem is small enough to be kept in L2
249
+ // Let's choose m such that lhs's block fit in 1/3 of L2
250
+ actual_lm = l2;
251
+ max_mc = (numext::mini<Index>)(576,max_mc);
252
+ }
253
+ Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
254
+ if (mc > Traits::mr) mc -= mc % Traits::mr;
255
+ else if (mc==0) return;
256
+ m = (m%mc)==0 ? mc
257
+ : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
258
+ }
259
+ }
260
+ }
261
+
262
+ template <typename Index>
263
+ inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
264
+ {
265
+ #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
266
+ if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
267
+ k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
268
+ m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
269
+ n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
270
+ return true;
271
+ }
272
+ #else
273
+ EIGEN_UNUSED_VARIABLE(k)
274
+ EIGEN_UNUSED_VARIABLE(m)
275
+ EIGEN_UNUSED_VARIABLE(n)
276
+ #endif
277
+ return false;
278
+ }
279
+
280
+ /** \brief Computes the blocking parameters for a m x k times k x n matrix product
281
+ *
282
+ * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
283
+ * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
284
+ * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
285
+ *
286
+ * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
287
+ * this function computes the blocking size parameters along the respective dimensions
288
+ * for matrix products and related algorithms.
289
+ *
290
+ * The blocking size parameters may be evaluated:
291
+ * - either by a heuristic based on cache sizes;
292
+ * - or using fixed prescribed values (for testing purposes).
293
+ *
294
+ * \sa setCpuCacheSizes */
295
+
296
+ template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
297
+ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
298
+ {
299
+ if (!useSpecificBlockingSizes(k, m, n)) {
300
+ evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
301
+ }
302
+ }
303
+
304
+ template<typename LhsScalar, typename RhsScalar, typename Index>
305
+ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
306
+ {
307
+ computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
308
+ }
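An editorial aside, not part of the vendored file: a minimal sketch of how these entry points can be exercised, using only functions visible in this header (setCpuCacheSizes is the setter referenced by the \sa tags above).

    #include <Eigen/Core>
    #include <cstdio>

    int main() {
      // Optionally pin the cache sizes (in bytes) so the heuristic works
      // from known values instead of runtime CPU queries.
      Eigen::setCpuCacheSizes(32 * 1024, 256 * 1024, 2 * 1024 * 1024);

      // Query the blocking sizes for a 1024 x 1024 x 1024 float product;
      // k, m and n are rewritten in place with kc, mc and nc.
      std::ptrdiff_t k = 1024, m = 1024, n = 1024;
      Eigen::internal::computeProductBlockingSizes<float, float>(k, m, n);
      std::printf("kc=%td mc=%td nc=%td\n", k, m, n);
      return 0;
    }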
+
+ #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+ #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C);
+ #else
+
+ // FIXME (a bit overkill maybe ?)
+
+ template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
+ EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
+ {
+ c = cj.pmadd(a,b,c);
+ }
+ };
+
+ template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
+ EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
+ {
+ t = b; t = cj.pmul(a,t); c = padd(c,t);
+ }
+ };
+
+ template<typename CJ, typename A, typename B, typename C, typename T>
+ EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
+ {
+ gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
+ }
+
+ #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T);
+ // #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T);
+ #endif
+
+ /* Vectorization logic
+ * real*real: unpack rhs to constant packets, ...
+ *
+ * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
+ * storing each res packet into two packets (2x2),
+ * at the end combine them: swap the second and addsub them
+ * cf*cf : same but with 2x4 blocks
+ * cplx*real : unpack rhs to constant packets, ...
+ * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
+ */
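A worked expansion of the cd*cd scheme above (editorial note, not from the file): with a = a_r + i*a_i and b = b_r + i*b_i, the kernel accumulates c.first = (a_r*b_r, a_i*b_r) and c.second = (a_r*b_i, a_i*b_i). Since a*b = (a_r*b_r - a_i*b_i) + i*(a_i*b_r + a_r*b_i), conjugating c.second gives (a_r*b_i, -a_i*b_i), swapping its halves (pcplxflip) gives (-a_i*b_i, a_r*b_i), and adding that to c.first yields exactly (real, imag) of a*b. This is the pcplxflip/pconj/padd sequence used by the acc() overload for DoublePacket further below.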
+ template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
+ class gebp_traits
+ {
+ public:
+ typedef _LhsScalar LhsScalar;
+ typedef _RhsScalar RhsScalar;
+ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+ enum {
+ ConjLhs = _ConjLhs,
+ ConjRhs = _ConjRhs,
+ Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
+ LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+ RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+ ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+
+ NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+
+ // register block size along the N direction must be 1 or 4
+ nr = 4,
+
+ // register block size along the M direction (currently, this one cannot be modified)
+ default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
+ #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
+ // we assume 16 registers
+ // See bug 992, if the scalar type is not vectorizable but EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
+ // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
+ mr = Vectorizable ? 3*LhsPacketSize : default_mr,
+ #else
+ mr = default_mr,
+ #endif
+
+ LhsProgress = LhsPacketSize,
+ RhsProgress = 1
+ };
+
+ typedef typename packet_traits<LhsScalar>::type _LhsPacket;
+ typedef typename packet_traits<RhsScalar>::type _RhsPacket;
+ typedef typename packet_traits<ResScalar>::type _ResPacket;
+
+ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+ typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+
+ typedef ResPacket AccPacket;
+
+ EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
+ {
+ p = pset1<ResPacket>(ResScalar(0));
+ }
+
+ EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+ {
+ pbroadcast4(b, b0, b1, b2, b3);
+ }
+
+ // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
+ // {
+ // pbroadcast2(b, b0, b1);
+ // }
+
+ template<typename RhsPacketType>
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
+ {
+ dest = pset1<RhsPacketType>(*b);
+ }
+
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = ploadquad<RhsPacket>(b);
+ }
+
+ template<typename LhsPacketType>
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
+ {
+ dest = pload<LhsPacketType>(a);
+ }
+
+ template<typename LhsPacketType>
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
+ {
+ dest = ploadu<LhsPacketType>(a);
+ }
+
+ template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
+ {
+ conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
+ // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
+ // let gcc allocate the register in which to store the result of the pmul
+ // (in the case where there is no FMA) gcc fails to figure out how to avoid
+ // spilling registers.
+ #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+ EIGEN_UNUSED_VARIABLE(tmp);
+ c = cj.pmadd(a,b,c);
+ #else
+ tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
+ #endif
+ }
+
+ EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+ {
+ r = pmadd(c,alpha,r);
+ }
+
+ template<typename ResPacketHalf>
+ EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
+ {
+ r = pmadd(c,alpha,r);
+ }
+
+ };
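A compact editorial illustration (not part of the file) of the accumulate/fold contract these traits implement: madd accumulates c += a*b and acc folds the accumulator into the result as r += alpha*c. The sketch below uses the generic packet primitives (pset1, pmadd, pfirst) that the traits dispatch to.

    #include <Eigen/Core>
    using namespace Eigen::internal;

    int main() {
      typedef packet_traits<float>::type Packet;
      Packet a = pset1<Packet>(2.0f), b = pset1<Packet>(3.0f);
      Packet c = pset1<Packet>(0.0f);        // initAcc: zero the accumulator
      c = pmadd(a, b, c);                    // madd: c = a*b + c = 6 in every lane
      Packet r = pset1<Packet>(1.0f);
      r = pmadd(c, pset1<Packet>(0.5f), r);  // acc: r += 0.5*c = 4 in every lane
      return (int)pfirst(r) == 4 ? 0 : 1;
    }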
462
+
463
+ template<typename RealScalar, bool _ConjLhs>
464
+ class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
465
+ {
466
+ public:
467
+ typedef std::complex<RealScalar> LhsScalar;
468
+ typedef RealScalar RhsScalar;
469
+ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
470
+
471
+ enum {
472
+ ConjLhs = _ConjLhs,
473
+ ConjRhs = false,
474
+ Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
475
+ LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
476
+ RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
477
+ ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
478
+
479
+ NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
480
+ nr = 4,
481
+ #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
482
+ // we assume 16 registers
483
+ mr = 3*LhsPacketSize,
484
+ #else
485
+ mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
486
+ #endif
487
+
488
+ LhsProgress = LhsPacketSize,
489
+ RhsProgress = 1
490
+ };
491
+
492
+ typedef typename packet_traits<LhsScalar>::type _LhsPacket;
493
+ typedef typename packet_traits<RhsScalar>::type _RhsPacket;
494
+ typedef typename packet_traits<ResScalar>::type _ResPacket;
495
+
496
+ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
497
+ typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
498
+ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
499
+
500
+ typedef ResPacket AccPacket;
501
+
502
+ EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
503
+ {
504
+ p = pset1<ResPacket>(ResScalar(0));
505
+ }
506
+
507
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
508
+ {
509
+ dest = pset1<RhsPacket>(*b);
510
+ }
511
+
512
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
513
+ {
514
+ dest = pset1<RhsPacket>(*b);
515
+ }
516
+
517
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
518
+ {
519
+ dest = pload<LhsPacket>(a);
520
+ }
521
+
522
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
523
+ {
524
+ dest = ploadu<LhsPacket>(a);
525
+ }
526
+
527
+ EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
528
+ {
529
+ pbroadcast4(b, b0, b1, b2, b3);
530
+ }
531
+
532
+ // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
533
+ // {
534
+ // pbroadcast2(b, b0, b1);
535
+ // }
536
+
537
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
538
+ {
539
+ madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
540
+ }
541
+
542
+ EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
543
+ {
544
+ #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
545
+ EIGEN_UNUSED_VARIABLE(tmp);
546
+ c.v = pmadd(a.v,b,c.v);
547
+ #else
548
+ tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
549
+ #endif
550
+ }
551
+
552
+ EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
553
+ {
554
+ c += a * b;
555
+ }
556
+
557
+ EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
558
+ {
559
+ r = cj.pmadd(c,alpha,r);
560
+ }
561
+
562
+ protected:
563
+ conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
564
+ };
565
+
566
+ template<typename Packet>
567
+ struct DoublePacket
568
+ {
569
+ Packet first;
570
+ Packet second;
571
+ };
572
+
573
+ template<typename Packet>
574
+ DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
575
+ {
576
+ DoublePacket<Packet> res;
577
+ res.first = padd(a.first, b.first);
578
+ res.second = padd(a.second,b.second);
579
+ return res;
580
+ }
581
+
582
+ template<typename Packet>
583
+ const DoublePacket<Packet>& predux_downto4(const DoublePacket<Packet> &a)
584
+ {
585
+ return a;
586
+ }
587
+
588
+ template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; };
589
+ // template<typename Packet>
590
+ // DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
591
+ // {
592
+ // DoublePacket<Packet> res;
593
+ // res.first = padd(a.first, b.first);
594
+ // res.second = padd(a.second,b.second);
595
+ // return res;
596
+ // }
597
+
598
+ template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
599
+ class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
600
+ {
601
+ public:
602
+ typedef std::complex<RealScalar> Scalar;
603
+ typedef std::complex<RealScalar> LhsScalar;
604
+ typedef std::complex<RealScalar> RhsScalar;
605
+ typedef std::complex<RealScalar> ResScalar;
606
+
607
+ enum {
608
+ ConjLhs = _ConjLhs,
609
+ ConjRhs = _ConjRhs,
610
+ Vectorizable = packet_traits<RealScalar>::Vectorizable
611
+ && packet_traits<Scalar>::Vectorizable,
612
+ RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1,
613
+ ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
614
+ LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
615
+ RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
616
+
617
+ // FIXME: should depend on NumberOfRegisters
618
+ nr = 4,
619
+ mr = ResPacketSize,
620
+
621
+ LhsProgress = ResPacketSize,
622
+ RhsProgress = 1
623
+ };
624
+
625
+ typedef typename packet_traits<RealScalar>::type RealPacket;
626
+ typedef typename packet_traits<Scalar>::type ScalarPacket;
627
+ typedef DoublePacket<RealPacket> DoublePacketType;
628
+
629
+ typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket;
630
+ typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
631
+ typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
632
+ typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
633
+
634
+ EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
635
+
636
+ EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
637
+ {
638
+ p.first = pset1<RealPacket>(RealScalar(0));
639
+ p.second = pset1<RealPacket>(RealScalar(0));
640
+ }
641
+
642
+ // Scalar path
643
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
644
+ {
645
+ dest = pset1<ResPacket>(*b);
646
+ }
647
+
648
+ // Vectorized path
649
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
650
+ {
651
+ dest.first = pset1<RealPacket>(real(*b));
652
+ dest.second = pset1<RealPacket>(imag(*b));
653
+ }
654
+
655
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
656
+ {
657
+ loadRhs(b,dest);
658
+ }
659
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
660
+ {
661
+ eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4);
662
+ loadRhs(b,dest);
663
+ }
664
+
665
+ EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
666
+ {
667
+ // FIXME not sure that's the best way to implement it!
668
+ loadRhs(b+0, b0);
669
+ loadRhs(b+1, b1);
670
+ loadRhs(b+2, b2);
671
+ loadRhs(b+3, b3);
672
+ }
673
+
674
+ // Vectorized path
675
+ EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
676
+ {
677
+ // FIXME not sure that's the best way to implement it!
678
+ loadRhs(b+0, b0);
679
+ loadRhs(b+1, b1);
680
+ }
681
+
682
+ // Scalar path
683
+ EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
684
+ {
685
+ // FIXME not sure that's the best way to implement it!
686
+ loadRhs(b+0, b0);
687
+ loadRhs(b+1, b1);
688
+ }
689
+
690
+ // nothing special here
691
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
692
+ {
693
+ dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
694
+ }
695
+
696
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
697
+ {
698
+ dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
699
+ }
700
+
701
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const
702
+ {
703
+ c.first = padd(pmul(a,b.first), c.first);
704
+ c.second = padd(pmul(a,b.second),c.second);
705
+ }
706
+
707
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
708
+ {
709
+ c = cj.pmadd(a,b,c);
710
+ }
711
+
712
+ EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
713
+
714
+ EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
715
+ {
716
+ // assemble c
717
+ ResPacket tmp;
718
+ if((!ConjLhs)&&(!ConjRhs))
719
+ {
720
+ tmp = pcplxflip(pconj(ResPacket(c.second)));
721
+ tmp = padd(ResPacket(c.first),tmp);
722
+ }
723
+ else if((!ConjLhs)&&(ConjRhs))
724
+ {
725
+ tmp = pconj(pcplxflip(ResPacket(c.second)));
726
+ tmp = padd(ResPacket(c.first),tmp);
727
+ }
728
+ else if((ConjLhs)&&(!ConjRhs))
729
+ {
730
+ tmp = pcplxflip(ResPacket(c.second));
731
+ tmp = padd(pconj(ResPacket(c.first)),tmp);
732
+ }
733
+ else if((ConjLhs)&&(ConjRhs))
734
+ {
735
+ tmp = pcplxflip(ResPacket(c.second));
736
+ tmp = psub(pconj(ResPacket(c.first)),tmp);
737
+ }
738
+
739
+ r = pmadd(tmp,alpha,r);
740
+ }
741
+
742
+ protected:
743
+ conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
744
+ };
745
+
746
+ template<typename RealScalar, bool _ConjRhs>
747
+ class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
748
+ {
749
+ public:
750
+ typedef std::complex<RealScalar> Scalar;
751
+ typedef RealScalar LhsScalar;
752
+ typedef Scalar RhsScalar;
753
+ typedef Scalar ResScalar;
754
+
755
+ enum {
756
+ ConjLhs = false,
757
+ ConjRhs = _ConjRhs,
758
+ Vectorizable = packet_traits<RealScalar>::Vectorizable
759
+ && packet_traits<Scalar>::Vectorizable,
760
+ LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
761
+ RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
762
+ ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
763
+
764
+ NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
765
+ // FIXME: should depend on NumberOfRegisters
766
+ nr = 4,
767
+ mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,
768
+
769
+ LhsProgress = ResPacketSize,
770
+ RhsProgress = 1
771
+ };
772
+
773
+ typedef typename packet_traits<LhsScalar>::type _LhsPacket;
774
+ typedef typename packet_traits<RhsScalar>::type _RhsPacket;
775
+ typedef typename packet_traits<ResScalar>::type _ResPacket;
776
+
777
+ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
778
+ typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
779
+ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
780
+
781
+ typedef ResPacket AccPacket;
782
+
783
+ EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
784
+ {
785
+ p = pset1<ResPacket>(ResScalar(0));
786
+ }
787
+
788
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
789
+ {
790
+ dest = pset1<RhsPacket>(*b);
791
+ }
792
+
793
+ void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
794
+ {
795
+ pbroadcast4(b, b0, b1, b2, b3);
796
+ }
797
+
798
+ // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
799
+ // {
800
+ // // FIXME not sure that's the best way to implement it!
801
+ // b0 = pload1<RhsPacket>(b+0);
802
+ // b1 = pload1<RhsPacket>(b+1);
803
+ // }
804
+
805
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
806
+ {
807
+ dest = ploaddup<LhsPacket>(a);
808
+ }
809
+
810
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
811
+ {
812
+ eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4);
813
+ loadRhs(b,dest);
814
+ }
815
+
816
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
817
+ {
818
+ dest = ploaddup<LhsPacket>(a);
819
+ }
820
+
821
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
822
+ {
823
+ madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
824
+ }
825
+
826
+ EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
827
+ {
828
+ #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
829
+ EIGEN_UNUSED_VARIABLE(tmp);
830
+ c.v = pmadd(a,b.v,c.v);
831
+ #else
832
+ tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
833
+ #endif
834
+
835
+ }
836
+
837
+ EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
838
+ {
839
+ c += a * b;
840
+ }
841
+
842
+ EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
843
+ {
844
+ r = cj.pmadd(alpha,c,r);
845
+ }
846
+
847
+ protected:
848
+ conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
849
+ };
850
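The ploaddup loads in this real-lhs * complex-rhs specialization line the duplicated real values up with the interleaved re/im layout of the complex packet, so madd() can use a plain element-wise multiply. A minimal standalone emulation of that layout (assumed SSE-like 4-float packets; not Eigen code):

#include <cassert>
#include <complex>

int main() {
  const float a[2] = {2.f, 3.f};                               // real lhs a0, a1
  const std::complex<float> b[2] = {{5.f, 7.f}, {11.f, 13.f}};
  // ploaddup<LhsPacket>(a) duplicates each scalar: [a0, a0, a1, a1] ...
  const float lhs[4] = {a[0], a[0], a[1], a[1]};
  // ... matching the interleaved complex packet [re0, im0, re1, im1]:
  const float rhs[4] = {b[0].real(), b[0].imag(), b[1].real(), b[1].imag()};
  float prod[4];
  for (int i = 0; i < 4; ++i) prod[i] = lhs[i] * rhs[i];       // element-wise pmul
  assert(std::complex<float>(prod[0], prod[1]) == a[0] * b[0]);
  assert(std::complex<float>(prod[2], prod[3]) == a[1] * b[1]);
  return 0;
}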
+
851
+ /* optimized GEneral packed Block * packed Panel product kernel
852
+ *
853
+ * Mixing type logic: C += A * B
854
+ * | A | B | comments
855
+ * |real |cplx | no vectorization yet; it would require packing A with duplication
856
+ * |cplx |real | easy vectorization
857
+ */
858
+ template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
859
+ struct gebp_kernel
860
+ {
861
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
862
+ typedef typename Traits::ResScalar ResScalar;
863
+ typedef typename Traits::LhsPacket LhsPacket;
864
+ typedef typename Traits::RhsPacket RhsPacket;
865
+ typedef typename Traits::ResPacket ResPacket;
866
+ typedef typename Traits::AccPacket AccPacket;
867
+
868
+ typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
869
+ typedef typename SwappedTraits::ResScalar SResScalar;
870
+ typedef typename SwappedTraits::LhsPacket SLhsPacket;
871
+ typedef typename SwappedTraits::RhsPacket SRhsPacket;
872
+ typedef typename SwappedTraits::ResPacket SResPacket;
873
+ typedef typename SwappedTraits::AccPacket SAccPacket;
874
+
875
+ typedef typename DataMapper::LinearMapper LinearMapper;
876
+
877
+ enum {
878
+ Vectorizable = Traits::Vectorizable,
879
+ LhsProgress = Traits::LhsProgress,
880
+ RhsProgress = Traits::RhsProgress,
881
+ ResPacketSize = Traits::ResPacketSize
882
+ };
883
+
884
+ EIGEN_DONT_INLINE
885
+ void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
886
+ Index rows, Index depth, Index cols, ResScalar alpha,
887
+ Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
888
+ };
889
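Before diving into the register-blocked implementation below, it may help to state what the kernel computes: res(i,j) += alpha * sum_k A(i,k) * B(k,j) over the packed blocks. The following naive loop nest is a sketch of that contract only (a hypothetical test oracle on plain column-major buffers; the real packed layouts produced by gemm_pack_lhs/gemm_pack_rhs are interleaved differently):

// Reference semantics of the GEBP kernel, useful as a mental model or test oracle.
template <typename Scalar>
void gebp_reference(Scalar* res, const Scalar* A, const Scalar* B,
                    int rows, int depth, int cols, Scalar alpha) {
  for (int j = 0; j < cols; ++j)
    for (int i = 0; i < rows; ++i) {
      Scalar acc = Scalar(0);
      for (int k = 0; k < depth; ++k)
        acc += A[i + k * rows] * B[k + j * depth];  // assumed plain column-major
      res[i + j * rows] += alpha * acc;
    }
}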
+
890
+ template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
891
+ EIGEN_DONT_INLINE
892
+ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
893
+ ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
894
+ Index rows, Index depth, Index cols, ResScalar alpha,
895
+ Index strideA, Index strideB, Index offsetA, Index offsetB)
896
+ {
897
+ Traits traits;
898
+ SwappedTraits straits;
899
+
900
+ if(strideA==-1) strideA = depth;
901
+ if(strideB==-1) strideB = depth;
902
+ conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
903
+ Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
904
+ const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
905
+ const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
906
+ const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
907
+ enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
908
+ const Index peeled_kc = depth & ~(pk-1);
909
+ const Index prefetch_res_offset = 32/sizeof(ResScalar);
910
+ // const Index depth2 = depth & ~1;
911
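For concreteness, peeled_kc rounds depth down to a multiple of the peeling factor pk via the bitmask (pk is a power of two); a small compile-time check of that identity, not part of the source:

// With pk == 8: 30 & ~7 == 24, so k runs 0, 8, 16 in the peeled loop
// and the tail loop handles k = 24..29.
static_assert((30 & ~(8 - 1)) == 24, "depth rounded down to a multiple of pk");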
+
912
+ //---------- Process 3 * LhsProgress rows at once ----------
913
+ // This corresponds to 3*LhsProgress x nr register blocks.
914
+ // Usually makes sense only with FMA
915
+ if(mr>=3*Traits::LhsProgress)
916
+ {
917
+ // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
918
+ // and on each largest micro vertical panel of the rhs (depth * nr).
919
+ // Blocking sizes, i.e., 'depth' has been computed so that the micro horizontal panel of the lhs fits in L1.
920
+ // However, if depth is too small, we can extend the number of rows of these horizontal panels.
921
+ // This actual number of rows is computed as follows:
922
+ const Index l1 = defaultL1CacheSize; // in bytes; TODO: l1 should be passed to this function.
923
+ // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
924
+ // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
925
+ // or because we are testing specific blocking sizes.
926
+ const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
927
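A worked instance of this formula, under assumed numbers (AVX floats: sizeof(Scalar)==4, LhsProgress==8, mr==24, nr==4, depth==200, l1==32768):

//   l1 - 4*24*4 - 200*4*4             = 32768 - 384 - 3200 = 29184 bytes left,
//   depth * sizeof(LhsScalar) * 3*8   = 200 * 4 * 24       = 19200 bytes per 24-row slice,
//   max(1, 29184 / 19200)             = 1  ->  actual_panel_rows = 24.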
+ for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
928
+ {
929
+ const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
930
+ for(Index j2=0; j2<packet_cols4; j2+=nr)
931
+ {
932
+ for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
933
+ {
934
+
935
+ // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
936
+ // stored into 3 x nr registers.
937
+
938
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
939
+ prefetch(&blA[0]);
940
+
941
+ // gets res block as register
942
+ AccPacket C0, C1, C2, C3,
943
+ C4, C5, C6, C7,
944
+ C8, C9, C10, C11;
945
+ traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
946
+ traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
947
+ traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
948
+
949
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
950
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
951
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
952
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
953
+
954
+ r0.prefetch(0);
955
+ r1.prefetch(0);
956
+ r2.prefetch(0);
957
+ r3.prefetch(0);
958
+
959
+ // performs "inner" products
960
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
961
+ prefetch(&blB[0]);
962
+ LhsPacket A0, A1;
963
+
964
+ for(Index k=0; k<peeled_kc; k+=pk)
965
+ {
966
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
967
+ RhsPacket B_0, T0;
968
+ LhsPacket A2;
969
+
970
+ #define EIGEN_GEBP_ONESTEP(K) \
971
+ do { \
972
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
973
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
974
+ internal::prefetch(blA+(3*K+16)*LhsProgress); \
975
+ if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
976
+ traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
977
+ traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
978
+ traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
979
+ traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
980
+ traits.madd(A0, B_0, C0, T0); \
981
+ traits.madd(A1, B_0, C4, T0); \
982
+ traits.madd(A2, B_0, C8, B_0); \
983
+ traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
984
+ traits.madd(A0, B_0, C1, T0); \
985
+ traits.madd(A1, B_0, C5, T0); \
986
+ traits.madd(A2, B_0, C9, B_0); \
987
+ traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
988
+ traits.madd(A0, B_0, C2, T0); \
989
+ traits.madd(A1, B_0, C6, T0); \
990
+ traits.madd(A2, B_0, C10, B_0); \
991
+ traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
992
+ traits.madd(A0, B_0, C3 , T0); \
993
+ traits.madd(A1, B_0, C7, T0); \
994
+ traits.madd(A2, B_0, C11, B_0); \
995
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
996
+ } while(false)
997
+
998
+ internal::prefetch(blB);
999
+ EIGEN_GEBP_ONESTEP(0);
1000
+ EIGEN_GEBP_ONESTEP(1);
1001
+ EIGEN_GEBP_ONESTEP(2);
1002
+ EIGEN_GEBP_ONESTEP(3);
1003
+ EIGEN_GEBP_ONESTEP(4);
1004
+ EIGEN_GEBP_ONESTEP(5);
1005
+ EIGEN_GEBP_ONESTEP(6);
1006
+ EIGEN_GEBP_ONESTEP(7);
1007
+
1008
+ blB += pk*4*RhsProgress;
1009
+ blA += pk*3*Traits::LhsProgress;
1010
+
1011
+ EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
1012
+ }
1013
+ // process remaining peeled loop
1014
+ for(Index k=peeled_kc; k<depth; k++)
1015
+ {
1016
+ RhsPacket B_0, T0;
1017
+ LhsPacket A2;
1018
+ EIGEN_GEBP_ONESTEP(0);
1019
+ blB += 4*RhsProgress;
1020
+ blA += 3*Traits::LhsProgress;
1021
+ }
1022
+
1023
+ #undef EIGEN_GEBP_ONESTEP
1024
+
1025
+ ResPacket R0, R1, R2;
1026
+ ResPacket alphav = pset1<ResPacket>(alpha);
1027
+
1028
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1029
+ R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1030
+ R2 = r0.loadPacket(2 * Traits::ResPacketSize);
1031
+ traits.acc(C0, alphav, R0);
1032
+ traits.acc(C4, alphav, R1);
1033
+ traits.acc(C8, alphav, R2);
1034
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
1035
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
1036
+ r0.storePacket(2 * Traits::ResPacketSize, R2);
1037
+
1038
+ R0 = r1.loadPacket(0 * Traits::ResPacketSize);
1039
+ R1 = r1.loadPacket(1 * Traits::ResPacketSize);
1040
+ R2 = r1.loadPacket(2 * Traits::ResPacketSize);
1041
+ traits.acc(C1, alphav, R0);
1042
+ traits.acc(C5, alphav, R1);
1043
+ traits.acc(C9, alphav, R2);
1044
+ r1.storePacket(0 * Traits::ResPacketSize, R0);
1045
+ r1.storePacket(1 * Traits::ResPacketSize, R1);
1046
+ r1.storePacket(2 * Traits::ResPacketSize, R2);
1047
+
1048
+ R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1049
+ R1 = r2.loadPacket(1 * Traits::ResPacketSize);
1050
+ R2 = r2.loadPacket(2 * Traits::ResPacketSize);
1051
+ traits.acc(C2, alphav, R0);
1052
+ traits.acc(C6, alphav, R1);
1053
+ traits.acc(C10, alphav, R2);
1054
+ r2.storePacket(0 * Traits::ResPacketSize, R0);
1055
+ r2.storePacket(1 * Traits::ResPacketSize, R1);
1056
+ r2.storePacket(2 * Traits::ResPacketSize, R2);
1057
+
1058
+ R0 = r3.loadPacket(0 * Traits::ResPacketSize);
1059
+ R1 = r3.loadPacket(1 * Traits::ResPacketSize);
1060
+ R2 = r3.loadPacket(2 * Traits::ResPacketSize);
1061
+ traits.acc(C3, alphav, R0);
1062
+ traits.acc(C7, alphav, R1);
1063
+ traits.acc(C11, alphav, R2);
1064
+ r3.storePacket(0 * Traits::ResPacketSize, R0);
1065
+ r3.storePacket(1 * Traits::ResPacketSize, R1);
1066
+ r3.storePacket(2 * Traits::ResPacketSize, R2);
1067
+ }
1068
+ }
1069
+
1070
+ // Deal with remaining columns of the rhs
1071
+ for(Index j2=packet_cols4; j2<cols; j2++)
1072
+ {
1073
+ for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1074
+ {
1075
+ // One column at a time
1076
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
1077
+ prefetch(&blA[0]);
1078
+
1079
+ // gets res block as register
1080
+ AccPacket C0, C4, C8;
1081
+ traits.initAcc(C0);
1082
+ traits.initAcc(C4);
1083
+ traits.initAcc(C8);
1084
+
1085
+ LinearMapper r0 = res.getLinearMapper(i, j2);
1086
+ r0.prefetch(0);
1087
+
1088
+ // performs "inner" products
1089
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1090
+ LhsPacket A0, A1, A2;
1091
+
1092
+ for(Index k=0; k<peeled_kc; k+=pk)
1093
+ {
1094
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
1095
+ RhsPacket B_0;
1096
+ #define EIGEN_GEBGP_ONESTEP(K) \
1097
+ do { \
1098
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1099
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1100
+ traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
1101
+ traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
1102
+ traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
1103
+ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1104
+ traits.madd(A0, B_0, C0, B_0); \
1105
+ traits.madd(A1, B_0, C4, B_0); \
1106
+ traits.madd(A2, B_0, C8, B_0); \
1107
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1108
+ } while(false)
1109
+
1110
+ EIGEN_GEBGP_ONESTEP(0);
1111
+ EIGEN_GEBGP_ONESTEP(1);
1112
+ EIGEN_GEBGP_ONESTEP(2);
1113
+ EIGEN_GEBGP_ONESTEP(3);
1114
+ EIGEN_GEBGP_ONESTEP(4);
1115
+ EIGEN_GEBGP_ONESTEP(5);
1116
+ EIGEN_GEBGP_ONESTEP(6);
1117
+ EIGEN_GEBGP_ONESTEP(7);
1118
+
1119
+ blB += pk*RhsProgress;
1120
+ blA += pk*3*Traits::LhsProgress;
1121
+
1122
+ EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
1123
+ }
1124
+
1125
+ // process remaining peeled loop
1126
+ for(Index k=peeled_kc; k<depth; k++)
1127
+ {
1128
+ RhsPacket B_0;
1129
+ EIGEN_GEBGP_ONESTEP(0);
1130
+ blB += RhsProgress;
1131
+ blA += 3*Traits::LhsProgress;
1132
+ }
1133
+ #undef EIGEN_GEBGP_ONESTEP
1134
+ ResPacket R0, R1, R2;
1135
+ ResPacket alphav = pset1<ResPacket>(alpha);
1136
+
1137
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1138
+ R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1139
+ R2 = r0.loadPacket(2 * Traits::ResPacketSize);
1140
+ traits.acc(C0, alphav, R0);
1141
+ traits.acc(C4, alphav, R1);
1142
+ traits.acc(C8, alphav, R2);
1143
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
1144
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
1145
+ r0.storePacket(2 * Traits::ResPacketSize, R2);
1146
+ }
1147
+ }
1148
+ }
1149
+ }
1150
+
1151
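Note: the two stages that follow repeat the same scheme with progressively narrower register blocks (2*LhsProgress, then 1*LhsProgress, then plain scalar rows); the peeled_mc3/peeled_mc2/peeled_mc1 boundaries computed above ensure each stage only handles the rows the wider stages left over.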
+ //---------- Process 2 * LhsProgress rows at once ----------
1152
+ if(mr>=2*Traits::LhsProgress)
1153
+ {
1154
+ const Index l1 = defaultL1CacheSize; // in bytes; TODO: l1 should be passed to this function.
1155
+ // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
1156
+ // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
1157
+ // or because we are testing specific blocking sizes.
1158
+ Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
1159
+
1160
+ for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
1161
+ {
1162
+ Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
1163
+ for(Index j2=0; j2<packet_cols4; j2+=nr)
1164
+ {
1165
+ for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1166
+ {
1167
+
1168
+ // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
1169
+ // stored into 2 x nr registers.
1170
+
1171
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1172
+ prefetch(&blA[0]);
1173
+
1174
+ // gets res block as register
1175
+ AccPacket C0, C1, C2, C3,
1176
+ C4, C5, C6, C7;
1177
+ traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
1178
+ traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
1179
+
1180
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1181
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1182
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1183
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1184
+
1185
+ r0.prefetch(prefetch_res_offset);
1186
+ r1.prefetch(prefetch_res_offset);
1187
+ r2.prefetch(prefetch_res_offset);
1188
+ r3.prefetch(prefetch_res_offset);
1189
+
1190
+ // performs "inner" products
1191
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1192
+ prefetch(&blB[0]);
1193
+ LhsPacket A0, A1;
1194
+
1195
+ for(Index k=0; k<peeled_kc; k+=pk)
1196
+ {
1197
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
1198
+ RhsPacket B_0, B1, B2, B3, T0;
1199
+
1200
+ // NOTE: the begin/end asm comments below work around bug 935!
1201
+ // but they are not enough for gcc>=6 without FMA (bug 1637)
1202
+ #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
1203
+ #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
1204
+ #else
1205
+ #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
1206
+ #endif
1207
+ #define EIGEN_GEBGP_ONESTEP(K) \
1208
+ do { \
1209
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
1210
+ traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1211
+ traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1212
+ traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1213
+ traits.madd(A0, B_0, C0, T0); \
1214
+ traits.madd(A1, B_0, C4, B_0); \
1215
+ traits.madd(A0, B1, C1, T0); \
1216
+ traits.madd(A1, B1, C5, B1); \
1217
+ traits.madd(A0, B2, C2, T0); \
1218
+ traits.madd(A1, B2, C6, B2); \
1219
+ traits.madd(A0, B3, C3, T0); \
1220
+ traits.madd(A1, B3, C7, B3); \
1221
+ EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
1222
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
1223
+ } while(false)
1224
+
1225
+ internal::prefetch(blB+(48+0));
1226
+ EIGEN_GEBGP_ONESTEP(0);
1227
+ EIGEN_GEBGP_ONESTEP(1);
1228
+ EIGEN_GEBGP_ONESTEP(2);
1229
+ EIGEN_GEBGP_ONESTEP(3);
1230
+ internal::prefetch(blB+(48+16));
1231
+ EIGEN_GEBGP_ONESTEP(4);
1232
+ EIGEN_GEBGP_ONESTEP(5);
1233
+ EIGEN_GEBGP_ONESTEP(6);
1234
+ EIGEN_GEBGP_ONESTEP(7);
1235
+
1236
+ blB += pk*4*RhsProgress;
1237
+ blA += pk*(2*Traits::LhsProgress);
1238
+
1239
+ EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
1240
+ }
1241
+ // process remaining peeled loop
1242
+ for(Index k=peeled_kc; k<depth; k++)
1243
+ {
1244
+ RhsPacket B_0, B1, B2, B3, T0;
1245
+ EIGEN_GEBGP_ONESTEP(0);
1246
+ blB += 4*RhsProgress;
1247
+ blA += 2*Traits::LhsProgress;
1248
+ }
1249
+ #undef EIGEN_GEBGP_ONESTEP
1250
+
1251
+ ResPacket R0, R1, R2, R3;
1252
+ ResPacket alphav = pset1<ResPacket>(alpha);
1253
+
1254
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1255
+ R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1256
+ R2 = r1.loadPacket(0 * Traits::ResPacketSize);
1257
+ R3 = r1.loadPacket(1 * Traits::ResPacketSize);
1258
+ traits.acc(C0, alphav, R0);
1259
+ traits.acc(C4, alphav, R1);
1260
+ traits.acc(C1, alphav, R2);
1261
+ traits.acc(C5, alphav, R3);
1262
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
1263
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
1264
+ r1.storePacket(0 * Traits::ResPacketSize, R2);
1265
+ r1.storePacket(1 * Traits::ResPacketSize, R3);
1266
+
1267
+ R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1268
+ R1 = r2.loadPacket(1 * Traits::ResPacketSize);
1269
+ R2 = r3.loadPacket(0 * Traits::ResPacketSize);
1270
+ R3 = r3.loadPacket(1 * Traits::ResPacketSize);
1271
+ traits.acc(C2, alphav, R0);
1272
+ traits.acc(C6, alphav, R1);
1273
+ traits.acc(C3, alphav, R2);
1274
+ traits.acc(C7, alphav, R3);
1275
+ r2.storePacket(0 * Traits::ResPacketSize, R0);
1276
+ r2.storePacket(1 * Traits::ResPacketSize, R1);
1277
+ r3.storePacket(0 * Traits::ResPacketSize, R2);
1278
+ r3.storePacket(1 * Traits::ResPacketSize, R3);
1279
+ }
1280
+ }
1281
+
1282
+ // Deal with remaining columns of the rhs
1283
+ for(Index j2=packet_cols4; j2<cols; j2++)
1284
+ {
1285
+ for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1286
+ {
1287
+ // One column at a time
1288
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1289
+ prefetch(&blA[0]);
1290
+
1291
+ // gets res block as register
1292
+ AccPacket C0, C4;
1293
+ traits.initAcc(C0);
1294
+ traits.initAcc(C4);
1295
+
1296
+ LinearMapper r0 = res.getLinearMapper(i, j2);
1297
+ r0.prefetch(prefetch_res_offset);
1298
+
1299
+ // performs "inner" products
1300
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1301
+ LhsPacket A0, A1;
1302
+
1303
+ for(Index k=0; k<peeled_kc; k+=pk)
1304
+ {
1305
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
1306
+ RhsPacket B_0, B1;
1307
+
1308
+ #define EIGEN_GEBGP_ONESTEP(K) \
1309
+ do { \
1310
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
1311
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1312
+ traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1313
+ traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1314
+ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1315
+ traits.madd(A0, B_0, C0, B1); \
1316
+ traits.madd(A1, B_0, C4, B_0); \
1317
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
1318
+ } while(false)
1319
+
1320
+ EIGEN_GEBGP_ONESTEP(0);
1321
+ EIGEN_GEBGP_ONESTEP(1);
1322
+ EIGEN_GEBGP_ONESTEP(2);
1323
+ EIGEN_GEBGP_ONESTEP(3);
1324
+ EIGEN_GEBGP_ONESTEP(4);
1325
+ EIGEN_GEBGP_ONESTEP(5);
1326
+ EIGEN_GEBGP_ONESTEP(6);
1327
+ EIGEN_GEBGP_ONESTEP(7);
1328
+
1329
+ blB += pk*RhsProgress;
1330
+ blA += pk*2*Traits::LhsProgress;
1331
+
1332
+ EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
1333
+ }
1334
+
1335
+ // process remaining peeled loop
1336
+ for(Index k=peeled_kc; k<depth; k++)
1337
+ {
1338
+ RhsPacket B_0, B1;
1339
+ EIGEN_GEBGP_ONESTEP(0);
1340
+ blB += RhsProgress;
1341
+ blA += 2*Traits::LhsProgress;
1342
+ }
1343
+ #undef EIGEN_GEBGP_ONESTEP
1344
+ ResPacket R0, R1;
1345
+ ResPacket alphav = pset1<ResPacket>(alpha);
1346
+
1347
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1348
+ R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1349
+ traits.acc(C0, alphav, R0);
1350
+ traits.acc(C4, alphav, R1);
1351
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
1352
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
1353
+ }
1354
+ }
1355
+ }
1356
+ }
1357
+ //---------- Process 1 * LhsProgress rows at once ----------
1358
+ if(mr>=1*Traits::LhsProgress)
1359
+ {
1360
+ // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
1361
+ for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
1362
+ {
1363
+ // loops on each largest micro vertical panel of rhs (depth * nr)
1364
+ for(Index j2=0; j2<packet_cols4; j2+=nr)
1365
+ {
1366
+ // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely
1367
+ // stored into 1 x nr registers.
1368
+
1369
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
1370
+ prefetch(&blA[0]);
1371
+
1372
+ // gets res block as register
1373
+ AccPacket C0, C1, C2, C3;
1374
+ traits.initAcc(C0);
1375
+ traits.initAcc(C1);
1376
+ traits.initAcc(C2);
1377
+ traits.initAcc(C3);
1378
+
1379
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1380
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1381
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1382
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1383
+
1384
+ r0.prefetch(prefetch_res_offset);
1385
+ r1.prefetch(prefetch_res_offset);
1386
+ r2.prefetch(prefetch_res_offset);
1387
+ r3.prefetch(prefetch_res_offset);
1388
+
1389
+ // performs "inner" products
1390
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1391
+ prefetch(&blB[0]);
1392
+ LhsPacket A0;
1393
+
1394
+ for(Index k=0; k<peeled_kc; k+=pk)
1395
+ {
1396
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
1397
+ RhsPacket B_0, B1, B2, B3;
1398
+
1399
+ #define EIGEN_GEBGP_ONESTEP(K) \
1400
+ do { \
1401
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
1402
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1403
+ traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1404
+ traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1405
+ traits.madd(A0, B_0, C0, B_0); \
1406
+ traits.madd(A0, B1, C1, B1); \
1407
+ traits.madd(A0, B2, C2, B2); \
1408
+ traits.madd(A0, B3, C3, B3); \
1409
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
1410
+ } while(false)
1411
+
1412
+ internal::prefetch(blB+(48+0));
1413
+ EIGEN_GEBGP_ONESTEP(0);
1414
+ EIGEN_GEBGP_ONESTEP(1);
1415
+ EIGEN_GEBGP_ONESTEP(2);
1416
+ EIGEN_GEBGP_ONESTEP(3);
1417
+ internal::prefetch(blB+(48+16));
1418
+ EIGEN_GEBGP_ONESTEP(4);
1419
+ EIGEN_GEBGP_ONESTEP(5);
1420
+ EIGEN_GEBGP_ONESTEP(6);
1421
+ EIGEN_GEBGP_ONESTEP(7);
1422
+
1423
+ blB += pk*4*RhsProgress;
1424
+ blA += pk*1*LhsProgress;
1425
+
1426
+ EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
1427
+ }
1428
+ // process remaining peeled loop
1429
+ for(Index k=peeled_kc; k<depth; k++)
1430
+ {
1431
+ RhsPacket B_0, B1, B2, B3;
1432
+ EIGEN_GEBGP_ONESTEP(0);
1433
+ blB += 4*RhsProgress;
1434
+ blA += 1*LhsProgress;
1435
+ }
1436
+ #undef EIGEN_GEBGP_ONESTEP
1437
+
1438
+ ResPacket R0, R1;
1439
+ ResPacket alphav = pset1<ResPacket>(alpha);
1440
+
1441
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1442
+ R1 = r1.loadPacket(0 * Traits::ResPacketSize);
1443
+ traits.acc(C0, alphav, R0);
1444
+ traits.acc(C1, alphav, R1);
1445
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
1446
+ r1.storePacket(0 * Traits::ResPacketSize, R1);
1447
+
1448
+ R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1449
+ R1 = r3.loadPacket(0 * Traits::ResPacketSize);
1450
+ traits.acc(C2, alphav, R0);
1451
+ traits.acc(C3, alphav, R1);
1452
+ r2.storePacket(0 * Traits::ResPacketSize, R0);
1453
+ r3.storePacket(0 * Traits::ResPacketSize, R1);
1454
+ }
1455
+
1456
+ // Deal with remaining columns of the rhs
1457
+ for(Index j2=packet_cols4; j2<cols; j2++)
1458
+ {
1459
+ // One column at a time
1460
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
1461
+ prefetch(&blA[0]);
1462
+
1463
+ // gets res block as register
1464
+ AccPacket C0;
1465
+ traits.initAcc(C0);
1466
+
1467
+ LinearMapper r0 = res.getLinearMapper(i, j2);
1468
+
1469
+ // performs "inner" products
1470
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1471
+ LhsPacket A0;
1472
+
1473
+ for(Index k=0; k<peeled_kc; k+=pk)
1474
+ {
1475
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
1476
+ RhsPacket B_0;
1477
+
1478
+ #define EIGEN_GEBGP_ONESTEP(K) \
1479
+ do { \
1480
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
1481
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1482
+ traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1483
+ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1484
+ traits.madd(A0, B_0, C0, B_0); \
1485
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
1486
+ } while(false)
1487
+
1488
+ EIGEN_GEBGP_ONESTEP(0);
1489
+ EIGEN_GEBGP_ONESTEP(1);
1490
+ EIGEN_GEBGP_ONESTEP(2);
1491
+ EIGEN_GEBGP_ONESTEP(3);
1492
+ EIGEN_GEBGP_ONESTEP(4);
1493
+ EIGEN_GEBGP_ONESTEP(5);
1494
+ EIGEN_GEBGP_ONESTEP(6);
1495
+ EIGEN_GEBGP_ONESTEP(7);
1496
+
1497
+ blB += pk*RhsProgress;
1498
+ blA += pk*1*Traits::LhsProgress;
1499
+
1500
+ EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
1501
+ }
1502
+
1503
+ // process remaining peeled loop
1504
+ for(Index k=peeled_kc; k<depth; k++)
1505
+ {
1506
+ RhsPacket B_0;
1507
+ EIGEN_GEBGP_ONESTEP(0);
1508
+ blB += RhsProgress;
1509
+ blA += 1*Traits::LhsProgress;
1510
+ }
1511
+ #undef EIGEN_GEBGP_ONESTEP
1512
+ ResPacket R0;
1513
+ ResPacket alphav = pset1<ResPacket>(alpha);
1514
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1515
+ traits.acc(C0, alphav, R0);
1516
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
1517
+ }
1518
+ }
1519
+ }
1520
+ //---------- Process remaining rows, 1 at once ----------
1521
+ if(peeled_mc1<rows)
1522
+ {
1523
+ // loop on each panel of the rhs
1524
+ for(Index j2=0; j2<packet_cols4; j2+=nr)
1525
+ {
1526
+ // loop on each row of the lhs (1*LhsProgress x depth)
1527
+ for(Index i=peeled_mc1; i<rows; i+=1)
1528
+ {
1529
+ const LhsScalar* blA = &blockA[i*strideA+offsetA];
1530
+ prefetch(&blA[0]);
1531
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1532
+
1533
+ // The following piece of code won't work for 512-bit registers
1534
+ // Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
1535
+ // as nr (which is currently 4) for the return type.
1536
+ const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
1537
+ if ((SwappedTraits::LhsProgress % 4) == 0 &&
1538
+ (SwappedTraits::LhsProgress <= 8) &&
1539
+ (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr))
1540
+ {
1541
+ SAccPacket C0, C1, C2, C3;
1542
+ straits.initAcc(C0);
1543
+ straits.initAcc(C1);
1544
+ straits.initAcc(C2);
1545
+ straits.initAcc(C3);
1546
+
1547
+ const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
1548
+ const Index endk = (depth/spk)*spk;
1549
+ const Index endk4 = (depth/(spk*4))*(spk*4);
1550
+
1551
+ Index k=0;
1552
+ for(; k<endk4; k+=4*spk)
1553
+ {
1554
+ SLhsPacket A0,A1;
1555
+ SRhsPacket B_0,B_1;
1556
+
1557
+ straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
1558
+ straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
1559
+
1560
+ straits.loadRhsQuad(blA+0*spk, B_0);
1561
+ straits.loadRhsQuad(blA+1*spk, B_1);
1562
+ straits.madd(A0,B_0,C0,B_0);
1563
+ straits.madd(A1,B_1,C1,B_1);
1564
+
1565
+ straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
1566
+ straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
1567
+ straits.loadRhsQuad(blA+2*spk, B_0);
1568
+ straits.loadRhsQuad(blA+3*spk, B_1);
1569
+ straits.madd(A0,B_0,C2,B_0);
1570
+ straits.madd(A1,B_1,C3,B_1);
1571
+
1572
+ blB += 4*SwappedTraits::LhsProgress;
1573
+ blA += 4*spk;
1574
+ }
1575
+ C0 = padd(padd(C0,C1),padd(C2,C3));
1576
+ for(; k<endk; k+=spk)
1577
+ {
1578
+ SLhsPacket A0;
1579
+ SRhsPacket B_0;
1580
+
1581
+ straits.loadLhsUnaligned(blB, A0);
1582
+ straits.loadRhsQuad(blA, B_0);
1583
+ straits.madd(A0,B_0,C0,B_0);
1584
+
1585
+ blB += SwappedTraits::LhsProgress;
1586
+ blA += spk;
1587
+ }
1588
+ if(SwappedTraits::LhsProgress==8)
1589
+ {
1590
+ // Special case where we have to first reduce the accumulation register C0
1591
+ typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
1592
+ typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
1593
+ typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
1594
+ typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
1595
+
1596
+ SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
1597
+ SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
1598
+
1599
+ if(depth-endk>0)
1600
+ {
1601
+ // We have to handle the last row of the rhs which corresponds to a half-packet
1602
+ SLhsPacketHalf a0;
1603
+ SRhsPacketHalf b0;
1604
+ straits.loadLhsUnaligned(blB, a0);
1605
+ straits.loadRhs(blA, b0);
1606
+ SAccPacketHalf c0 = predux_downto4(C0);
1607
+ straits.madd(a0,b0,c0,b0);
1608
+ straits.acc(c0, alphav, R);
1609
+ }
1610
+ else
1611
+ {
1612
+ straits.acc(predux_downto4(C0), alphav, R);
1613
+ }
1614
+ res.scatterPacket(i, j2, R);
1615
+ }
1616
+ else
1617
+ {
1618
+ SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
1619
+ SResPacket alphav = pset1<SResPacket>(alpha);
1620
+ straits.acc(C0, alphav, R);
1621
+ res.scatterPacket(i, j2, R);
1622
+ }
1623
+ }
1624
+ else // scalar path
1625
+ {
1626
+ // get a 1 x 4 res block as registers
1627
+ ResScalar C0(0), C1(0), C2(0), C3(0);
1628
+
1629
+ for(Index k=0; k<depth; k++)
1630
+ {
1631
+ LhsScalar A0;
1632
+ RhsScalar B_0, B_1;
1633
+
1634
+ A0 = blA[k];
1635
+
1636
+ B_0 = blB[0];
1637
+ B_1 = blB[1];
1638
+ CJMADD(cj,A0,B_0,C0, B_0);
1639
+ CJMADD(cj,A0,B_1,C1, B_1);
1640
+
1641
+ B_0 = blB[2];
1642
+ B_1 = blB[3];
1643
+ CJMADD(cj,A0,B_0,C2, B_0);
1644
+ CJMADD(cj,A0,B_1,C3, B_1);
1645
+
1646
+ blB += 4;
1647
+ }
1648
+ res(i, j2 + 0) += alpha * C0;
1649
+ res(i, j2 + 1) += alpha * C1;
1650
+ res(i, j2 + 2) += alpha * C2;
1651
+ res(i, j2 + 3) += alpha * C3;
1652
+ }
1653
+ }
1654
+ }
1655
+ // remaining columns
1656
+ for(Index j2=packet_cols4; j2<cols; j2++)
1657
+ {
1658
+ // loop on each row of the lhs (1*LhsProgress x depth)
1659
+ for(Index i=peeled_mc1; i<rows; i+=1)
1660
+ {
1661
+ const LhsScalar* blA = &blockA[i*strideA+offsetA];
1662
+ prefetch(&blA[0]);
1663
+ // gets a 1 x 1 res block as registers
1664
+ ResScalar C0(0);
1665
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1666
+ for(Index k=0; k<depth; k++)
1667
+ {
1668
+ LhsScalar A0 = blA[k];
1669
+ RhsScalar B_0 = blB[k];
1670
+ CJMADD(cj, A0, B_0, C0, B_0);
1671
+ }
1672
+ res(i, j2) += alpha * C0;
1673
+ }
1674
+ }
1675
+ }
1676
+ }
1677
+
1678
+
1679
+ #undef CJMADD
1680
+
1681
+ // pack a block of the lhs
1682
+ // The traversal is as follows (mr==4):
1683
+ // 0 4 8 12 ...
1684
+ // 1 5 9 13 ...
1685
+ // 2 6 10 14 ...
1686
+ // 3 7 11 15 ...
1687
+ //
1688
+ // 16 20 24 28 ...
1689
+ // 17 21 25 29 ...
1690
+ // 18 22 26 30 ...
1691
+ // 19 23 27 31 ...
1692
+ //
1693
+ // 32 33 34 35 ...
1694
+ // 36 37 38 39 ...
1695
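As a concrete illustration of this traversal, here is a toy standalone packer (assumed Pack1 == mr == 4, no PanelMode, a single full panel; a sketch of the layout only, not the routine below). It shows that within each mr-row panel, whole mr-row column slices are stored consecutively, column after column:

#include <cstdio>
#include <vector>

int main() {
  const int rows = 4, depth = 3, mr = 4;            // assumed toy sizes
  std::vector<int> lhs(rows * depth);
  std::vector<int> blockA;
  for (int k = 0; k < depth; ++k)                   // column-major source, values 0..11
    for (int i = 0; i < rows; ++i) lhs[i + k * rows] = i + k * rows;
  for (int i = 0; i < rows; i += mr)                // panel loop (one panel here)
    for (int k = 0; k < depth; ++k)                 // depth-major inside the panel
      for (int w = 0; w < mr; ++w) blockA.push_back(lhs[(i + w) + k * rows]);
  for (int v : blockA) std::printf("%d ", v);       // prints 0 1 2 3 4 5 ... 11
  std::printf("\n");
  return 0;
}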
+ template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1696
+ struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
1697
+ {
1698
+ typedef typename DataMapper::LinearMapper LinearMapper;
1699
+ EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
1700
+ };
1701
+
1702
+ template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1703
+ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
1704
+ ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
1705
+ {
1706
+ typedef typename packet_traits<Scalar>::type Packet;
1707
+ enum { PacketSize = packet_traits<Scalar>::size };
1708
+
1709
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
1710
+ EIGEN_UNUSED_VARIABLE(stride);
1711
+ EIGEN_UNUSED_VARIABLE(offset);
1712
+ eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
1713
+ eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
1714
+ conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
1715
+ Index count = 0;
1716
+
1717
+ const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
1718
+ const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
1719
+ const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
1720
+ const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
1721
+ : Pack2>1 ? (rows/Pack2)*Pack2 : 0;
1722
+
1723
+ Index i=0;
1724
+
1725
+ // Pack 3 packets
1726
+ if(Pack1>=3*PacketSize)
1727
+ {
1728
+ for(; i<peeled_mc3; i+=3*PacketSize)
1729
+ {
1730
+ if(PanelMode) count += (3*PacketSize) * offset;
1731
+
1732
+ for(Index k=0; k<depth; k++)
1733
+ {
1734
+ Packet A, B, C;
1735
+ A = lhs.loadPacket(i+0*PacketSize, k);
1736
+ B = lhs.loadPacket(i+1*PacketSize, k);
1737
+ C = lhs.loadPacket(i+2*PacketSize, k);
1738
+ pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
1739
+ pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
1740
+ pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
1741
+ }
1742
+ if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
1743
+ }
1744
+ }
1745
+ // Pack 2 packets
1746
+ if(Pack1>=2*PacketSize)
1747
+ {
1748
+ for(; i<peeled_mc2; i+=2*PacketSize)
1749
+ {
1750
+ if(PanelMode) count += (2*PacketSize) * offset;
1751
+
1752
+ for(Index k=0; k<depth; k++)
1753
+ {
1754
+ Packet A, B;
1755
+ A = lhs.loadPacket(i+0*PacketSize, k);
1756
+ B = lhs.loadPacket(i+1*PacketSize, k);
1757
+ pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
1758
+ pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
1759
+ }
1760
+ if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
1761
+ }
1762
+ }
1763
+ // Pack 1 packets
1764
+ if(Pack1>=1*PacketSize)
1765
+ {
1766
+ for(; i<peeled_mc1; i+=1*PacketSize)
1767
+ {
1768
+ if(PanelMode) count += (1*PacketSize) * offset;
1769
+
1770
+ for(Index k=0; k<depth; k++)
1771
+ {
1772
+ Packet A;
1773
+ A = lhs.loadPacket(i+0*PacketSize, k);
1774
+ pstore(blockA+count, cj.pconj(A));
1775
+ count+=PacketSize;
1776
+ }
1777
+ if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
1778
+ }
1779
+ }
1780
+ // Pack scalars
1781
+ if(Pack2<PacketSize && Pack2>1)
1782
+ {
1783
+ for(; i<peeled_mc0; i+=Pack2)
1784
+ {
1785
+ if(PanelMode) count += Pack2 * offset;
1786
+
1787
+ for(Index k=0; k<depth; k++)
1788
+ for(Index w=0; w<Pack2; w++)
1789
+ blockA[count++] = cj(lhs(i+w, k));
1790
+
1791
+ if(PanelMode) count += Pack2 * (stride-offset-depth);
1792
+ }
1793
+ }
1794
+ for(; i<rows; i++)
1795
+ {
1796
+ if(PanelMode) count += offset;
1797
+ for(Index k=0; k<depth; k++)
1798
+ blockA[count++] = cj(lhs(i, k));
1799
+ if(PanelMode) count += (stride-offset-depth);
1800
+ }
1801
+ }
1802
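A note on the PanelMode bookkeeping above, with assumed numbers: for stride == 10, offset == 2, depth == 6, each packed panel of width `pack` rows reserves pack*2 entries before its data (count += pack*offset) and pack*(10-2-6) == pack*2 entries after it (count += pack*(stride-offset-depth)), so consecutive panels stay stride*pack entries apart even though only depth*pack entries are written.

// Offset slots + written slots + trailing slots == stride slots, per packed row:
static_assert(2 + 6 + (10 - 2 - 6) == 10, "panels are laid out stride entries apart");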
+
1803
+ template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1804
+ struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
1805
+ {
1806
+ typedef typename DataMapper::LinearMapper LinearMapper;
1807
+ EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
1808
+ };
1809
+
1810
+ template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1811
+ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
1812
+ ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
1813
+ {
1814
+ typedef typename packet_traits<Scalar>::type Packet;
1815
+ enum { PacketSize = packet_traits<Scalar>::size };
1816
+
1817
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
1818
+ EIGEN_UNUSED_VARIABLE(stride);
1819
+ EIGEN_UNUSED_VARIABLE(offset);
1820
+ eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
1821
+ conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
1822
+ Index count = 0;
1823
+
1824
+ // const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
1825
+ // const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
1826
+ // const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
1827
+
1828
+ int pack = Pack1;
1829
+ Index i = 0;
1830
+ while(pack>0)
1831
+ {
1832
+ Index remaining_rows = rows-i;
1833
+ Index peeled_mc = i+(remaining_rows/pack)*pack;
1834
+ for(; i<peeled_mc; i+=pack)
1835
+ {
1836
+ if(PanelMode) count += pack * offset;
1837
+
1838
+ const Index peeled_k = (depth/PacketSize)*PacketSize;
1839
+ Index k=0;
1840
+ if(pack>=PacketSize)
1841
+ {
1842
+ for(; k<peeled_k; k+=PacketSize)
1843
+ {
1844
+ for (Index m = 0; m < pack; m += PacketSize)
1845
+ {
1846
+ PacketBlock<Packet> kernel;
1847
+ for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
1848
+ ptranspose(kernel);
1849
+ for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
1850
+ }
1851
+ count += PacketSize*pack;
1852
+ }
1853
+ }
1854
+ for(; k<depth; k++)
1855
+ {
1856
+ Index w=0;
1857
+ for(; w<pack-3; w+=4)
1858
+ {
1859
+ Scalar a(cj(lhs(i+w+0, k))),
1860
+ b(cj(lhs(i+w+1, k))),
1861
+ c(cj(lhs(i+w+2, k))),
1862
+ d(cj(lhs(i+w+3, k)));
1863
+ blockA[count++] = a;
1864
+ blockA[count++] = b;
1865
+ blockA[count++] = c;
1866
+ blockA[count++] = d;
1867
+ }
1868
+ if(pack%4)
1869
+ for(;w<pack;++w)
1870
+ blockA[count++] = cj(lhs(i+w, k));
1871
+ }
1872
+
1873
+ if(PanelMode) count += pack * (stride-offset-depth);
1874
+ }
1875
+
1876
+ pack -= PacketSize;
1877
+ if(pack<Pack2 && (pack+PacketSize)!=Pack2)
1878
+ pack = Pack2;
1879
+ }
1880
+
1881
+ for(; i<rows; i++)
1882
+ {
1883
+ if(PanelMode) count += offset;
1884
+ for(Index k=0; k<depth; k++)
1885
+ blockA[count++] = cj(lhs(i, k));
1886
+ if(PanelMode) count += (stride-offset-depth);
1887
+ }
1888
+ }
1889
+
1890
+ // copy a complete panel of the rhs
1891
+ // this version is optimized for column major matrices
1892
+ // The traversal order is as follows (nr==4):
1893
+ // 0 1 2 3 12 13 14 15 24 27
1894
+ // 4 5 6 7 16 17 18 19 25 28
1895
+ // 8 9 10 11 20 21 22 23 26 29
1896
+ // . . . . . . . . . .
1897
+ template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
1898
+ struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
1899
+ {
1900
+ typedef typename packet_traits<Scalar>::type Packet;
1901
+ typedef typename DataMapper::LinearMapper LinearMapper;
1902
+ enum { PacketSize = packet_traits<Scalar>::size };
1903
+ EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
1904
+ };
1905
+
1906
+ template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
1907
+ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
1908
+ ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
1909
+ {
1910
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
1911
+ EIGEN_UNUSED_VARIABLE(stride);
1912
+ EIGEN_UNUSED_VARIABLE(offset);
1913
+ eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
1914
+ conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
1915
+ Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
1916
+ Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
1917
+ Index count = 0;
1918
+ const Index peeled_k = (depth/PacketSize)*PacketSize;
1919
+ // if(nr>=8)
1920
+ // {
1921
+ // for(Index j2=0; j2<packet_cols8; j2+=8)
1922
+ // {
1923
+ // // skip what we have before
1924
+ // if(PanelMode) count += 8 * offset;
1925
+ // const Scalar* b0 = &rhs[(j2+0)*rhsStride];
1926
+ // const Scalar* b1 = &rhs[(j2+1)*rhsStride];
1927
+ // const Scalar* b2 = &rhs[(j2+2)*rhsStride];
1928
+ // const Scalar* b3 = &rhs[(j2+3)*rhsStride];
1929
+ // const Scalar* b4 = &rhs[(j2+4)*rhsStride];
1930
+ // const Scalar* b5 = &rhs[(j2+5)*rhsStride];
1931
+ // const Scalar* b6 = &rhs[(j2+6)*rhsStride];
1932
+ // const Scalar* b7 = &rhs[(j2+7)*rhsStride];
1933
+ // Index k=0;
1934
+ // if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
1935
+ // {
1936
+ // for(; k<peeled_k; k+=PacketSize) {
1937
+ // PacketBlock<Packet> kernel;
1938
+ // for (int p = 0; p < PacketSize; ++p) {
1939
+ // kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
1940
+ // }
1941
+ // ptranspose(kernel);
1942
+ // for (int p = 0; p < PacketSize; ++p) {
1943
+ // pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
1944
+ // count+=PacketSize;
1945
+ // }
1946
+ // }
1947
+ // }
1948
+ // for(; k<depth; k++)
1949
+ // {
1950
+ // blockB[count+0] = cj(b0[k]);
1951
+ // blockB[count+1] = cj(b1[k]);
1952
+ // blockB[count+2] = cj(b2[k]);
1953
+ // blockB[count+3] = cj(b3[k]);
1954
+ // blockB[count+4] = cj(b4[k]);
1955
+ // blockB[count+5] = cj(b5[k]);
1956
+ // blockB[count+6] = cj(b6[k]);
1957
+ // blockB[count+7] = cj(b7[k]);
1958
+ // count += 8;
1959
+ // }
1960
+ // // skip what we have after
1961
+ // if(PanelMode) count += 8 * (stride-offset-depth);
1962
+ // }
1963
+ // }
1964
+
1965
+ if(nr>=4)
1966
+ {
1967
+ for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
1968
+ {
1969
+ // skip what we have before
1970
+ if(PanelMode) count += 4 * offset;
1971
+ const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
1972
+ const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
1973
+ const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
1974
+ const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
1975
+
1976
+ Index k=0;
1977
+ if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
1978
+ {
1979
+ for(; k<peeled_k; k+=PacketSize) {
1980
+ PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
1981
+ kernel.packet[0] = dm0.loadPacket(k);
1982
+ kernel.packet[1%PacketSize] = dm1.loadPacket(k);
1983
+ kernel.packet[2%PacketSize] = dm2.loadPacket(k);
1984
+ kernel.packet[3%PacketSize] = dm3.loadPacket(k);
1985
+ ptranspose(kernel);
1986
+ pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
1987
+ pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
1988
+ pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
1989
+ pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
1990
+ count+=4*PacketSize;
1991
+ }
1992
+ }
1993
+ for(; k<depth; k++)
1994
+ {
1995
+ blockB[count+0] = cj(dm0(k));
1996
+ blockB[count+1] = cj(dm1(k));
1997
+ blockB[count+2] = cj(dm2(k));
1998
+ blockB[count+3] = cj(dm3(k));
1999
+ count += 4;
2000
+ }
2001
+ // skip what we have after
2002
+ if(PanelMode) count += 4 * (stride-offset-depth);
2003
+ }
2004
+ }
2005
+
2006
+ // copy the remaining columns one at a time (nr==1)
2007
+ for(Index j2=packet_cols4; j2<cols; ++j2)
2008
+ {
2009
+ if(PanelMode) count += offset;
2010
+ const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
2011
+ for(Index k=0; k<depth; k++)
2012
+ {
2013
+ blockB[count] = cj(dm0(k));
2014
+ count += 1;
2015
+ }
2016
+ if(PanelMode) count += (stride-offset-depth);
2017
+ }
2018
+ }
2019
+
2020
+ // this version is optimized for row major matrices
2021
+ template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2022
+ struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2023
+ {
2024
+ typedef typename packet_traits<Scalar>::type Packet;
2025
+ typedef typename DataMapper::LinearMapper LinearMapper;
2026
+ enum { PacketSize = packet_traits<Scalar>::size };
2027
+ EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2028
+ };
2029
+
2030
+ template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2031
+ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2032
+ ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2033
+ {
2034
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
2035
+ EIGEN_UNUSED_VARIABLE(stride);
2036
+ EIGEN_UNUSED_VARIABLE(offset);
2037
+ eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2038
+ conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2039
+ Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2040
+ Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2041
+ Index count = 0;
2042
+
2043
+ // if(nr>=8)
2044
+ // {
2045
+ // for(Index j2=0; j2<packet_cols8; j2+=8)
2046
+ // {
2047
+ // // skip what we have before
2048
+ // if(PanelMode) count += 8 * offset;
2049
+ // for(Index k=0; k<depth; k++)
2050
+ // {
2051
+ // if (PacketSize==8) {
2052
+ // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2053
+ // pstoreu(blockB+count, cj.pconj(A));
2054
+ // } else if (PacketSize==4) {
2055
+ // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2056
+ // Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
2057
+ // pstoreu(blockB+count, cj.pconj(A));
2058
+ // pstoreu(blockB+count+PacketSize, cj.pconj(B));
2059
+ // } else {
2060
+ // const Scalar* b0 = &rhs[k*rhsStride + j2];
2061
+ // blockB[count+0] = cj(b0[0]);
2062
+ // blockB[count+1] = cj(b0[1]);
2063
+ // blockB[count+2] = cj(b0[2]);
2064
+ // blockB[count+3] = cj(b0[3]);
2065
+ // blockB[count+4] = cj(b0[4]);
2066
+ // blockB[count+5] = cj(b0[5]);
2067
+ // blockB[count+6] = cj(b0[6]);
2068
+ // blockB[count+7] = cj(b0[7]);
2069
+ // }
2070
+ // count += 8;
2071
+ // }
2072
+ // // skip what we have after
2073
+ // if(PanelMode) count += 8 * (stride-offset-depth);
2074
+ // }
2075
+ // }
2076
+ if(nr>=4)
2077
+ {
2078
+ for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2079
+ {
2080
+ // skip what we have before
2081
+ if(PanelMode) count += 4 * offset;
2082
+ for(Index k=0; k<depth; k++)
2083
+ {
2084
+ if (PacketSize==4) {
2085
+ Packet A = rhs.loadPacket(k, j2);
2086
+ pstoreu(blockB+count, cj.pconj(A));
2087
+ count += PacketSize;
2088
+ } else {
2089
+ const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
2090
+ blockB[count+0] = cj(dm0(0));
2091
+ blockB[count+1] = cj(dm0(1));
2092
+ blockB[count+2] = cj(dm0(2));
2093
+ blockB[count+3] = cj(dm0(3));
2094
+ count += 4;
2095
+ }
2096
+ }
2097
+ // skip what we have after
2098
+ if(PanelMode) count += 4 * (stride-offset-depth);
2099
+ }
2100
+ }
2101
+ // copy the remaining columns one at a time (nr==1)
2102
+ for(Index j2=packet_cols4; j2<cols; ++j2)
2103
+ {
2104
+ if(PanelMode) count += offset;
2105
+ for(Index k=0; k<depth; k++)
2106
+ {
2107
+ blockB[count] = cj(rhs(k, j2));
2108
+ count += 1;
2109
+ }
2110
+ if(PanelMode) count += stride-offset-depth;
2111
+ }
2112
+ }
2113
+
2114
+ } // end namespace internal
2115
+
2116
+ /** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
2117
+ * \sa setCpuCacheSizes */
2118
+ inline std::ptrdiff_t l1CacheSize()
2119
+ {
2120
+ std::ptrdiff_t l1, l2, l3;
2121
+ internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2122
+ return l1;
2123
+ }
2124
+
2125
+ /** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
2126
+ * \sa setCpuCacheSizes */
2127
+ inline std::ptrdiff_t l2CacheSize()
2128
+ {
2129
+ std::ptrdiff_t l1, l2, l3;
2130
+ internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2131
+ return l2;
2132
+ }
2133
+
2134
+ /** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
2135
+ * \sa setCpuCacheSizes */
2137
+ inline std::ptrdiff_t l3CacheSize()
2138
+ {
2139
+ std::ptrdiff_t l1, l2, l3;
2140
+ internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2141
+ return l3;
2142
+ }
2143
+
2144
+ /** Set the cpu L1, L2, and L3 cache sizes (in bytes).
2145
+ * These values are used to adjust the size of the blocks
2146
+ * for the algorithms working on blocks.
2147
+ *
2148
+ * \sa computeProductBlockingSizes */
2149
+ inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
2150
+ {
2151
+ internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
2152
+ }
2153
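Since these four functions are the public knobs for the blocking heuristics, a minimal usage sketch (standard Eigen API; the override values are placeholders for a hypothetical machine):

#include <Eigen/Dense>
#include <cstdio>

int main() {
  // Query the cache sizes Eigen currently assumes for its blocking heuristics.
  std::printf("L1=%td L2=%td L3=%td\n",
              Eigen::l1CacheSize(), Eigen::l2CacheSize(), Eigen::l3CacheSize());
  // Override them, e.g. when the runtime detection guesses wrong.
  Eigen::setCpuCacheSizes(32 * 1024, 512 * 1024, 8 * 1024 * 1024);
  return 0;
}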
+
2154
+ } // end namespace Eigen
2155
+
2156
+ #endif // EIGEN_GENERAL_BLOCK_PANEL_H