tomoto 0.2.3 → 0.3.0

Files changed (347)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/README.md +8 -10
  4. data/ext/tomoto/extconf.rb +6 -2
  5. data/ext/tomoto/{ext.cpp → tomoto.cpp} +1 -1
  6. data/lib/tomoto/version.rb +1 -1
  7. data/lib/tomoto.rb +5 -1
  8. data/vendor/EigenRand/EigenRand/Core.h +10 -10
  9. data/vendor/EigenRand/EigenRand/Dists/Basic.h +208 -9
  10. data/vendor/EigenRand/EigenRand/Dists/Discrete.h +52 -31
  11. data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +9 -8
  12. data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +28 -21
  13. data/vendor/EigenRand/EigenRand/EigenRand +11 -6
  14. data/vendor/EigenRand/EigenRand/Macro.h +13 -7
  15. data/vendor/EigenRand/EigenRand/MorePacketMath.h +348 -740
  16. data/vendor/EigenRand/EigenRand/MvDists/Multinomial.h +5 -3
  17. data/vendor/EigenRand/EigenRand/MvDists/MvNormal.h +9 -3
  18. data/vendor/EigenRand/EigenRand/PacketFilter.h +11 -253
  19. data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +21 -47
  20. data/vendor/EigenRand/EigenRand/RandUtils.h +50 -344
  21. data/vendor/EigenRand/EigenRand/arch/AVX/MorePacketMath.h +619 -0
  22. data/vendor/EigenRand/EigenRand/arch/AVX/PacketFilter.h +149 -0
  23. data/vendor/EigenRand/EigenRand/arch/AVX/RandUtils.h +228 -0
  24. data/vendor/EigenRand/EigenRand/arch/NEON/MorePacketMath.h +473 -0
  25. data/vendor/EigenRand/EigenRand/arch/NEON/PacketFilter.h +142 -0
  26. data/vendor/EigenRand/EigenRand/arch/NEON/RandUtils.h +126 -0
  27. data/vendor/EigenRand/EigenRand/arch/SSE/MorePacketMath.h +501 -0
  28. data/vendor/EigenRand/EigenRand/arch/SSE/PacketFilter.h +133 -0
  29. data/vendor/EigenRand/EigenRand/arch/SSE/RandUtils.h +120 -0
  30. data/vendor/EigenRand/EigenRand/doc.h +24 -12
  31. data/vendor/EigenRand/README.md +57 -4
  32. data/vendor/eigen/COPYING.APACHE +203 -0
  33. data/vendor/eigen/COPYING.BSD +1 -1
  34. data/vendor/eigen/COPYING.MINPACK +51 -52
  35. data/vendor/eigen/Eigen/Cholesky +0 -1
  36. data/vendor/eigen/Eigen/Core +112 -265
  37. data/vendor/eigen/Eigen/Eigenvalues +2 -3
  38. data/vendor/eigen/Eigen/Geometry +5 -8
  39. data/vendor/eigen/Eigen/Householder +0 -1
  40. data/vendor/eigen/Eigen/Jacobi +0 -1
  41. data/vendor/eigen/Eigen/KLUSupport +41 -0
  42. data/vendor/eigen/Eigen/LU +2 -5
  43. data/vendor/eigen/Eigen/OrderingMethods +0 -3
  44. data/vendor/eigen/Eigen/PaStiXSupport +1 -0
  45. data/vendor/eigen/Eigen/PardisoSupport +0 -0
  46. data/vendor/eigen/Eigen/QR +2 -3
  47. data/vendor/eigen/Eigen/QtAlignedMalloc +0 -1
  48. data/vendor/eigen/Eigen/SVD +0 -1
  49. data/vendor/eigen/Eigen/Sparse +0 -2
  50. data/vendor/eigen/Eigen/SparseCholesky +0 -8
  51. data/vendor/eigen/Eigen/SparseLU +4 -0
  52. data/vendor/eigen/Eigen/SparseQR +0 -1
  53. data/vendor/eigen/Eigen/src/Cholesky/LDLT.h +42 -27
  54. data/vendor/eigen/Eigen/src/Cholesky/LLT.h +39 -23
  55. data/vendor/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +90 -47
  56. data/vendor/eigen/Eigen/src/Core/ArithmeticSequence.h +413 -0
  57. data/vendor/eigen/Eigen/src/Core/Array.h +99 -11
  58. data/vendor/eigen/Eigen/src/Core/ArrayBase.h +3 -3
  59. data/vendor/eigen/Eigen/src/Core/ArrayWrapper.h +21 -21
  60. data/vendor/eigen/Eigen/src/Core/Assign.h +1 -1
  61. data/vendor/eigen/Eigen/src/Core/AssignEvaluator.h +125 -50
  62. data/vendor/eigen/Eigen/src/Core/Assign_MKL.h +10 -10
  63. data/vendor/eigen/Eigen/src/Core/BandMatrix.h +16 -16
  64. data/vendor/eigen/Eigen/src/Core/Block.h +56 -60
  65. data/vendor/eigen/Eigen/src/Core/BooleanRedux.h +29 -31
  66. data/vendor/eigen/Eigen/src/Core/CommaInitializer.h +7 -3
  67. data/vendor/eigen/Eigen/src/Core/CoreEvaluators.h +325 -272
  68. data/vendor/eigen/Eigen/src/Core/CoreIterators.h +5 -0
  69. data/vendor/eigen/Eigen/src/Core/CwiseBinaryOp.h +21 -22
  70. data/vendor/eigen/Eigen/src/Core/CwiseNullaryOp.h +153 -18
  71. data/vendor/eigen/Eigen/src/Core/CwiseUnaryOp.h +6 -6
  72. data/vendor/eigen/Eigen/src/Core/CwiseUnaryView.h +14 -10
  73. data/vendor/eigen/Eigen/src/Core/DenseBase.h +132 -42
  74. data/vendor/eigen/Eigen/src/Core/DenseCoeffsBase.h +25 -21
  75. data/vendor/eigen/Eigen/src/Core/DenseStorage.h +153 -71
  76. data/vendor/eigen/Eigen/src/Core/Diagonal.h +21 -23
  77. data/vendor/eigen/Eigen/src/Core/DiagonalMatrix.h +50 -2
  78. data/vendor/eigen/Eigen/src/Core/DiagonalProduct.h +1 -1
  79. data/vendor/eigen/Eigen/src/Core/Dot.h +10 -10
  80. data/vendor/eigen/Eigen/src/Core/EigenBase.h +10 -9
  81. data/vendor/eigen/Eigen/src/Core/ForceAlignedAccess.h +8 -4
  82. data/vendor/eigen/Eigen/src/Core/Fuzzy.h +3 -3
  83. data/vendor/eigen/Eigen/src/Core/GeneralProduct.h +20 -10
  84. data/vendor/eigen/Eigen/src/Core/GenericPacketMath.h +599 -152
  85. data/vendor/eigen/Eigen/src/Core/GlobalFunctions.h +40 -33
  86. data/vendor/eigen/Eigen/src/Core/IO.h +40 -7
  87. data/vendor/eigen/Eigen/src/Core/IndexedView.h +237 -0
  88. data/vendor/eigen/Eigen/src/Core/Inverse.h +9 -10
  89. data/vendor/eigen/Eigen/src/Core/Map.h +7 -7
  90. data/vendor/eigen/Eigen/src/Core/MapBase.h +10 -3
  91. data/vendor/eigen/Eigen/src/Core/MathFunctions.h +767 -125
  92. data/vendor/eigen/Eigen/src/Core/MathFunctionsImpl.h +118 -19
  93. data/vendor/eigen/Eigen/src/Core/Matrix.h +131 -25
  94. data/vendor/eigen/Eigen/src/Core/MatrixBase.h +21 -3
  95. data/vendor/eigen/Eigen/src/Core/NestByValue.h +25 -50
  96. data/vendor/eigen/Eigen/src/Core/NoAlias.h +4 -3
  97. data/vendor/eigen/Eigen/src/Core/NumTraits.h +107 -20
  98. data/vendor/eigen/Eigen/src/Core/PartialReduxEvaluator.h +232 -0
  99. data/vendor/eigen/Eigen/src/Core/PermutationMatrix.h +3 -31
  100. data/vendor/eigen/Eigen/src/Core/PlainObjectBase.h +152 -59
  101. data/vendor/eigen/Eigen/src/Core/Product.h +30 -25
  102. data/vendor/eigen/Eigen/src/Core/ProductEvaluators.h +192 -125
  103. data/vendor/eigen/Eigen/src/Core/Random.h +37 -1
  104. data/vendor/eigen/Eigen/src/Core/Redux.h +180 -170
  105. data/vendor/eigen/Eigen/src/Core/Ref.h +121 -23
  106. data/vendor/eigen/Eigen/src/Core/Replicate.h +8 -8
  107. data/vendor/eigen/Eigen/src/Core/Reshaped.h +454 -0
  108. data/vendor/eigen/Eigen/src/Core/ReturnByValue.h +7 -5
  109. data/vendor/eigen/Eigen/src/Core/Reverse.h +18 -12
  110. data/vendor/eigen/Eigen/src/Core/Select.h +8 -6
  111. data/vendor/eigen/Eigen/src/Core/SelfAdjointView.h +33 -20
  112. data/vendor/eigen/Eigen/src/Core/Solve.h +14 -14
  113. data/vendor/eigen/Eigen/src/Core/SolveTriangular.h +16 -16
  114. data/vendor/eigen/Eigen/src/Core/SolverBase.h +41 -3
  115. data/vendor/eigen/Eigen/src/Core/StableNorm.h +100 -70
  116. data/vendor/eigen/Eigen/src/Core/StlIterators.h +463 -0
  117. data/vendor/eigen/Eigen/src/Core/Stride.h +9 -4
  118. data/vendor/eigen/Eigen/src/Core/Swap.h +5 -4
  119. data/vendor/eigen/Eigen/src/Core/Transpose.h +88 -27
  120. data/vendor/eigen/Eigen/src/Core/Transpositions.h +26 -47
  121. data/vendor/eigen/Eigen/src/Core/TriangularMatrix.h +93 -75
  122. data/vendor/eigen/Eigen/src/Core/VectorBlock.h +5 -5
  123. data/vendor/eigen/Eigen/src/Core/VectorwiseOp.h +159 -70
  124. data/vendor/eigen/Eigen/src/Core/Visitor.h +137 -29
  125. data/vendor/eigen/Eigen/src/Core/arch/AVX/Complex.h +50 -129
  126. data/vendor/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +126 -337
  127. data/vendor/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +1092 -155
  128. data/vendor/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +65 -1
  129. data/vendor/eigen/Eigen/src/Core/arch/AVX512/Complex.h +422 -0
  130. data/vendor/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +207 -236
  131. data/vendor/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1482 -495
  132. data/vendor/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +89 -0
  133. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +152 -165
  134. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +19 -251
  135. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2937 -0
  136. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +221 -0
  137. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +629 -0
  138. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2042 -392
  139. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Complex.h +235 -80
  140. data/vendor/eigen/Eigen/src/Core/arch/Default/BFloat16.h +700 -0
  141. data/vendor/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +102 -14
  142. data/vendor/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1649 -0
  143. data/vendor/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +110 -0
  144. data/vendor/eigen/Eigen/src/Core/arch/Default/Half.h +942 -0
  145. data/vendor/eigen/Eigen/src/Core/arch/Default/Settings.h +1 -1
  146. data/vendor/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +120 -0
  147. data/vendor/eigen/Eigen/src/Core/arch/{CUDA → GPU}/MathFunctions.h +16 -4
  148. data/vendor/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1685 -0
  149. data/vendor/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +80 -0
  150. data/vendor/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  151. data/vendor/eigen/Eigen/src/Core/arch/MSA/Complex.h +648 -0
  152. data/vendor/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +387 -0
  153. data/vendor/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1233 -0
  154. data/vendor/eigen/Eigen/src/Core/arch/NEON/Complex.h +313 -219
  155. data/vendor/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +183 -0
  156. data/vendor/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +54 -70
  157. data/vendor/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4376 -549
  158. data/vendor/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1419 -0
  159. data/vendor/eigen/Eigen/src/Core/arch/SSE/Complex.h +59 -179
  160. data/vendor/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +65 -428
  161. data/vendor/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +893 -283
  162. data/vendor/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +65 -0
  163. data/vendor/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +44 -0
  164. data/vendor/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +752 -0
  165. data/vendor/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +49 -0
  166. data/vendor/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +232 -0
  167. data/vendor/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +301 -0
  168. data/vendor/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +670 -0
  169. data/vendor/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +694 -0
  170. data/vendor/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +85 -0
  171. data/vendor/eigen/Eigen/src/Core/arch/ZVector/Complex.h +212 -183
  172. data/vendor/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +101 -5
  173. data/vendor/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +510 -395
  174. data/vendor/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +11 -2
  175. data/vendor/eigen/Eigen/src/Core/functors/BinaryFunctors.h +112 -46
  176. data/vendor/eigen/Eigen/src/Core/functors/NullaryFunctors.h +31 -30
  177. data/vendor/eigen/Eigen/src/Core/functors/StlFunctors.h +32 -2
  178. data/vendor/eigen/Eigen/src/Core/functors/UnaryFunctors.h +355 -16
  179. data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1075 -586
  180. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +49 -24
  181. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +41 -35
  182. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +6 -6
  183. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +4 -2
  184. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +382 -483
  185. data/vendor/eigen/Eigen/src/Core/products/Parallelizer.h +22 -5
  186. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +53 -30
  187. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +16 -8
  188. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +8 -6
  189. data/vendor/eigen/Eigen/src/Core/products/SelfadjointProduct.h +4 -4
  190. data/vendor/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +5 -4
  191. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +33 -27
  192. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +14 -12
  193. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +36 -34
  194. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +8 -4
  195. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverVector.h +13 -10
  196. data/vendor/eigen/Eigen/src/Core/util/BlasUtil.h +304 -119
  197. data/vendor/eigen/Eigen/src/Core/util/ConfigureVectorization.h +512 -0
  198. data/vendor/eigen/Eigen/src/Core/util/Constants.h +25 -9
  199. data/vendor/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +26 -3
  200. data/vendor/eigen/Eigen/src/Core/util/ForwardDeclarations.h +29 -9
  201. data/vendor/eigen/Eigen/src/Core/util/IndexedViewHelper.h +186 -0
  202. data/vendor/eigen/Eigen/src/Core/util/IntegralConstant.h +272 -0
  203. data/vendor/eigen/Eigen/src/Core/util/MKL_support.h +8 -1
  204. data/vendor/eigen/Eigen/src/Core/util/Macros.h +709 -246
  205. data/vendor/eigen/Eigen/src/Core/util/Memory.h +222 -52
  206. data/vendor/eigen/Eigen/src/Core/util/Meta.h +355 -77
  207. data/vendor/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +5 -1
  208. data/vendor/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  209. data/vendor/eigen/Eigen/src/Core/util/StaticAssert.h +8 -5
  210. data/vendor/eigen/Eigen/src/Core/util/SymbolicIndex.h +293 -0
  211. data/vendor/eigen/Eigen/src/Core/util/XprHelper.h +65 -30
  212. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +1 -1
  213. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +7 -4
  214. data/vendor/eigen/Eigen/src/Eigenvalues/EigenSolver.h +2 -2
  215. data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +1 -1
  216. data/vendor/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +2 -2
  217. data/vendor/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +2 -2
  218. data/vendor/eigen/Eigen/src/Eigenvalues/RealQZ.h +9 -6
  219. data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur.h +21 -9
  220. data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +77 -43
  221. data/vendor/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +20 -15
  222. data/vendor/eigen/Eigen/src/Geometry/AlignedBox.h +99 -5
  223. data/vendor/eigen/Eigen/src/Geometry/AngleAxis.h +4 -4
  224. data/vendor/eigen/Eigen/src/Geometry/EulerAngles.h +3 -3
  225. data/vendor/eigen/Eigen/src/Geometry/Homogeneous.h +15 -11
  226. data/vendor/eigen/Eigen/src/Geometry/Hyperplane.h +1 -1
  227. data/vendor/eigen/Eigen/src/Geometry/OrthoMethods.h +3 -2
  228. data/vendor/eigen/Eigen/src/Geometry/ParametrizedLine.h +39 -2
  229. data/vendor/eigen/Eigen/src/Geometry/Quaternion.h +70 -14
  230. data/vendor/eigen/Eigen/src/Geometry/Rotation2D.h +3 -3
  231. data/vendor/eigen/Eigen/src/Geometry/Scaling.h +23 -5
  232. data/vendor/eigen/Eigen/src/Geometry/Transform.h +88 -67
  233. data/vendor/eigen/Eigen/src/Geometry/Translation.h +6 -12
  234. data/vendor/eigen/Eigen/src/Geometry/Umeyama.h +1 -1
  235. data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +168 -0
  236. data/vendor/eigen/Eigen/src/Householder/BlockHouseholder.h +9 -2
  237. data/vendor/eigen/Eigen/src/Householder/Householder.h +8 -4
  238. data/vendor/eigen/Eigen/src/Householder/HouseholderSequence.h +123 -48
  239. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +15 -15
  240. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +7 -23
  241. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +5 -22
  242. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +41 -47
  243. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +51 -60
  244. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +70 -20
  245. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +2 -20
  246. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +11 -9
  247. data/vendor/eigen/Eigen/src/Jacobi/Jacobi.h +31 -10
  248. data/vendor/eigen/Eigen/src/KLUSupport/KLUSupport.h +358 -0
  249. data/vendor/eigen/Eigen/src/LU/Determinant.h +35 -19
  250. data/vendor/eigen/Eigen/src/LU/FullPivLU.h +29 -43
  251. data/vendor/eigen/Eigen/src/LU/InverseImpl.h +25 -8
  252. data/vendor/eigen/Eigen/src/LU/PartialPivLU.h +71 -58
  253. data/vendor/eigen/Eigen/src/LU/arch/InverseSize4.h +351 -0
  254. data/vendor/eigen/Eigen/src/OrderingMethods/Amd.h +7 -17
  255. data/vendor/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +297 -277
  256. data/vendor/eigen/Eigen/src/OrderingMethods/Ordering.h +6 -10
  257. data/vendor/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +1 -1
  258. data/vendor/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +9 -7
  259. data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR.h +41 -20
  260. data/vendor/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +100 -27
  261. data/vendor/eigen/Eigen/src/QR/FullPivHouseholderQR.h +59 -22
  262. data/vendor/eigen/Eigen/src/QR/HouseholderQR.h +48 -23
  263. data/vendor/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +25 -3
  264. data/vendor/eigen/Eigen/src/SVD/BDCSVD.h +183 -63
  265. data/vendor/eigen/Eigen/src/SVD/JacobiSVD.h +22 -14
  266. data/vendor/eigen/Eigen/src/SVD/SVDBase.h +83 -22
  267. data/vendor/eigen/Eigen/src/SVD/UpperBidiagonalization.h +3 -3
  268. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +17 -9
  269. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +12 -37
  270. data/vendor/eigen/Eigen/src/SparseCore/AmbiVector.h +3 -2
  271. data/vendor/eigen/Eigen/src/SparseCore/CompressedStorage.h +16 -0
  272. data/vendor/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +6 -6
  273. data/vendor/eigen/Eigen/src/SparseCore/SparseAssign.h +81 -27
  274. data/vendor/eigen/Eigen/src/SparseCore/SparseBlock.h +25 -57
  275. data/vendor/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +40 -11
  276. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +11 -15
  277. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +4 -2
  278. data/vendor/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +30 -8
  279. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrix.h +126 -11
  280. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +5 -12
  281. data/vendor/eigen/Eigen/src/SparseCore/SparseProduct.h +13 -1
  282. data/vendor/eigen/Eigen/src/SparseCore/SparseRef.h +7 -7
  283. data/vendor/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +5 -2
  284. data/vendor/eigen/Eigen/src/SparseCore/SparseUtil.h +8 -0
  285. data/vendor/eigen/Eigen/src/SparseCore/SparseVector.h +1 -1
  286. data/vendor/eigen/Eigen/src/SparseCore/SparseView.h +1 -0
  287. data/vendor/eigen/Eigen/src/SparseLU/SparseLU.h +162 -12
  288. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +1 -1
  289. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +76 -2
  290. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +2 -2
  291. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +1 -1
  292. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +1 -1
  293. data/vendor/eigen/Eigen/src/SparseQR/SparseQR.h +19 -6
  294. data/vendor/eigen/Eigen/src/StlSupport/StdDeque.h +2 -12
  295. data/vendor/eigen/Eigen/src/StlSupport/StdList.h +2 -2
  296. data/vendor/eigen/Eigen/src/StlSupport/StdVector.h +2 -2
  297. data/vendor/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +6 -8
  298. data/vendor/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +175 -39
  299. data/vendor/eigen/Eigen/src/misc/lapacke.h +5 -4
  300. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +28 -2
  301. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +155 -11
  302. data/vendor/eigen/Eigen/src/plugins/BlockMethods.h +626 -242
  303. data/vendor/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +14 -0
  304. data/vendor/eigen/Eigen/src/plugins/IndexedViewMethods.h +262 -0
  305. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +4 -4
  306. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +10 -0
  307. data/vendor/eigen/Eigen/src/plugins/ReshapedMethods.h +149 -0
  308. data/vendor/eigen/README.md +2 -0
  309. data/vendor/eigen/bench/btl/README +1 -1
  310. data/vendor/eigen/bench/tensors/README +6 -7
  311. data/vendor/eigen/ci/README.md +56 -0
  312. data/vendor/eigen/demos/mix_eigen_and_c/README +1 -1
  313. data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +213 -158
  314. data/vendor/eigen/unsupported/README.txt +1 -1
  315. data/vendor/tomotopy/README.kr.rst +21 -0
  316. data/vendor/tomotopy/README.rst +20 -0
  317. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +2 -2
  318. data/vendor/tomotopy/src/Labeling/Phraser.hpp +1 -1
  319. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +2 -1
  320. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +2 -1
  321. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +1 -1
  322. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +2 -2
  323. data/vendor/tomotopy/src/TopicModel/HDP.h +1 -0
  324. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +53 -2
  325. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +1 -1
  326. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +1 -0
  327. data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +2 -2
  328. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +16 -5
  329. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +1 -0
  330. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +1 -0
  331. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +1 -0
  332. data/vendor/tomotopy/src/TopicModel/PT.h +3 -1
  333. data/vendor/tomotopy/src/TopicModel/PTModel.hpp +31 -1
  334. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +2 -2
  335. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +7 -5
  336. data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +36 -1
  337. data/vendor/tomotopy/src/Utils/exception.h +6 -0
  338. data/vendor/tomotopy/src/Utils/sample.hpp +14 -12
  339. data/vendor/tomotopy/src/Utils/sse_gamma.h +0 -3
  340. metadata +60 -14
  341. data/vendor/eigen/Eigen/CMakeLists.txt +0 -19
  342. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -674
  343. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
  344. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
  345. data/vendor/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
  346. data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
  347. data/vendor/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
@@ -15,7 +15,13 @@ namespace Eigen {
 
 namespace internal {
 
-template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
+enum GEBPPacketSizeType {
+  GEBPPacketFull = 0,
+  GEBPPacketHalf,
+  GEBPPacketQuarter
+};
+
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target, int _PacketSize=GEBPPacketFull>
 class gebp_traits;
 
 
@@ -25,16 +31,42 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff
   return a<=0 ? b : a;
 }
 
+#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
+#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
+#else
+#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
+#endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
+
+#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
+#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
+#else
+#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
+#endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
+
+#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
+#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
+#else
+#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
+#endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
+
 #if EIGEN_ARCH_i386_OR_x86_64
-const std::ptrdiff_t defaultL1CacheSize = 32*1024;
-const std::ptrdiff_t defaultL2CacheSize = 256*1024;
-const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024);
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024);
+#elif EIGEN_ARCH_PPC
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024);
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024);
 #else
-const std::ptrdiff_t defaultL1CacheSize = 16*1024;
-const std::ptrdiff_t defaultL2CacheSize = 512*1024;
-const std::ptrdiff_t defaultL3CacheSize = 512*1024;
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024);
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024);
 #endif
 
+#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
+#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
+#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
+
 /** \internal */
 struct CacheSizes {
   CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
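
Editor's note on the hunk above: the EIGEN_SET_DEFAULT_L*_CACHE_SIZE shims make the hard-coded cache defaults overridable at build time — defining EIGEN_DEFAULT_L1_CACHE_SIZE (or the L2/L3 variants) before any Eigen header replaces the per-architecture constant. A minimal usage sketch; the sizes shown are arbitrary examples, not recommendations:

    // Must come before the first Eigen include; illustrative values only.
    #define EIGEN_DEFAULT_L1_CACHE_SIZE (48*1024)
    #define EIGEN_DEFAULT_L3_CACHE_SIZE (8*1024*1024)
    #include <Eigen/Dense>
    // defaultL1CacheSize and defaultL3CacheSize now reflect the overrides;
    // the L2 default keeps its built-in per-architecture value.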
@@ -50,7 +82,6 @@ struct CacheSizes {
   std::ptrdiff_t m_l3;
 };
 
-
 /** \internal */
 inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
 {
@@ -101,6 +132,16 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
   // at the register level. This small horizontal panel has to stay within L1 cache.
   std::ptrdiff_t l1, l2, l3;
   manage_caching_sizes(GetAction, &l1, &l2, &l3);
+  #ifdef EIGEN_VECTORIZE_AVX512
+  // We need to find a rationale for that, but without this adjustment,
+  // performance with AVX512 is pretty bad, like -20% slower.
+  // One reason is that with increasing packet-size, the blocking size k
+  // has to become pretty small if we want that 1 lhs panel fit within L1.
+  // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are:
+  // k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144.
+  // This is quite small for a good reuse of the accumulation registers.
+  l1 *= 4;
+  #endif
 
   if (num_threads > 1) {
     typedef typename Traits::ResScalar ResScalar;
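
The arithmetic in the committed comment above can be checked directly: with AVX512 doubles (64-byte packets) and the 3pX4 kernel, each unit of k costs 3*64 bytes of lhs panel plus 4*8 bytes of rhs panel. A small self-contained check, under exactly the comment's assumptions:

    #include <cstdio>

    int main() {
      const int lhs_bytes_per_k = 3 * 64; // three 8-double AVX512 packets
      const int rhs_bytes_per_k = 4 * 8;  // four rhs doubles (nr = 4)
      const int l1 = 32 * 1024;           // 32 kB L1, as in the comment
      int k = l1 / (lhs_bytes_per_k + rhs_bytes_per_k); // 146
      k -= k % 8;                                       // k%8 == 0  ->  144
      std::printf("k = %d\n", k);         // prints 144, matching the comment
      return 0;
    }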
@@ -115,7 +156,8 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
     // registers. However once the latency is hidden there is no point in
     // increasing the value of k, so we'll cap it at 320 (value determined
     // experimentally).
-    const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
+    // To avoid that k vanishes, we make k_cache at least as big as kr
+    const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
     if (k_cache < k) {
       k = k_cache - (k_cache % kr);
       eigen_internal_assert(k > 0);
@@ -307,35 +349,60 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_
   computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
 }
 
-#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-  #define CJMADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
-#else
-
-  // FIXME (a bit overkill maybe ?)
-
-  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
-    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
-    {
-      c = cj.pmadd(a,b,c);
-    }
-  };
-
-  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
-    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
-    {
-      t = b; t = cj.pmul(a,t); c = padd(c,t);
-    }
-  };
+template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
+struct RhsPanelHelper {
+ private:
+  static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken;
+ public:
+  typedef typename conditional<remaining_registers>=4, RhsPacketx4, RhsPacket>::type type;
+};
 
-  template<typename CJ, typename A, typename B, typename C, typename T>
-  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
-  {
-    gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
-  }
+template <typename Packet>
+struct QuadPacket
+{
+  Packet B_0, B1, B2, B3;
+  const Packet& get(const FixedInt<0>&) const { return B_0; }
+  const Packet& get(const FixedInt<1>&) const { return B1; }
+  const Packet& get(const FixedInt<2>&) const { return B2; }
+  const Packet& get(const FixedInt<3>&) const { return B3; }
+};
 
-  #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T);
-  // #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T);
-#endif
+template <int N, typename T1, typename T2, typename T3>
+struct packet_conditional { typedef T3 type; };
+
+template <typename T1, typename T2, typename T3>
+struct packet_conditional<GEBPPacketFull, T1, T2, T3> { typedef T1 type; };
+
+template <typename T1, typename T2, typename T3>
+struct packet_conditional<GEBPPacketHalf, T1, T2, T3> { typedef T2 type; };
+
+#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size)                \
+  typedef typename packet_conditional<packet_size,                        \
+                                      typename packet_traits<name ## Scalar>::type, \
+                                      typename packet_traits<name ## Scalar>::half, \
+                                      typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
+  prefix ## name ## Packet
+
+#define PACKET_DECL_COND(name, packet_size)                               \
+  typedef typename packet_conditional<packet_size,                        \
+                                      typename packet_traits<name ## Scalar>::type, \
+                                      typename packet_traits<name ## Scalar>::half, \
+                                      typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
+  name ## Packet
+
+#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size)               \
+  typedef typename packet_conditional<packet_size,                        \
+                                      typename packet_traits<Scalar>::type, \
+                                      typename packet_traits<Scalar>::half, \
+                                      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
+  prefix ## ScalarPacket
+
+#define PACKET_DECL_COND_SCALAR(packet_size)                              \
+  typedef typename packet_conditional<packet_size,                        \
+                                      typename packet_traits<Scalar>::type, \
+                                      typename packet_traits<Scalar>::half, \
+                                      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
+  ScalarPacket
 
 /* Vectorization logic
  *   real*real: unpack rhs to constant packets, ...
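
The new RhsPanelHelper above picks the four-wide RhsPacketx4 panel only when at least four registers are left over, and QuadPacket resolves the requested lane at compile time through FixedInt-tagged get() overloads. A standalone sketch of the same selection idea, with std::conditional, a mocked FixedInt, and an assumed 16-register budget standing in for Eigen's internal helpers:

    #include <type_traits>

    template <int N> struct FixedInt {};              // stand-in for Eigen's FixedInt

    template <typename P> struct QuadPacket {
      P B_0, B1, B2, B3;
      const P& get(FixedInt<0>) const { return B_0; } // lane chosen at compile time
      const P& get(FixedInt<1>) const { return B1; }
      const P& get(FixedInt<2>) const { return B2; }
      const P& get(FixedInt<3>) const { return B3; }
    };

    template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
    struct RhsPanelHelper {
      static const int remaining = 16 - registers_taken; // assume 16 registers
      typedef typename std::conditional<(remaining >= 4),
                                        RhsPacketx4, RhsPacket>::type type;
    };

    // 15 of 16 registers taken: fall back to the single packet.
    static_assert(std::is_same<
        RhsPanelHelper<float, QuadPacket<float>, 15>::type, float>::value, "");
    // 10 taken: six remain, so the four-wide panel is used.
    static_assert(std::is_same<
        RhsPanelHelper<float, QuadPacket<float>, 10>::type, QuadPacket<float> >::value, "");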
@@ -347,7 +414,7 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_
  * cplx*real : unpack rhs to constant packets, ...
  * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
  */
-template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
 class gebp_traits
 {
 public:
@@ -355,13 +422,17 @@ public:
   typedef _RhsScalar RhsScalar;
   typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
+  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+
   enum {
     ConjLhs = _ConjLhs,
     ConjRhs = _ConjRhs,
-    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
-    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+    Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
+    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+    ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
 
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
 
@@ -370,10 +441,12 @@ public:
 
     // register block size along the M direction (currently, this one cannot be modified)
     default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
-#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
-    // we assume 16 registers
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \
+ && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914))
+    // we assume 16 registers or more
     // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
     // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
+    // Bug 1515: MSVC prior to v19.14 yields to register spilling.
     mr = Vectorizable ? 3*LhsPacketSize : default_mr,
 #else
     mr = default_mr,
@@ -383,37 +456,41 @@ public:
     RhsProgress = 1
   };
 
-  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
-  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
-  typedef typename packet_traits<ResScalar>::type  _ResPacket;
 
   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef LhsPacket LhsPacket4Packing;
 
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
   typedef ResPacket AccPacket;
 
   EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
   {
     p = pset1<ResPacket>(ResScalar(0));
   }
-
-  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
-  {
-    pbroadcast4(b, b0, b1, b2, b3);
-  }
-
-  // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
-  // {
-  //   pbroadcast2(b, b0, b1);
-  // }
-
+
   template<typename RhsPacketType>
   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
   {
     dest = pset1<RhsPacketType>(*b);
   }
-
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  {
+    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+  }
+
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+  {
+    loadRhs(b, dest);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+  {
+  }
+
   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
   {
     dest = ploadquad<RhsPacket>(b);
@@ -431,8 +508,8 @@ public:
     dest = ploadu<LhsPacketType>(a);
   }
 
-  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
-  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
+  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
   {
     conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
     // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
@@ -447,6 +524,12 @@ public:
 #endif
   }
 
+  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+  {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
+
   EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
   {
     r = pmadd(c,alpha,r);
@@ -460,21 +543,25 @@ public:
 
 };
 
-template<typename RealScalar, bool _ConjLhs>
-class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
+template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize>
+class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize>
 {
 public:
   typedef std::complex<RealScalar> LhsScalar;
   typedef RealScalar RhsScalar;
   typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
+  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+
   enum {
     ConjLhs = _ConjLhs,
     ConjRhs = false,
-    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
-    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+    Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
+    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+    ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
 
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
     nr = 4,
@@ -489,13 +576,12 @@ public:
     RhsProgress = 1
   };
 
-  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
-  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
-  typedef typename packet_traits<ResScalar>::type  _ResPacket;
-
   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef LhsPacket LhsPacket4Packing;
+
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
 
   typedef ResPacket AccPacket;
 
@@ -504,42 +590,64 @@ public:
     p = pset1<ResPacket>(ResScalar(0));
   }
 
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
   {
-    dest = pset1<RhsPacket>(*b);
+    dest = pset1<RhsPacketType>(*b);
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  {
+    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
   }
+
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+  {
+    loadRhs(b, dest);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+  {}
 
   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
   {
-    dest = pset1<RhsPacket>(*b);
+    loadRhsQuad_impl(b,dest, typename conditional<RhsPacketSize==16,true_type,false_type>::type());
   }
 
-  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const
   {
-    dest = pload<LhsPacket>(a);
+    // FIXME we can do better!
+    // what we want here is a ploadheight
+    RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
+    dest = ploadquad<RhsPacket>(tmp);
   }
 
-  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const
   {
-    dest = ploadu<LhsPacket>(a);
+    eigen_internal_assert(RhsPacketSize<=8);
+    dest = pset1<RhsPacket>(*b);
   }
 
-  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
   {
-    pbroadcast4(b, b0, b1, b2, b3);
+    dest = pload<LhsPacket>(a);
   }
-
-  // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
-  // {
-  //   pbroadcast2(b, b0, b1);
-  // }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
+  template<typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
+  {
+    dest = ploadu<LhsPacketType>(a);
+  }
+
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
   {
     madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
   }
 
-  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
   {
 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
     EIGEN_UNUSED_VARIABLE(tmp);
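
The rewritten loadRhsQuad in the hunk above no longer broadcasts directly: it forwards to one of two loadRhsQuad_impl overloads, selected at compile time by a true_type/false_type tag built from RhsPacketSize==16. The same dispatch pattern, sketched with std:: traits in place of Eigen's internal ones:

    #include <type_traits>

    void load_impl(std::true_type)  { /* wide path: duplicate b[0], b[1] pairs */ }
    void load_impl(std::false_type) { /* narrow path: plain broadcast */ }

    template <int PacketSize>
    void load() {
      // std::integral_constant<bool, true> is std::true_type, so the right
      // overload is picked with no runtime branch.
      load_impl(std::integral_constant<bool, PacketSize == 16>());
    }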
@@ -554,13 +662,20 @@ public:
     c += a * b;
   }
 
-  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
   {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
+
+  template <typename ResPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
+  {
+    conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
     r = cj.pmadd(c,alpha,r);
   }
 
 protected:
-  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
 };
 
 template<typename Packet>
@@ -579,13 +694,57 @@ DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePack
   return res;
 }
 
+// note that for DoublePacket<RealPacket> the "4" in "downto4"
+// corresponds to the number of complexes, so it means "8"
+// it terms of real coefficients.
+
 template<typename Packet>
-const DoublePacket<Packet>& predux_downto4(const DoublePacket<Packet> &a)
+const DoublePacket<Packet>&
+predux_half_dowto4(const DoublePacket<Packet> &a,
+                   typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0)
 {
   return a;
 }
 
-template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; };
+template<typename Packet>
+DoublePacket<typename unpacket_traits<Packet>::half>
+predux_half_dowto4(const DoublePacket<Packet> &a,
+                   typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0)
+{
+  // yes, that's pretty hackish :(
+  DoublePacket<typename unpacket_traits<Packet>::half> res;
+  typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
+  typedef typename packet_traits<Cplx>::type CplxPacket;
+  res.first  = predux_half_dowto4(CplxPacket(a.first)).v;
+  res.second = predux_half_dowto4(CplxPacket(a.second)).v;
+  return res;
+}
+
+// same here, "quad" actually means "8" in terms of real coefficients
+template<typename Scalar, typename RealPacket>
+void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
+                            typename enable_if<unpacket_traits<RealPacket>::size<=8>::type* = 0)
+{
+  dest.first  = pset1<RealPacket>(numext::real(*b));
+  dest.second = pset1<RealPacket>(numext::imag(*b));
+}
+
+template<typename Scalar, typename RealPacket>
+void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
+                            typename enable_if<unpacket_traits<RealPacket>::size==16>::type* = 0)
+{
+  // yes, that's pretty hackish too :(
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
+  RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
+  dest.first  = ploadquad<RealPacket>(r);
+  dest.second = ploadquad<RealPacket>(i);
+}
+
+
+template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > {
+  typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
+};
 // template<typename Packet>
 // DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
 // {
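
Both predux_half_dowto4 overloads in the hunk above are gated the same way: a defaulted enable_if pointer parameter keyed on the packet size admits either the pass-through (size <= 8) or the folding (size == 16) version, never both. That SFINAE gating, sketched with std::enable_if and a hypothetical psize trait:

    #include <type_traits>

    template <typename Packet> struct psize;          // hypothetical size trait
    template <> struct psize<float>  { static const int value = 8;  };
    template <> struct psize<double> { static const int value = 16; };

    template <typename Packet>
    const Packet& reduce(const Packet& a,
        typename std::enable_if<(psize<Packet>::value <= 8)>::type* = 0)
    { return a; }                 // small packets pass through unchanged

    template <typename Packet>
    Packet reduce(const Packet& a,
        typename std::enable_if<(psize<Packet>::value == 16)>::type* = 0)
    { return a; }                 // wide packets would be folded to the half type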
@@ -595,8 +754,8 @@ template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typede
 //   return res;
 // }
 
-template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
+template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
+class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize >
 {
 public:
   typedef std::complex<RealScalar> Scalar;
@@ -604,15 +763,21 @@ public:
   typedef std::complex<RealScalar> RhsScalar;
   typedef std::complex<RealScalar> ResScalar;
 
+  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+  PACKET_DECL_COND(Real, _PacketSize);
+  PACKET_DECL_COND_SCALAR(_PacketSize);
+
   enum {
     ConjLhs = _ConjLhs,
     ConjRhs = _ConjRhs,
-    Vectorizable = packet_traits<RealScalar>::Vectorizable
-                && packet_traits<Scalar>::Vectorizable,
-    RealPacketSize  = Vectorizable ? packet_traits<RealScalar>::size : 1,
-    ResPacketSize   = Vectorizable ? packet_traits<ResScalar>::size : 1,
-    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+    Vectorizable = unpacket_traits<RealPacket>::vectorizable
+                && unpacket_traits<ScalarPacket>::vectorizable,
+    ResPacketSize   = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
+    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
+    RealPacketSize  = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
 
     // FIXME: should depend on NumberOfRegisters
     nr = 4,
@@ -622,14 +787,16 @@ public:
     RhsProgress = 1
   };
 
-  typedef typename packet_traits<RealScalar>::type RealPacket;
-  typedef typename packet_traits<Scalar>::type     ScalarPacket;
-  typedef DoublePacket<RealPacket> DoublePacketType;
+  typedef DoublePacket<RealPacket> DoublePacketType;
 
+  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type LhsPacket4Packing;
   typedef typename conditional<Vectorizable,RealPacket,  Scalar>::type LhsPacket;
   typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
   typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
+
+  // this actualy holds 8 packets!
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
 
   EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
 
@@ -640,51 +807,49 @@ public:
   }
 
   // Scalar path
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const
   {
-    dest = pset1<ResPacket>(*b);
+    dest = pset1<ScalarPacket>(*b);
   }
 
   // Vectorized path
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
+  template<typename RealPacketType>
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
   {
-    dest.first  = pset1<RealPacket>(real(*b));
-    dest.second = pset1<RealPacket>(imag(*b));
+    dest.first  = pset1<RealPacketType>(numext::real(*b));
+    dest.second = pset1<RealPacketType>(numext::imag(*b));
   }
-
-  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
   {
-    loadRhs(b,dest);
+    loadRhs(b, dest.B_0);
+    loadRhs(b + 1, dest.B1);
+    loadRhs(b + 2, dest.B2);
+    loadRhs(b + 3, dest.B3);
   }
-  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
+
+  // Scalar path
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const
   {
-    eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4);
-    loadRhs(b,dest);
+    loadRhs(b, dest);
   }
-
-  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+
+  // Vectorized path
+  template<typename RealPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
   {
-    // FIXME not sure that's the best way to implement it!
-    loadRhs(b+0, b0);
-    loadRhs(b+1, b1);
-    loadRhs(b+2, b2);
-    loadRhs(b+3, b3);
+    loadRhs(b, dest);
   }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
 
-  // Vectorized path
-  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
   {
-    // FIXME not sure that's the best way to implement it!
-    loadRhs(b+0, b0);
-    loadRhs(b+1, b1);
+    loadRhs(b,dest);
   }
-
-  // Scalar path
-  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
   {
-    // FIXME not sure that's the best way to implement it!
-    loadRhs(b+0, b0);
-    loadRhs(b+1, b1);
+    loadQuadToDoublePacket(b,dest);
   }
 
   // nothing special here
@@ -693,47 +858,59 @@ public:
     dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
   }
 
-  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+  template<typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
   {
-    dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
+    dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
   }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const
+  template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType>
+  EIGEN_STRONG_INLINE
+  typename enable_if<!is_same<RhsPacketType,RhsPacketx4>::value>::type
+  madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const
   {
     c.first   = padd(pmul(a,b.first), c.first);
     c.second  = padd(pmul(a,b.second),c.second);
   }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
+  template<typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const
   {
     c = cj.pmadd(a,b,c);
   }
+
+  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+  {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
 
   EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
 
-  EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
+  template<typename RealPacketType, typename ResPacketType>
+  EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha, ResPacketType& r) const
   {
     // assemble c
-    ResPacket tmp;
+    ResPacketType tmp;
     if((!ConjLhs)&&(!ConjRhs))
     {
-      tmp = pcplxflip(pconj(ResPacket(c.second)));
-      tmp = padd(ResPacket(c.first),tmp);
+      tmp = pcplxflip(pconj(ResPacketType(c.second)));
+      tmp = padd(ResPacketType(c.first),tmp);
     }
     else if((!ConjLhs)&&(ConjRhs))
     {
-      tmp = pconj(pcplxflip(ResPacket(c.second)));
-      tmp = padd(ResPacket(c.first),tmp);
+      tmp = pconj(pcplxflip(ResPacketType(c.second)));
+      tmp = padd(ResPacketType(c.first),tmp);
    }
    else if((ConjLhs)&&(!ConjRhs))
    {
-      tmp = pcplxflip(ResPacket(c.second));
-      tmp = padd(pconj(ResPacket(c.first)),tmp);
+      tmp = pcplxflip(ResPacketType(c.second));
+      tmp = padd(pconj(ResPacketType(c.first)),tmp);
    }
    else if((ConjLhs)&&(ConjRhs))
    {
-      tmp = pcplxflip(ResPacket(c.second));
-      tmp = psub(pconj(ResPacket(c.first)),tmp);
+      tmp = pcplxflip(ResPacketType(c.second));
+      tmp = psub(pconj(ResPacketType(c.first)),tmp);
    }
 
     r = pmadd(tmp,alpha,r);
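
For the unconjugated branch in the hunk above, c.first holds the lane-wise products x*real(y) and c.second holds x*imag(y); applying pconj and then pcplxflip to c.second yields (-imag_part, real_part), so a single padd with c.first reassembles x*y. A two-lane scalar check of that identity (real packets hold more lanes, but the algebra is per complex):

    #include <cassert>
    #include <complex>

    int main() {
      std::complex<double> x(2, 3), y(5, 7);
      // c.first = x*real(y), c.second = x*imag(y), lane-wise:
      double first[2]  = {x.real() * y.real(), x.imag() * y.real()};
      double second[2] = {x.real() * y.imag(), x.imag() * y.imag()};
      // pconj negates the imaginary lane, pcplxflip swaps the two lanes:
      double tmp[2] = {-second[1], second[0]};
      double res[2] = {first[0] + tmp[0], first[1] + tmp[1]};
      assert(res[0] == (x * y).real());  // 2*5 - 3*7 = -11
      assert(res[1] == (x * y).imag());  // 3*5 + 2*7 =  29
      return 0;
    }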
@@ -743,8 +920,8 @@ protected:
   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
 };
 
-template<typename RealScalar, bool _ConjRhs>
-class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
+template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize>
+class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize >
 {
 public:
   typedef std::complex<RealScalar> Scalar;
@@ -752,14 +929,25 @@ public:
   typedef Scalar RhsScalar;
   typedef Scalar ResScalar;
 
+  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Real, _PacketSize);
+  PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize);
+
+#undef PACKET_DECL_COND_SCALAR_PREFIX
+#undef PACKET_DECL_COND_PREFIX
+#undef PACKET_DECL_COND_SCALAR
+#undef PACKET_DECL_COND
+
   enum {
     ConjLhs = false,
     ConjRhs = _ConjRhs,
-    Vectorizable = packet_traits<RealScalar>::Vectorizable
-                && packet_traits<Scalar>::Vectorizable,
-    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+    Vectorizable = unpacket_traits<_RealPacket>::vectorizable
+                && unpacket_traits<_ScalarPacket>::vectorizable,
+    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+    ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
 
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
     // FIXME: should depend on NumberOfRegisters
@@ -770,14 +958,11 @@ public:
770
958
  RhsProgress = 1
771
959
  };
772
960
 
773
- typedef typename packet_traits<LhsScalar>::type _LhsPacket;
774
- typedef typename packet_traits<RhsScalar>::type _RhsPacket;
775
- typedef typename packet_traits<ResScalar>::type _ResPacket;
776
-
777
961
  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
778
962
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
779
963
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
780
-
964
+ typedef LhsPacket LhsPacket4Packing;
965
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
781
966
  typedef ResPacket AccPacket;
782
967
 
783
968
  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
@@ -785,22 +970,25 @@ public:
785
970
  p = pset1<ResPacket>(ResScalar(0));
786
971
  }
787
972
 
788
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
973
+ template<typename RhsPacketType>
974
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
789
975
  {
790
- dest = pset1<RhsPacket>(*b);
976
+ dest = pset1<RhsPacketType>(*b);
791
977
  }
792
-
793
- void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
978
+
979
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
794
980
  {
795
- pbroadcast4(b, b0, b1, b2, b3);
981
+ pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
796
982
  }
797
-
798
- // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
799
- // {
800
- // // FIXME not sure that's the best way to implement it!
801
- // b0 = pload1<RhsPacket>(b+0);
802
- // b1 = pload1<RhsPacket>(b+1);
803
- // }
983
+
984
+ template<typename RhsPacketType>
985
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
986
+ {
987
+ loadRhs(b, dest);
988
+ }
989
+
990
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
991
+ {}
804
992
 
805
993
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
806
994
  {
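With the `RhsPacketx4` overloads above, the four rhs coefficients of a micro-panel column are broadcast once by `loadRhs` (via `pbroadcast4`), and the per-lane `updateRhs` becomes a no-op on that path while the single-packet path still reloads. A rough scalar model of that load-once/update-noop split (types and names here are placeholders, not Eigen's):

    // Scalar model of the load-once/update-noop pattern above: the quad path
    // broadcasts b[0..3] up front, so per-lane "updates" have nothing to do;
    // the single-packet path reloads on every update.
    #include <array>

    struct Quad { std::array<float, 4> lanes; };   // stand-in for RhsPacketx4

    inline void load_rhs(const float* b, float& dest) { dest = *b; }  // pset1-like
    inline void load_rhs(const float* b, Quad& dest) {                // pbroadcast4-like
      dest.lanes = {b[0], b[1], b[2], b[3]};
    }
    inline void update_rhs(const float* b, float& dest) { load_rhs(b, dest); }
    inline void update_rhs(const float*, Quad&) {}     // quad is already loaded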
@@ -809,21 +997,23 @@ public:
 
   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
   {
-    eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4);
-    loadRhs(b,dest);
+    dest = ploadquad<RhsPacket>(b);
   }
 
-  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+  template<typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
   {
-    dest = ploaddup<LhsPacket>(a);
+    dest = ploaddup<LhsPacketType>(a);
   }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
   {
     madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
   }
 
-  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
   {
 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
     EIGEN_UNUSED_VARIABLE(tmp);
@@ -839,16 +1029,24 @@ public:
     c += a * b;
   }
 
-  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+  {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
+
+  template <typename ResPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
   {
+    conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
     r = cj.pmadd(alpha,c,r);
   }
 
 protected:
-  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
+
 };
 
-/* optimized GEneral packed Block * packed Panel product kernel
+/* optimized General packed Block * packed Panel product kernel
  *
  * Mixing type logic: C += A * B
  * |  A  |  B  | comments
@@ -858,26 +1056,47 @@ protected:
 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
 struct gebp_kernel
 {
-  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketHalf> HalfTraits;
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketQuarter> QuarterTraits;
+
   typedef typename Traits::ResScalar ResScalar;
   typedef typename Traits::LhsPacket LhsPacket;
   typedef typename Traits::RhsPacket RhsPacket;
   typedef typename Traits::ResPacket ResPacket;
   typedef typename Traits::AccPacket AccPacket;
+  typedef typename Traits::RhsPacketx4 RhsPacketx4;
+
+  typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
+
+  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
 
-  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
   typedef typename SwappedTraits::ResScalar SResScalar;
   typedef typename SwappedTraits::LhsPacket SLhsPacket;
   typedef typename SwappedTraits::RhsPacket SRhsPacket;
   typedef typename SwappedTraits::ResPacket SResPacket;
   typedef typename SwappedTraits::AccPacket SAccPacket;
 
+  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
+  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
+  typedef typename HalfTraits::ResPacket ResPacketHalf;
+  typedef typename HalfTraits::AccPacket AccPacketHalf;
+
+  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
+  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
+  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
+  typedef typename QuarterTraits::AccPacket AccPacketQuarter;
+
   typedef typename DataMapper::LinearMapper LinearMapper;
 
   enum {
     Vectorizable = Traits::Vectorizable,
     LhsProgress = Traits::LhsProgress,
+    LhsProgressHalf = HalfTraits::LhsProgress,
+    LhsProgressQuarter = QuarterTraits::LhsProgress,
     RhsProgress = Traits::RhsProgress,
+    RhsProgressHalf = HalfTraits::RhsProgress,
+    RhsProgressQuarter = QuarterTraits::RhsProgress,
     ResPacketSize = Traits::ResPacketSize
   };
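`gebp_kernel` now instantiates the same traits at three packet widths (full, `GEBPPacketHalf`, `GEBPPacketQuarter`), so row tails too narrow for a full SIMD register can still run a vectorized kernel at a fraction of the width. A toy sketch of the resulting width cascade, with hard-coded widths standing in for the traits machinery:

    // Rough picture of the full/half/quarter cascade set up above: the same
    // kernel template is stamped out at three packet widths, and leftover
    // rows fall through to the next narrower width. Widths are hard-coded
    // here (e.g. 8/4/2 floats for AVX); Eigen derives them from the traits.
    #include <cstdio>

    template <int Width> struct KernelAtWidth {
      static void run(int from, int to) {
        if (from < to) std::printf("rows [%d,%d) at width %d\n", from, to, Width);
      }
    };

    int main() {
      const int rows = 23, full = 8, half = 4, quarter = 2;
      int done = (rows / full) * full;                 // full-width kernel
      KernelAtWidth<full>::run(0, done);
      int h = done + ((rows - done) / half) * half;    // like peeled_mc_half
      KernelAtWidth<half>::run(done, h);
      int q = h + ((rows - h) / quarter) * quarter;    // like peeled_mc_quarter
      KernelAtWidth<quarter>::run(h, q);
      std::printf("rows [%d,%d) one at a time\n", q, rows);
    }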
 
@@ -887,6 +1106,299 @@ struct gebp_kernel
   Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
 };
 
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs,
+         int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress>
+struct last_row_process_16_packets
+{
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
+  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
+
+  typedef typename Traits::ResScalar ResScalar;
+  typedef typename SwappedTraits::LhsPacket SLhsPacket;
+  typedef typename SwappedTraits::RhsPacket SRhsPacket;
+  typedef typename SwappedTraits::ResPacket SResPacket;
+  typedef typename SwappedTraits::AccPacket SAccPacket;
+
+  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
+                                      const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
+                                      ResScalar alpha, SAccPacket &C0)
+  {
+    EIGEN_UNUSED_VARIABLE(res);
+    EIGEN_UNUSED_VARIABLE(straits);
+    EIGEN_UNUSED_VARIABLE(blA);
+    EIGEN_UNUSED_VARIABLE(blB);
+    EIGEN_UNUSED_VARIABLE(depth);
+    EIGEN_UNUSED_VARIABLE(endk);
+    EIGEN_UNUSED_VARIABLE(i);
+    EIGEN_UNUSED_VARIABLE(j2);
+    EIGEN_UNUSED_VARIABLE(alpha);
+    EIGEN_UNUSED_VARIABLE(C0);
+  }
+};
+
+
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> {
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
+  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
+
+  typedef typename Traits::ResScalar ResScalar;
+  typedef typename SwappedTraits::LhsPacket SLhsPacket;
+  typedef typename SwappedTraits::RhsPacket SRhsPacket;
+  typedef typename SwappedTraits::ResPacket SResPacket;
+  typedef typename SwappedTraits::AccPacket SAccPacket;
+
+  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
+                                      const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
+                                      ResScalar alpha, SAccPacket &C0)
+  {
+    typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
+    typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
+    typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
+    typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter;
+
+    SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
+    SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);
+
+    if (depth - endk > 0)
+    {
+      // We have to handle the last row(s) of the rhs, which
+      // correspond to a half-packet
+      SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));
+
+      for (Index kk = endk; kk < depth; kk++)
+      {
+        SLhsPacketQuarter a0;
+        SRhsPacketQuarter b0;
+        straits.loadLhsUnaligned(blB, a0);
+        straits.loadRhs(blA, b0);
+        straits.madd(a0,b0,c0,b0, fix<0>);
+        blB += SwappedTraits::LhsProgress/4;
+        blA += 1;
+      }
+      straits.acc(c0, alphav, R);
+    }
+    else
+    {
+      straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
+    }
+    res.scatterPacket(i, j2, R);
+  }
+};
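The 16-lane specialization above folds its wide accumulator down with two `predux_half_dowto4` steps (16 to 8 to 4 lanes) before finishing the remaining depth with quarter-width packets. A small model of that two-step reduction, with `std::array` standing in for SIMD packets:

    // Two-step width reduction in the spirit of predux_half_dowto4 above:
    // a 16-lane accumulator is folded 16 -> 8 -> 4 by adding matching halves.
    #include <array>
    #include <cstddef>
    #include <iostream>

    template <std::size_t N>
    std::array<float, N / 2> fold_half(const std::array<float, N>& v) {
      std::array<float, N / 2> r{};
      for (std::size_t i = 0; i < N / 2; ++i) r[i] = v[i] + v[i + N / 2];
      return r;
    }

    int main() {
      std::array<float, 16> acc;
      acc.fill(1.f);                                       // every lane holds 1
      std::array<float, 4> r = fold_half(fold_half(acc));  // mirrors the double fold
      std::cout << r[0] << '\n';                           // 4: four lanes summed
    }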
+
+template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
+struct lhs_process_one_packet
+{
+  typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
+
+  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
+  {
+    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
+    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
+    traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
+    traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
+    traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
+    traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
+    traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
+    traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
+#if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
+    __asm__  ("" : "+x,m" (*A0));
+#endif
+    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
+  }
+
+  EIGEN_STRONG_INLINE void operator()(
+    const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
+    Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB,
+    int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
+  {
+    GEBPTraits traits;
+
+    // loops on each largest micro horizontal panel of lhs
+    // (LhsProgress x depth)
+    for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
+    {
+      // loops on each largest micro vertical panel of rhs (depth * nr)
+      for(Index j2=0; j2<packet_cols4; j2+=nr)
+      {
+        // We select a LhsProgress x nr micro block of res
+        // which is entirely stored into 1 x nr registers.
+
+        const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
+        prefetch(&blA[0]);
+
+        // gets res block as register
+        AccPacket C0, C1, C2, C3;
+        traits.initAcc(C0);
+        traits.initAcc(C1);
+        traits.initAcc(C2);
+        traits.initAcc(C3);
+        // To improve instruction pipelining, let's double the accumulation registers:
+        // even k will accumulate in C*, while odd k will accumulate in D*.
+        // This trick is crucial to get good performance with FMA, otherwise it is
+        // actually faster to perform separated MUL+ADD because of a naturally
+        // better instruction-level parallelism.
+        AccPacket D0, D1, D2, D3;
+        traits.initAcc(D0);
+        traits.initAcc(D1);
+        traits.initAcc(D2);
+        traits.initAcc(D3);
+
+        LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+        LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+        LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+        LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+        r0.prefetch(prefetch_res_offset);
+        r1.prefetch(prefetch_res_offset);
+        r2.prefetch(prefetch_res_offset);
+        r3.prefetch(prefetch_res_offset);
+
+        // performs "inner" products
+        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+        prefetch(&blB[0]);
+        LhsPacket A0, A1;
+
+        for(Index k=0; k<peeled_kc; k+=pk)
+        {
+          EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
+          RhsPacketx4 rhs_panel;
+          RhsPacket T0;
+
+          internal::prefetch(blB+(48+0));
+          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+          peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+          internal::prefetch(blB+(48+16));
+          peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+          peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+
+          blB += pk*4*RhsProgress;
+          blA += pk*LhsProgress;
+
+          EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
+        }
+        C0 = padd(C0,D0);
+        C1 = padd(C1,D1);
+        C2 = padd(C2,D2);
+        C3 = padd(C3,D3);
+
+        // process remaining peeled loop
+        for(Index k=peeled_kc; k<depth; k++)
+        {
+          RhsPacketx4 rhs_panel;
+          RhsPacket T0;
+          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          blB += 4*RhsProgress;
+          blA += LhsProgress;
+        }
+
+        ResPacket R0, R1;
+        ResPacket alphav = pset1<ResPacket>(alpha);
+
+        R0 = r0.template loadPacket<ResPacket>(0);
+        R1 = r1.template loadPacket<ResPacket>(0);
+        traits.acc(C0, alphav, R0);
+        traits.acc(C1, alphav, R1);
+        r0.storePacket(0, R0);
+        r1.storePacket(0, R1);
+
+        R0 = r2.template loadPacket<ResPacket>(0);
+        R1 = r3.template loadPacket<ResPacket>(0);
+        traits.acc(C2, alphav, R0);
+        traits.acc(C3, alphav, R1);
+        r2.storePacket(0, R0);
+        r3.storePacket(0, R1);
+      }
+
+      // Deal with remaining columns of the rhs
+      for(Index j2=packet_cols4; j2<cols; j2++)
+      {
+        // One column at a time
+        const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
+        prefetch(&blA[0]);
+
+        // gets res block as register
+        AccPacket C0;
+        traits.initAcc(C0);
+
+        LinearMapper r0 = res.getLinearMapper(i, j2);
+
+        // performs "inner" products
+        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+        LhsPacket A0;
+
+        for(Index k= 0; k<peeled_kc; k+=pk)
+        {
+          EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
+          RhsPacket B_0;
+
+#define EIGEN_GEBGP_ONESTEP(K) \
+          do { \
+            EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
+            EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+            /* FIXME: why unaligned???? */ \
+            traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
+            traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
+            traits.madd(A0, B_0, C0, B_0, fix<0>); \
+            EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
+          } while(false);
+
+          EIGEN_GEBGP_ONESTEP(0);
+          EIGEN_GEBGP_ONESTEP(1);
+          EIGEN_GEBGP_ONESTEP(2);
+          EIGEN_GEBGP_ONESTEP(3);
+          EIGEN_GEBGP_ONESTEP(4);
+          EIGEN_GEBGP_ONESTEP(5);
+          EIGEN_GEBGP_ONESTEP(6);
+          EIGEN_GEBGP_ONESTEP(7);
+
+          blB += pk*RhsProgress;
+          blA += pk*LhsProgress;
+
+          EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
+        }
+
+        // process remaining peeled loop
+        for(Index k=peeled_kc; k<depth; k++)
+        {
+          RhsPacket B_0;
+          EIGEN_GEBGP_ONESTEP(0);
+          blB += RhsProgress;
+          blA += LhsProgress;
+        }
+#undef EIGEN_GEBGP_ONESTEP
+        ResPacket R0;
+        ResPacket alphav = pset1<ResPacket>(alpha);
+        R0 = r0.template loadPacket<ResPacket>(0);
+        traits.acc(C0, alphav, R0);
+        r0.storePacket(0, R0);
+      }
+    }
+  }
+};
+
+template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
+struct lhs_process_fraction_of_packet : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
+{
+
+  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
+  {
+    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
+    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
+    traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
+    traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
+    traits.madd(*A0, *B_0, *C0, *B_0);
+    traits.madd(*A0, *B1, *C1, *B1);
+    traits.madd(*A0, *B2, *C2, *B2);
+    traits.madd(*A0, *B3, *C3, *B3);
+    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
+  }
+};
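The comment inside `lhs_process_one_packet` above explains why the accumulators are doubled: even steps feed `C*`, odd steps feed `D*`, so consecutive FMAs sit on two independent dependency chains instead of serializing on one register. The same idea in plain scalar form (a sketch, not Eigen code):

    // The doubled-accumulator trick from the comment above, in scalar form:
    // even iterations feed c_even, odd ones feed c_odd, giving the CPU two
    // independent FMA dependency chains instead of one.
    float dot_pipelined(const float* a, const float* b, int n) {
      float c_even = 0.f, c_odd = 0.f;   // two independent chains (C* and D*)
      int k = 0;
      for (; k + 1 < n; k += 2) {
        c_even += a[k]     * b[k];       // chain 1
        c_odd  += a[k + 1] * b[k + 1];   // chain 2
      }
      if (k < n) c_even += a[k] * b[k];  // odd-length tail
      return c_even + c_odd;             // combine, like C0 = padd(C0, D0)
    }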
+
 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
 EIGEN_DONT_INLINE
 void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
@@ -903,10 +1415,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
   Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
   const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
   const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
-  const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
+  const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
+  const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
+  const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
   enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
   const Index peeled_kc = depth & ~(pk-1);
-  const Index prefetch_res_offset = 32/sizeof(ResScalar);
+  const int prefetch_res_offset = 32/sizeof(ResScalar);
   // const Index depth2 = depth & ~1;
 
   //---------- Process 3 * LhsProgress rows at once ----------
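The reworked bounds above chain the peeling tiers: each `peeled_mc*` value now starts where the previous tier stopped and consumes whole multiples of its own progress, with the new half and quarter tiers appended at the end. The arithmetic in isolation, with made-up progress values:

    // The chained peeling bounds above, in isolation: each tier consumes
    // whole multiples of its progress starting where the previous tier
    // stopped. Progress values here are made up for the demonstration.
    #include <cassert>

    int main() {
      const int rows = 23;
      const int P3 = 12, P2 = 8, P1 = 4, Ph = 2, Pq = 1; // 3x/2x/1x/half/quarter
      int mc3 = (rows / P3) * P3;                // rows taken 3 packets at a time
      int mc2 = mc3 + ((rows - mc3) / P2) * P2;  // ...then 2 packets
      int mc1 = mc2 + ((rows - mc2) / P1) * P1;  // ...then 1 packet
      int mch = mc1 + ((rows - mc1) / Ph) * Ph;  // ...then half packets
      int mcq = mch + ((rows - mch) / Pq) * Pq;  // ...then quarter packets
      assert(mc3 <= mc2 && mc2 <= mc1 && mc1 <= mch && mch <= mcq && mcq <= rows);
      return 0;
    }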
@@ -964,36 +1478,48 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
964
1478
  for(Index k=0; k<peeled_kc; k+=pk)
965
1479
  {
966
1480
  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
967
- RhsPacket B_0, T0;
1481
+ // 15 registers are taken (12 for acc, 2 for lhs).
1482
+ RhsPanel15 rhs_panel;
1483
+ RhsPacket T0;
968
1484
  LhsPacket A2;
969
-
970
- #define EIGEN_GEBP_ONESTEP(K) \
971
- do { \
972
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
1485
+ #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
1486
+ // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
1487
+ // without this workaround A0, A1, and A2 are loaded in the same register,
1488
+ // which is not good for pipelining
1489
+ #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
1490
+ #else
1491
+ #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
1492
+ #endif
1493
+ #define EIGEN_GEBP_ONESTEP(K) \
1494
+ do { \
1495
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
973
1496
  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
974
- internal::prefetch(blA+(3*K+16)*LhsProgress); \
975
- if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
976
- traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
977
- traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
978
- traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
979
- traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
980
- traits.madd(A0, B_0, C0, T0); \
981
- traits.madd(A1, B_0, C4, T0); \
982
- traits.madd(A2, B_0, C8, B_0); \
983
- traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
984
- traits.madd(A0, B_0, C1, T0); \
985
- traits.madd(A1, B_0, C5, T0); \
986
- traits.madd(A2, B_0, C9, B_0); \
987
- traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
988
- traits.madd(A0, B_0, C2, T0); \
989
- traits.madd(A1, B_0, C6, T0); \
990
- traits.madd(A2, B_0, C10, B_0); \
991
- traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
992
- traits.madd(A0, B_0, C3 , T0); \
993
- traits.madd(A1, B_0, C7, T0); \
994
- traits.madd(A2, B_0, C11, B_0); \
995
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
996
- } while(false)
1497
+ internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
1498
+ if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
1499
+ internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
1500
+ } /* Bug 953 */ \
1501
+ traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1502
+ traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1503
+ traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1504
+ EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
1505
+ traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \
1506
+ traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1507
+ traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1508
+ traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
1509
+ traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \
1510
+ traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1511
+ traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1512
+ traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
1513
+ traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \
1514
+ traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1515
+ traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1516
+ traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
1517
+ traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \
1518
+ traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1519
+ traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1520
+ traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
1521
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
1522
+ } while (false)
997
1523
 
998
1524
  internal::prefetch(blB);
999
1525
  EIGEN_GEBP_ONESTEP(0);
@@ -1013,7 +1539,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1013
1539
  // process remaining peeled loop
1014
1540
  for(Index k=peeled_kc; k<depth; k++)
1015
1541
  {
1016
- RhsPacket B_0, T0;
1542
+ RhsPanel15 rhs_panel;
1543
+ RhsPacket T0;
1017
1544
  LhsPacket A2;
1018
1545
  EIGEN_GEBP_ONESTEP(0);
1019
1546
  blB += 4*RhsProgress;
@@ -1025,9 +1552,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1025
1552
  ResPacket R0, R1, R2;
1026
1553
  ResPacket alphav = pset1<ResPacket>(alpha);
1027
1554
 
1028
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1029
- R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1030
- R2 = r0.loadPacket(2 * Traits::ResPacketSize);
1555
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1556
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1557
+ R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1031
1558
  traits.acc(C0, alphav, R0);
1032
1559
  traits.acc(C4, alphav, R1);
1033
1560
  traits.acc(C8, alphav, R2);
@@ -1035,9 +1562,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1035
1562
  r0.storePacket(1 * Traits::ResPacketSize, R1);
1036
1563
  r0.storePacket(2 * Traits::ResPacketSize, R2);
1037
1564
 
1038
- R0 = r1.loadPacket(0 * Traits::ResPacketSize);
1039
- R1 = r1.loadPacket(1 * Traits::ResPacketSize);
1040
- R2 = r1.loadPacket(2 * Traits::ResPacketSize);
1565
+ R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1566
+ R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1567
+ R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1041
1568
  traits.acc(C1, alphav, R0);
1042
1569
  traits.acc(C5, alphav, R1);
1043
1570
  traits.acc(C9, alphav, R2);
@@ -1045,9 +1572,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1045
1572
  r1.storePacket(1 * Traits::ResPacketSize, R1);
1046
1573
  r1.storePacket(2 * Traits::ResPacketSize, R2);
1047
1574
 
1048
- R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1049
- R1 = r2.loadPacket(1 * Traits::ResPacketSize);
1050
- R2 = r2.loadPacket(2 * Traits::ResPacketSize);
1575
+ R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1576
+ R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1577
+ R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1051
1578
  traits.acc(C2, alphav, R0);
1052
1579
  traits.acc(C6, alphav, R1);
1053
1580
  traits.acc(C10, alphav, R2);
@@ -1055,9 +1582,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1055
1582
  r2.storePacket(1 * Traits::ResPacketSize, R1);
1056
1583
  r2.storePacket(2 * Traits::ResPacketSize, R2);
1057
1584
 
1058
- R0 = r3.loadPacket(0 * Traits::ResPacketSize);
1059
- R1 = r3.loadPacket(1 * Traits::ResPacketSize);
1060
- R2 = r3.loadPacket(2 * Traits::ResPacketSize);
1585
+ R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1586
+ R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1587
+ R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1061
1588
  traits.acc(C3, alphav, R0);
1062
1589
  traits.acc(C7, alphav, R1);
1063
1590
  traits.acc(C11, alphav, R2);
@@ -1093,20 +1620,20 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1093
1620
  {
1094
1621
  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
1095
1622
  RhsPacket B_0;
1096
- #define EIGEN_GEBGP_ONESTEP(K) \
1097
- do { \
1098
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1623
+ #define EIGEN_GEBGP_ONESTEP(K) \
1624
+ do { \
1625
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1099
1626
  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1100
- traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
1101
- traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
1102
- traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
1103
- traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1104
- traits.madd(A0, B_0, C0, B_0); \
1105
- traits.madd(A1, B_0, C4, B_0); \
1106
- traits.madd(A2, B_0, C8, B_0); \
1107
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1108
- } while(false)
1109
-
1627
+ traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1628
+ traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1629
+ traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1630
+ traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
1631
+ traits.madd(A0, B_0, C0, B_0, fix<0>); \
1632
+ traits.madd(A1, B_0, C4, B_0, fix<0>); \
1633
+ traits.madd(A2, B_0, C8, B_0, fix<0>); \
1634
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1635
+ } while (false)
1636
+
1110
1637
  EIGEN_GEBGP_ONESTEP(0);
1111
1638
  EIGEN_GEBGP_ONESTEP(1);
1112
1639
  EIGEN_GEBGP_ONESTEP(2);
@@ -1116,8 +1643,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1116
1643
  EIGEN_GEBGP_ONESTEP(6);
1117
1644
  EIGEN_GEBGP_ONESTEP(7);
1118
1645
 
1119
- blB += pk*RhsProgress;
1120
- blA += pk*3*Traits::LhsProgress;
1646
+ blB += int(pk) * int(RhsProgress);
1647
+ blA += int(pk) * 3 * int(Traits::LhsProgress);
1121
1648
 
1122
1649
  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
1123
1650
  }
@@ -1134,9 +1661,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1134
1661
  ResPacket R0, R1, R2;
1135
1662
  ResPacket alphav = pset1<ResPacket>(alpha);
1136
1663
 
1137
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1138
- R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1139
- R2 = r0.loadPacket(2 * Traits::ResPacketSize);
1664
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1665
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1666
+ R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1140
1667
  traits.acc(C0, alphav, R0);
1141
1668
  traits.acc(C4, alphav, R1);
1142
1669
  traits.acc(C8, alphav, R2);
@@ -1195,7 +1722,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1195
1722
  for(Index k=0; k<peeled_kc; k+=pk)
1196
1723
  {
1197
1724
  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
1198
- RhsPacket B_0, B1, B2, B3, T0;
1725
+ RhsPacketx4 rhs_panel;
1726
+ RhsPacket T0;
1199
1727
 
1200
1728
  // NOTE: the begin/end asm comments below work around bug 935!
1201
1729
  // but they are not enough for gcc>=6 without FMA (bug 1637)
@@ -1204,24 +1732,24 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1204
1732
  #else
1205
1733
  #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
1206
1734
  #endif
1207
- #define EIGEN_GEBGP_ONESTEP(K) \
1208
- do { \
1209
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
1210
- traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1211
- traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1212
- traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1213
- traits.madd(A0, B_0, C0, T0); \
1214
- traits.madd(A1, B_0, C4, B_0); \
1215
- traits.madd(A0, B1, C1, T0); \
1216
- traits.madd(A1, B1, C5, B1); \
1217
- traits.madd(A0, B2, C2, T0); \
1218
- traits.madd(A1, B2, C6, B2); \
1219
- traits.madd(A0, B3, C3, T0); \
1220
- traits.madd(A1, B3, C7, B3); \
1221
- EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
1222
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
1223
- } while(false)
1224
-
1735
+ #define EIGEN_GEBGP_ONESTEP(K) \
1736
+ do { \
1737
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
1738
+ traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
1739
+ traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
1740
+ traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
1741
+ traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1742
+ traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1743
+ traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1744
+ traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1745
+ traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1746
+ traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1747
+ traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1748
+ traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1749
+ EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
1750
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
1751
+ } while (false)
1752
+
1225
1753
  internal::prefetch(blB+(48+0));
1226
1754
  EIGEN_GEBGP_ONESTEP(0);
1227
1755
  EIGEN_GEBGP_ONESTEP(1);
@@ -1241,7 +1769,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1241
1769
  // process remaining peeled loop
1242
1770
  for(Index k=peeled_kc; k<depth; k++)
1243
1771
  {
1244
- RhsPacket B_0, B1, B2, B3, T0;
1772
+ RhsPacketx4 rhs_panel;
1773
+ RhsPacket T0;
1245
1774
  EIGEN_GEBGP_ONESTEP(0);
1246
1775
  blB += 4*RhsProgress;
1247
1776
  blA += 2*Traits::LhsProgress;
@@ -1251,10 +1780,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1251
1780
  ResPacket R0, R1, R2, R3;
1252
1781
  ResPacket alphav = pset1<ResPacket>(alpha);
1253
1782
 
1254
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1255
- R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1256
- R2 = r1.loadPacket(0 * Traits::ResPacketSize);
1257
- R3 = r1.loadPacket(1 * Traits::ResPacketSize);
1783
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1784
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1785
+ R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1786
+ R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1258
1787
  traits.acc(C0, alphav, R0);
1259
1788
  traits.acc(C4, alphav, R1);
1260
1789
  traits.acc(C1, alphav, R2);
@@ -1264,10 +1793,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1264
1793
  r1.storePacket(0 * Traits::ResPacketSize, R2);
1265
1794
  r1.storePacket(1 * Traits::ResPacketSize, R3);
1266
1795
 
1267
- R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1268
- R1 = r2.loadPacket(1 * Traits::ResPacketSize);
1269
- R2 = r3.loadPacket(0 * Traits::ResPacketSize);
1270
- R3 = r3.loadPacket(1 * Traits::ResPacketSize);
1796
+ R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1797
+ R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1798
+ R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1799
+ R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1271
1800
  traits.acc(C2, alphav, R0);
1272
1801
  traits.acc(C6, alphav, R1);
1273
1802
  traits.acc(C3, alphav, R2);
@@ -1312,8 +1841,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1312
1841
  traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1313
1842
  traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1314
1843
  traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1315
- traits.madd(A0, B_0, C0, B1); \
1316
- traits.madd(A1, B_0, C4, B_0); \
1844
+ traits.madd(A0, B_0, C0, B1, fix<0>); \
1845
+ traits.madd(A1, B_0, C4, B_0, fix<0>); \
1317
1846
  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
1318
1847
  } while(false)
1319
1848
 
@@ -1326,8 +1855,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1326
1855
  EIGEN_GEBGP_ONESTEP(6);
1327
1856
  EIGEN_GEBGP_ONESTEP(7);
1328
1857
 
1329
- blB += pk*RhsProgress;
1330
- blA += pk*2*Traits::LhsProgress;
1858
+ blB += int(pk) * int(RhsProgress);
1859
+ blA += int(pk) * 2 * int(Traits::LhsProgress);
1331
1860
 
1332
1861
  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
1333
1862
  }
@@ -1344,8 +1873,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1344
1873
  ResPacket R0, R1;
1345
1874
  ResPacket alphav = pset1<ResPacket>(alpha);
1346
1875
 
1347
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1348
- R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1876
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1877
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1349
1878
  traits.acc(C0, alphav, R0);
1350
1879
  traits.acc(C4, alphav, R1);
1351
1880
  r0.storePacket(0 * Traits::ResPacketSize, R0);
@@ -1357,186 +1886,43 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1357
1886
  //---------- Process 1 * LhsProgress rows at once ----------
1358
1887
  if(mr>=1*Traits::LhsProgress)
1359
1888
  {
1360
- // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
1361
- for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
1362
- {
1363
- // loops on each largest micro vertical panel of rhs (depth * nr)
1364
- for(Index j2=0; j2<packet_cols4; j2+=nr)
1365
- {
1366
- // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely
1367
- // stored into 1 x nr registers.
1368
-
1369
- const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
1370
- prefetch(&blA[0]);
1371
-
1372
- // gets res block as register
1373
- AccPacket C0, C1, C2, C3;
1374
- traits.initAcc(C0);
1375
- traits.initAcc(C1);
1376
- traits.initAcc(C2);
1377
- traits.initAcc(C3);
1378
-
1379
- LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1380
- LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1381
- LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1382
- LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1383
-
1384
- r0.prefetch(prefetch_res_offset);
1385
- r1.prefetch(prefetch_res_offset);
1386
- r2.prefetch(prefetch_res_offset);
1387
- r3.prefetch(prefetch_res_offset);
1388
-
1389
- // performs "inner" products
1390
- const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1391
- prefetch(&blB[0]);
1392
- LhsPacket A0;
1393
-
1394
- for(Index k=0; k<peeled_kc; k+=pk)
1395
- {
1396
- EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
1397
- RhsPacket B_0, B1, B2, B3;
1398
-
1399
- #define EIGEN_GEBGP_ONESTEP(K) \
1400
- do { \
1401
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
1402
- EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1403
- traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1404
- traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1405
- traits.madd(A0, B_0, C0, B_0); \
1406
- traits.madd(A0, B1, C1, B1); \
1407
- traits.madd(A0, B2, C2, B2); \
1408
- traits.madd(A0, B3, C3, B3); \
1409
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
1410
- } while(false)
1411
-
1412
- internal::prefetch(blB+(48+0));
1413
- EIGEN_GEBGP_ONESTEP(0);
1414
- EIGEN_GEBGP_ONESTEP(1);
1415
- EIGEN_GEBGP_ONESTEP(2);
1416
- EIGEN_GEBGP_ONESTEP(3);
1417
- internal::prefetch(blB+(48+16));
1418
- EIGEN_GEBGP_ONESTEP(4);
1419
- EIGEN_GEBGP_ONESTEP(5);
1420
- EIGEN_GEBGP_ONESTEP(6);
1421
- EIGEN_GEBGP_ONESTEP(7);
1422
-
1423
- blB += pk*4*RhsProgress;
1424
- blA += pk*1*LhsProgress;
1425
-
1426
- EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
1427
- }
1428
- // process remaining peeled loop
1429
- for(Index k=peeled_kc; k<depth; k++)
1430
- {
1431
- RhsPacket B_0, B1, B2, B3;
1432
- EIGEN_GEBGP_ONESTEP(0);
1433
- blB += 4*RhsProgress;
1434
- blA += 1*LhsProgress;
1435
- }
1436
- #undef EIGEN_GEBGP_ONESTEP
1437
-
1438
- ResPacket R0, R1;
1439
- ResPacket alphav = pset1<ResPacket>(alpha);
1440
-
1441
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1442
- R1 = r1.loadPacket(0 * Traits::ResPacketSize);
1443
- traits.acc(C0, alphav, R0);
1444
- traits.acc(C1, alphav, R1);
1445
- r0.storePacket(0 * Traits::ResPacketSize, R0);
1446
- r1.storePacket(0 * Traits::ResPacketSize, R1);
1447
-
1448
- R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1449
- R1 = r3.loadPacket(0 * Traits::ResPacketSize);
1450
- traits.acc(C2, alphav, R0);
1451
- traits.acc(C3, alphav, R1);
1452
- r2.storePacket(0 * Traits::ResPacketSize, R0);
1453
- r3.storePacket(0 * Traits::ResPacketSize, R1);
1454
- }
1455
-
1456
- // Deal with remaining columns of the rhs
1457
- for(Index j2=packet_cols4; j2<cols; j2++)
1458
- {
1459
- // One column at a time
1460
- const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
1461
- prefetch(&blA[0]);
1462
-
1463
- // gets res block as register
1464
- AccPacket C0;
1465
- traits.initAcc(C0);
1466
-
1467
- LinearMapper r0 = res.getLinearMapper(i, j2);
1468
-
1469
- // performs "inner" products
1470
- const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1471
- LhsPacket A0;
1472
-
1473
- for(Index k=0; k<peeled_kc; k+=pk)
1474
- {
1475
- EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
1476
- RhsPacket B_0;
1477
-
1478
- #define EIGEN_GEBGP_ONESTEP(K) \
1479
- do { \
1480
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
1481
- EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1482
- traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1483
- traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1484
- traits.madd(A0, B_0, C0, B_0); \
1485
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
1486
- } while(false);
1487
-
1488
- EIGEN_GEBGP_ONESTEP(0);
1489
- EIGEN_GEBGP_ONESTEP(1);
1490
- EIGEN_GEBGP_ONESTEP(2);
1491
- EIGEN_GEBGP_ONESTEP(3);
1492
- EIGEN_GEBGP_ONESTEP(4);
1493
- EIGEN_GEBGP_ONESTEP(5);
1494
- EIGEN_GEBGP_ONESTEP(6);
1495
- EIGEN_GEBGP_ONESTEP(7);
1496
-
1497
- blB += pk*RhsProgress;
1498
- blA += pk*1*Traits::LhsProgress;
1499
-
1500
- EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
1501
- }
1502
-
1503
- // process remaining peeled loop
1504
- for(Index k=peeled_kc; k<depth; k++)
1505
- {
1506
- RhsPacket B_0;
1507
- EIGEN_GEBGP_ONESTEP(0);
1508
- blB += RhsProgress;
1509
- blA += 1*Traits::LhsProgress;
1510
- }
1511
- #undef EIGEN_GEBGP_ONESTEP
1512
- ResPacket R0;
1513
- ResPacket alphav = pset1<ResPacket>(alpha);
1514
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1515
- traits.acc(C0, alphav, R0);
1516
- r0.storePacket(0 * Traits::ResPacketSize, R0);
1517
- }
1518
- }
1889
+ lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
1890
+ p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1891
+ }
1892
+ //---------- Process LhsProgressHalf rows at once ----------
1893
+ if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
1894
+ {
1895
+ lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
1896
+ p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1897
+ }
1898
+ //---------- Process LhsProgressQuarter rows at once ----------
1899
+ if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
1900
+ {
1901
+ lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
1902
+ p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1519
1903
  }
1520
1904
  //---------- Process remaining rows, 1 at once ----------
1521
- if(peeled_mc1<rows)
1905
+ if(peeled_mc_quarter<rows)
1522
1906
  {
1523
1907
  // loop on each panel of the rhs
1524
1908
  for(Index j2=0; j2<packet_cols4; j2+=nr)
1525
1909
  {
1526
1910
  // loop on each row of the lhs (1*LhsProgress x depth)
1527
- for(Index i=peeled_mc1; i<rows; i+=1)
1911
+ for(Index i=peeled_mc_quarter; i<rows; i+=1)
1528
1912
  {
1529
1913
  const LhsScalar* blA = &blockA[i*strideA+offsetA];
1530
1914
  prefetch(&blA[0]);
1531
1915
  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1532
1916
 
1533
- // The following piece of code wont work for 512 bit registers
1534
- // Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
1535
- // as nr (which is currently 4) for the return type.
1917
+ // If LhsProgress is 8 or 16, it assumes that there is a
1918
+ // half or quarter packet, respectively, of the same size as
1919
+ // nr (which is currently 4) for the return type.
1536
1920
  const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
1921
+ const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
1537
1922
  if ((SwappedTraits::LhsProgress % 4) == 0 &&
1538
- (SwappedTraits::LhsProgress <= 8) &&
1539
- (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr))
1923
+ (SwappedTraits::LhsProgress<=16) &&
1924
+ (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) &&
1925
+ (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
1540
1926
  {
1541
1927
  SAccPacket C0, C1, C2, C3;
1542
1928
  straits.initAcc(C0);
@@ -1559,15 +1945,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1559
1945
 
1560
1946
  straits.loadRhsQuad(blA+0*spk, B_0);
1561
1947
  straits.loadRhsQuad(blA+1*spk, B_1);
1562
- straits.madd(A0,B_0,C0,B_0);
1563
- straits.madd(A1,B_1,C1,B_1);
1948
+ straits.madd(A0,B_0,C0,B_0, fix<0>);
1949
+ straits.madd(A1,B_1,C1,B_1, fix<0>);
1564
1950
 
1565
1951
  straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
1566
1952
  straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
1567
1953
  straits.loadRhsQuad(blA+2*spk, B_0);
1568
1954
  straits.loadRhsQuad(blA+3*spk, B_1);
1569
- straits.madd(A0,B_0,C2,B_0);
1570
- straits.madd(A1,B_1,C3,B_1);
1955
+ straits.madd(A0,B_0,C2,B_0, fix<0>);
1956
+ straits.madd(A1,B_1,C3,B_1, fix<0>);
1571
1957
 
1572
1958
  blB += 4*SwappedTraits::LhsProgress;
1573
1959
  blA += 4*spk;
@@ -1580,7 +1966,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1580
1966
 
1581
1967
  straits.loadLhsUnaligned(blB, A0);
1582
1968
  straits.loadRhsQuad(blA, B_0);
1583
- straits.madd(A0,B_0,C0,B_0);
1969
+ straits.madd(A0,B_0,C0,B_0, fix<0>);
1584
1970
 
1585
1971
  blB += SwappedTraits::LhsProgress;
1586
1972
  blA += spk;
@@ -1590,7 +1976,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1590
1976
  // Special case where we have to first reduce the accumulation register C0
1591
1977
  typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
1592
1978
  typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
1593
- typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
1979
+ typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
1594
1980
  typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
1595
1981
 
1596
1982
  SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
@@ -1603,16 +1989,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1603
1989
  SRhsPacketHalf b0;
1604
1990
  straits.loadLhsUnaligned(blB, a0);
1605
1991
  straits.loadRhs(blA, b0);
1606
- SAccPacketHalf c0 = predux_downto4(C0);
1607
- straits.madd(a0,b0,c0,b0);
1992
+ SAccPacketHalf c0 = predux_half_dowto4(C0);
1993
+ straits.madd(a0,b0,c0,b0, fix<0>);
1608
1994
  straits.acc(c0, alphav, R);
1609
1995
  }
1610
1996
  else
1611
1997
  {
1612
- straits.acc(predux_downto4(C0), alphav, R);
1998
+ straits.acc(predux_half_dowto4(C0), alphav, R);
1613
1999
  }
1614
2000
  res.scatterPacket(i, j2, R);
1615
2001
  }
2002
+ else if (SwappedTraits::LhsProgress==16)
2003
+ {
2004
+ // Special case where we have to first reduce the
2005
+ // accumulation register C0. We specialize the block in
2006
+ // template form, so that LhsProgress < 16 paths don't
2007
+ // fail to compile
2008
+ last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
2009
+ p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0);
2010
+ }
1616
2011
  else
1617
2012
  {
1618
2013
  SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
@@ -1635,14 +2030,14 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1635
2030
 
1636
2031
  B_0 = blB[0];
1637
2032
  B_1 = blB[1];
1638
- CJMADD(cj,A0,B_0,C0, B_0);
1639
- CJMADD(cj,A0,B_1,C1, B_1);
1640
-
2033
+ C0 = cj.pmadd(A0,B_0,C0);
2034
+ C1 = cj.pmadd(A0,B_1,C1);
2035
+
1641
2036
  B_0 = blB[2];
1642
2037
  B_1 = blB[3];
1643
- CJMADD(cj,A0,B_0,C2, B_0);
1644
- CJMADD(cj,A0,B_1,C3, B_1);
1645
-
2038
+ C2 = cj.pmadd(A0,B_0,C2);
2039
+ C3 = cj.pmadd(A0,B_1,C3);
2040
+
1646
2041
  blB += 4;
1647
2042
  }
1648
2043
  res(i, j2 + 0) += alpha * C0;
@@ -1656,7 +2051,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1656
2051
  for(Index j2=packet_cols4; j2<cols; j2++)
1657
2052
  {
1658
2053
  // loop on each row of the lhs (1*LhsProgress x depth)
1659
- for(Index i=peeled_mc1; i<rows; i+=1)
2054
+ for(Index i=peeled_mc_quarter; i<rows; i+=1)
1660
2055
  {
1661
2056
  const LhsScalar* blA = &blockA[i*strideA+offsetA];
1662
2057
  prefetch(&blA[0]);
@@ -1667,7 +2062,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1667
2062
  {
1668
2063
  LhsScalar A0 = blA[k];
1669
2064
  RhsScalar B_0 = blB[k];
1670
- CJMADD(cj, A0, B_0, C0, B_0);
2065
+ C0 = cj.pmadd(A0, B_0, C0);
1671
2066
  }
1672
2067
  res(i, j2) += alpha * C0;
1673
2068
  }
@@ -1676,8 +2071,6 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1676
2071
  }
1677
2072
 
1678
2073
 
1679
- #undef CJMADD
1680
-
1681
2074
  // pack a block of the lhs
1682
2075
  // The traversal is as follow (mr==4):
1683
2076
  // 0 4 8 12 ...
@@ -1692,19 +2085,24 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1692
2085
  //
1693
2086
  // 32 33 34 35 ...
1694
2087
  // 36 36 38 39 ...
1695
- template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1696
- struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
2088
+ template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2089
+ struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
1697
2090
  {
1698
2091
  typedef typename DataMapper::LinearMapper LinearMapper;
1699
2092
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
1700
2093
  };
1701
2094
 
1702
- template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1703
- EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
2095
+ template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2096
+ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
1704
2097
  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
1705
2098
  {
1706
- typedef typename packet_traits<Scalar>::type Packet;
1707
- enum { PacketSize = packet_traits<Scalar>::size };
2099
+ typedef typename unpacket_traits<Packet>::half HalfPacket;
2100
+ typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2101
+ enum { PacketSize = unpacket_traits<Packet>::size,
2102
+ HalfPacketSize = unpacket_traits<HalfPacket>::size,
2103
+ QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2104
+ HasHalf = (int)HalfPacketSize < (int)PacketSize,
2105
+ HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
1708
2106
 
1709
2107
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
1710
2108
  EIGEN_UNUSED_VARIABLE(stride);
@@ -1716,9 +2114,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
1716
2114
 
1717
2115
  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
1718
2116
  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
1719
- const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
1720
- const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
1721
- : Pack2>1 ? (rows/Pack2)*Pack2 : 0;
2117
+ const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
2118
+ const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
2119
+ const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
2120
+ const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
2121
+ const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
2122
+ : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;
1722
2123
 
1723
2124
  Index i=0;
1724
2125
 
@@ -1732,9 +2133,9 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
       for(Index k=0; k<depth; k++)
       {
         Packet A, B, C;
-        A = lhs.loadPacket(i+0*PacketSize, k);
-        B = lhs.loadPacket(i+1*PacketSize, k);
-        C = lhs.loadPacket(i+2*PacketSize, k);
+        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
+        B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
+        C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
         pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
         pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
         pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
@@ -1752,8 +2153,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
       for(Index k=0; k<depth; k++)
       {
         Packet A, B;
-        A = lhs.loadPacket(i+0*PacketSize, k);
-        B = lhs.loadPacket(i+1*PacketSize, k);
+        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
+        B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
         pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
         pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
       }
@@ -1770,27 +2171,67 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
       for(Index k=0; k<depth; k++)
       {
         Packet A;
-        A = lhs.loadPacket(i+0*PacketSize, k);
+        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
         pstore(blockA+count, cj.pconj(A));
         count+=PacketSize;
       }
       if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
     }
   }
-  // Pack scalars
+  // Pack half packets
+  if(HasHalf && Pack1>=HalfPacketSize)
+  {
+    for(; i<peeled_mc_half; i+=HalfPacketSize)
+    {
+      if(PanelMode) count += (HalfPacketSize) * offset;
+
+      for(Index k=0; k<depth; k++)
+      {
+        HalfPacket A;
+        A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
+        pstoreu(blockA+count, cj.pconj(A));
+        count+=HalfPacketSize;
+      }
+      if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
+    }
+  }
+  // Pack quarter packets
+  if(HasQuarter && Pack1>=QuarterPacketSize)
+  {
+    for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
+    {
+      if(PanelMode) count += (QuarterPacketSize) * offset;
+
+      for(Index k=0; k<depth; k++)
+      {
+        QuarterPacket A;
+        A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
+        pstoreu(blockA+count, cj.pconj(A));
+        count+=QuarterPacketSize;
+      }
+      if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
+    }
+  }
+  // Pack2 may be *smaller* than PacketSize -- that happens for
+  // products like real * complex, where we have to go half the
+  // progress on the lhs in order to duplicate those operands to
+  // address both real & imaginary parts on the rhs. This portion will
+  // pack those half ones until they match the number expected on the
+  // last peeling loop at this point (for the rhs).
   if(Pack2<PacketSize && Pack2>1)
   {
-    for(; i<peeled_mc0; i+=Pack2)
+    for(; i<peeled_mc0; i+=last_lhs_progress)
     {
-      if(PanelMode) count += Pack2 * offset;
+      if(PanelMode) count += last_lhs_progress * offset;
 
       for(Index k=0; k<depth; k++)
-        for(Index w=0; w<Pack2; w++)
+        for(Index w=0; w<last_lhs_progress; w++)
           blockA[count++] = cj(lhs(i+w, k));
 
-      if(PanelMode) count += Pack2 * (stride-offset-depth);
+      if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
     }
   }
+  // Pack scalars
   for(; i<rows; i++)
   {
     if(PanelMode) count += offset;
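
Every one of these loops brackets its stores with the same PanelMode bookkeeping. A toy check (pack, depth, stride, and offset values are made up) that each micro-panel then occupies exactly pack*stride entries of blockA:

    #include <cassert>

    int main() {
      // Made-up panel geometry: pack values per step, depth rows actually
      // packed, out of a reserved stride, starting offset rows in.
      const int pack = 4, depth = 3, stride = 5, offset = 1;
      int count = 0;
      count += pack * offset;                    // skip what we have before
      for (int k = 0; k < depth; k++)
        count += pack;                           // the packed coefficients
      count += pack * (stride - offset - depth); // skip what we have after
      assert(count == pack * stride);            // panel fills pack*stride slots
      return 0;
    }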
@@ -1800,19 +2241,24 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
   }
 }
 
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
 {
   typedef typename DataMapper::LinearMapper LinearMapper;
   EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
 };
 
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
 ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-  typedef typename packet_traits<Scalar>::type Packet;
-  enum { PacketSize = packet_traits<Scalar>::size };
+  typedef typename unpacket_traits<Packet>::half HalfPacket;
+  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
+  enum { PacketSize = unpacket_traits<Packet>::size,
+         HalfPacketSize = unpacket_traits<HalfPacket>::size,
+         QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
+         HasHalf = (int)HalfPacketSize < (int)PacketSize,
+         HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
 
   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
   EIGEN_UNUSED_VARIABLE(stride);
@@ -1820,37 +2266,51 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Ro
   eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
   conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
   Index count = 0;
+  bool gone_half = false, gone_quarter = false, gone_last = false;
 
-  // const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
-  // const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
-  // const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
-
-  int pack = Pack1;
   Index i = 0;
+  int pack = Pack1;
+  int psize = PacketSize;
   while(pack>0)
   {
     Index remaining_rows = rows-i;
-    Index peeled_mc = i+(remaining_rows/pack)*pack;
+    Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
+    Index starting_pos = i;
     for(; i<peeled_mc; i+=pack)
     {
       if(PanelMode) count += pack * offset;
 
-      const Index peeled_k = (depth/PacketSize)*PacketSize;
       Index k=0;
-      if(pack>=PacketSize)
+      if(pack>=psize && psize >= QuarterPacketSize)
      {
-        for(; k<peeled_k; k+=PacketSize)
+        const Index peeled_k = (depth/psize)*psize;
+        for(; k<peeled_k; k+=psize)
        {
-          for (Index m = 0; m < pack; m += PacketSize)
+          for (Index m = 0; m < pack; m += psize)
          {
-            PacketBlock<Packet> kernel;
-            for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
-            ptranspose(kernel);
-            for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
+            if (psize == PacketSize) {
+              PacketBlock<Packet> kernel;
+              for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
+              ptranspose(kernel);
+              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
+            } else if (HasHalf && psize == HalfPacketSize) {
+              gone_half = true;
+              PacketBlock<HalfPacket> kernel_half;
+              for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
+              ptranspose(kernel_half);
+              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
+            } else if (HasQuarter && psize == QuarterPacketSize) {
+              gone_quarter = true;
+              PacketBlock<QuarterPacket> kernel_quarter;
+              for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
+              ptranspose(kernel_quarter);
+              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
+            }
          }
-          count += PacketSize*pack;
+          count += psize*pack;
        }
      }
+
      for(; k<depth; k++)
      {
        Index w=0;
@@ -1873,9 +2333,28 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Ro
      if(PanelMode) count += pack * (stride-offset-depth);
    }
 
-    pack -= PacketSize;
-    if(pack<Pack2 && (pack+PacketSize)!=Pack2)
-      pack = Pack2;
+    pack -= psize;
+    Index left = rows - i;
+    if (pack <= 0) {
+      if (!gone_last &&
+          (starting_pos == i || left >= psize/2 || left >= psize/4) &&
+          ((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
+           (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
+        psize /= 2;
+        pack = psize;
+        continue;
+      }
+      // Pack2 may be *smaller* than PacketSize -- that happens for
+      // products like real * complex, where we have to go half the
+      // progress on the lhs in order to duplicate those operands to
+      // address both real & imaginary parts on the rhs. This portion will
+      // pack those half ones until they match the number expected on the
+      // last peeling loop at this point (for the rhs).
+      if (Pack2 < PacketSize && !gone_last) {
+        gone_last = true;
+        psize = pack = left & ~1;
+      }
+    }
  }
 
  for(; i<rows; i++)
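
The pack/psize descent above is easier to follow in isolation. Below is a rough standalone model of it; it ignores gone_last, starting_pos, and the Pack2 fallback, repurposes the gone_* flags to mean "this width was already visited", and assumes Pack1=8 with AVX-float-like sizes:

    #include <cstdio>

    int main() {
      const int HalfPacketSize = 4, QuarterPacketSize = 2;  // assumed sizes
      const int rows = 15;
      int i = 0, pack = 8, psize = 8;                       // Pack1 = PacketSize = 8
      bool gone_half = false, gone_quarter = false;
      while (pack > 0) {
        int remaining = rows - i;
        int peeled = i + (remaining / pack) * pack;
        for (; i < peeled; i += pack)
          printf("packed %d rows at row %d\n", pack, i);
        pack -= psize;
        int left = rows - i;
        // Descend to the next narrower width, each width at most once.
        if (pack <= 0 && left >= psize / 2 &&
            ((psize / 2 == HalfPacketSize && !gone_half) ||
             (psize / 2 == QuarterPacketSize && !gone_quarter))) {
          (psize / 2 == HalfPacketSize ? gone_half : gone_quarter) = true;
          psize /= 2;
          pack = psize;
        }
      }
      printf("%d scalar row(s) remain\n", rows - i);  // 8@0, 4@8, 2@12; 1 left
      return 0;
    }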
@@ -1931,7 +2410,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
 //     const Scalar* b6 = &rhs[(j2+6)*rhsStride];
 //     const Scalar* b7 = &rhs[(j2+7)*rhsStride];
 //     Index k=0;
-//     if(PacketSize==8) // TODO enbale vectorized transposition for PacketSize==4
+//     if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
 //     {
 //       for(; k<peeled_k; k+=PacketSize) {
 //         PacketBlock<Packet> kernel;
@@ -1978,10 +2457,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
     {
       for(; k<peeled_k; k+=PacketSize) {
         PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
-        kernel.packet[0] = dm0.loadPacket(k);
-        kernel.packet[1%PacketSize] = dm1.loadPacket(k);
-        kernel.packet[2%PacketSize] = dm2.loadPacket(k);
-        kernel.packet[3%PacketSize] = dm3.loadPacket(k);
+        kernel.packet[0           ] = dm0.template loadPacket<Packet>(k);
+        kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
+        kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
+        kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
         ptranspose(kernel);
         pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
         pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
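
The 1%PacketSize, 2%PacketSize, 3%PacketSize subscripts look odd but keep the compile-time indices inside the PacketBlock, which holds only PacketSize packets when PacketSize is not a multiple of 4. A tiny standalone illustration of the folding (show<> is a made-up helper):

    #include <cstdio>

    // blocksize mirrors PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize>.
    template <int PacketSize>
    void show() {
      const int blocksize = (PacketSize % 4) == 0 ? 4 : PacketSize;
      printf("PacketSize=%d -> block of %d packets, subscripts:", PacketSize, blocksize);
      for (int p = 0; p < 4; ++p)
        printf(" %d", p % PacketSize);  // always < blocksize, never out of bounds
      printf("\n");
    }

    int main() {
      show<8>();  // 0 1 2 3
      show<4>();  // 0 1 2 3
      show<2>();  // 0 1 0 1  (indices fold; the aliased packets coincide)
      show<1>();  // 0 0 0 0
      return 0;
    }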
@@ -2022,94 +2501,104 @@ template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conj
 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
 {
   typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename unpacket_traits<Packet>::half HalfPacket;
+  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
   typedef typename DataMapper::LinearMapper LinearMapper;
-  enum { PacketSize = packet_traits<Scalar>::size };
-  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
-};
-
-template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
-::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
-  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
-  EIGEN_UNUSED_VARIABLE(stride);
-  EIGEN_UNUSED_VARIABLE(offset);
-  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
-  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
-  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
-  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
-  Index count = 0;
-
-  // if(nr>=8)
-  // {
-  //   for(Index j2=0; j2<packet_cols8; j2+=8)
-  //   {
-  //     // skip what we have before
-  //     if(PanelMode) count += 8 * offset;
-  //     for(Index k=0; k<depth; k++)
-  //     {
-  //       if (PacketSize==8) {
-  //         Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
-  //         pstoreu(blockB+count, cj.pconj(A));
-  //       } else if (PacketSize==4) {
-  //         Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
-  //         Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
-  //         pstoreu(blockB+count, cj.pconj(A));
-  //         pstoreu(blockB+count+PacketSize, cj.pconj(B));
-  //       } else {
-  //         const Scalar* b0 = &rhs[k*rhsStride + j2];
-  //         blockB[count+0] = cj(b0[0]);
-  //         blockB[count+1] = cj(b0[1]);
-  //         blockB[count+2] = cj(b0[2]);
-  //         blockB[count+3] = cj(b0[3]);
-  //         blockB[count+4] = cj(b0[4]);
-  //         blockB[count+5] = cj(b0[5]);
-  //         blockB[count+6] = cj(b0[6]);
-  //         blockB[count+7] = cj(b0[7]);
-  //       }
-  //       count += 8;
-  //     }
-  //     // skip what we have after
-  //     if(PanelMode) count += 8 * (stride-offset-depth);
-  //   }
-  // }
-  if(nr>=4)
+  enum { PacketSize = packet_traits<Scalar>::size,
+         HalfPacketSize = unpacket_traits<HalfPacket>::size,
+         QuarterPacketSize = unpacket_traits<QuarterPacket>::size};
+  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0)
   {
-    for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
+    EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
+    EIGEN_UNUSED_VARIABLE(stride);
+    EIGEN_UNUSED_VARIABLE(offset);
+    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+    const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
+    const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
+    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+    Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+    Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+    Index count = 0;
+
+    // if(nr>=8)
+    // {
+    //   for(Index j2=0; j2<packet_cols8; j2+=8)
+    //   {
+    //     // skip what we have before
+    //     if(PanelMode) count += 8 * offset;
+    //     for(Index k=0; k<depth; k++)
+    //     {
+    //       if (PacketSize==8) {
+    //         Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+    //         pstoreu(blockB+count, cj.pconj(A));
+    //       } else if (PacketSize==4) {
+    //         Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+    //         Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
+    //         pstoreu(blockB+count, cj.pconj(A));
+    //         pstoreu(blockB+count+PacketSize, cj.pconj(B));
+    //       } else {
+    //         const Scalar* b0 = &rhs[k*rhsStride + j2];
+    //         blockB[count+0] = cj(b0[0]);
+    //         blockB[count+1] = cj(b0[1]);
+    //         blockB[count+2] = cj(b0[2]);
+    //         blockB[count+3] = cj(b0[3]);
+    //         blockB[count+4] = cj(b0[4]);
+    //         blockB[count+5] = cj(b0[5]);
+    //         blockB[count+6] = cj(b0[6]);
+    //         blockB[count+7] = cj(b0[7]);
+    //       }
+    //       count += 8;
+    //     }
+    //     // skip what we have after
+    //     if(PanelMode) count += 8 * (stride-offset-depth);
+    //   }
+    // }
+    if(nr>=4)
     {
-      // skip what we have before
-      if(PanelMode) count += 4 * offset;
-      for(Index k=0; k<depth; k++)
+      for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
      {
-        if (PacketSize==4) {
-          Packet A = rhs.loadPacket(k, j2);
-          pstoreu(blockB+count, cj.pconj(A));
-          count += PacketSize;
-        } else {
-          const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
-          blockB[count+0] = cj(dm0(0));
-          blockB[count+1] = cj(dm0(1));
-          blockB[count+2] = cj(dm0(2));
-          blockB[count+3] = cj(dm0(3));
-          count += 4;
+        // skip what we have before
+        if(PanelMode) count += 4 * offset;
+        for(Index k=0; k<depth; k++)
+        {
+          if (PacketSize==4) {
+            Packet A = rhs.template loadPacket<Packet>(k, j2);
+            pstoreu(blockB+count, cj.pconj(A));
+            count += PacketSize;
+          } else if (HasHalf && HalfPacketSize==4) {
+            HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
+            pstoreu(blockB+count, cj.pconj(A));
+            count += HalfPacketSize;
+          } else if (HasQuarter && QuarterPacketSize==4) {
+            QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
+            pstoreu(blockB+count, cj.pconj(A));
+            count += QuarterPacketSize;
+          } else {
+            const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
+            blockB[count+0] = cj(dm0(0));
+            blockB[count+1] = cj(dm0(1));
+            blockB[count+2] = cj(dm0(2));
+            blockB[count+3] = cj(dm0(3));
+            count += 4;
+          }
        }
+        // skip what we have after
+        if(PanelMode) count += 4 * (stride-offset-depth);
      }
-      // skip what we have after
-      if(PanelMode) count += 4 * (stride-offset-depth);
    }
-  }
-  // copy the remaining columns one at a time (nr==1)
-  for(Index j2=packet_cols4; j2<cols; ++j2)
-  {
-    if(PanelMode) count += offset;
-    for(Index k=0; k<depth; k++)
+    // copy the remaining columns one at a time (nr==1)
+    for(Index j2=packet_cols4; j2<cols; ++j2)
    {
-      blockB[count] = cj(rhs(k, j2));
-      count += 1;
+      if(PanelMode) count += offset;
+      for(Index k=0; k<depth; k++)
+      {
+        blockB[count] = cj(rhs(k, j2));
+        count += 1;
+      }
+      if(PanelMode) count += stride-offset-depth;
    }
-    if(PanelMode) count += stride-offset-depth;
  }
-}
+  };
 
 } // end namespace internal
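
For reference, the layout this row-major rhs packer produces can be reproduced with plain scalars. A condensed illustration (toy depth/cols values, no SIMD and no PanelMode accounting; the arrays are made up):

    #include <cstdio>

    int main() {
      const int depth = 2, cols = 5;  // one 4-column panel plus 1 leftover column
      double rhs[depth][cols] = {{0,1,2,3,4},{5,6,7,8,9}};
      double blockB[depth * cols];
      int count = 0;
      int packet_cols4 = (cols / 4) * 4;
      for (int j2 = 0; j2 < packet_cols4; j2 += 4)   // 4-column panels
        for (int k = 0; k < depth; k++)
          for (int w = 0; w < 4; w++)
            blockB[count++] = rhs[k][j2 + w];
      for (int j2 = packet_cols4; j2 < cols; ++j2)   // remaining columns, one at a time
        for (int k = 0; k < depth; k++)
          blockB[count++] = rhs[k][j2];
      for (int n = 0; n < count; n++) printf("%g ", blockB[n]);
      printf("\n");  // prints: 0 1 2 3 5 6 7 8 4 9
      return 0;
    }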