tomoto 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (369)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/README.md +8 -10
  4. data/ext/tomoto/ct.cpp +11 -11
  5. data/ext/tomoto/dmr.cpp +14 -13
  6. data/ext/tomoto/dt.cpp +14 -14
  7. data/ext/tomoto/extconf.rb +7 -5
  8. data/ext/tomoto/gdmr.cpp +7 -7
  9. data/ext/tomoto/hdp.cpp +9 -9
  10. data/ext/tomoto/hlda.cpp +13 -13
  11. data/ext/tomoto/hpa.cpp +5 -5
  12. data/ext/tomoto/lda.cpp +42 -39
  13. data/ext/tomoto/llda.cpp +6 -6
  14. data/ext/tomoto/mglda.cpp +15 -15
  15. data/ext/tomoto/pa.cpp +6 -6
  16. data/ext/tomoto/plda.cpp +6 -6
  17. data/ext/tomoto/slda.cpp +8 -8
  18. data/ext/tomoto/{ext.cpp → tomoto.cpp} +8 -8
  19. data/ext/tomoto/utils.h +16 -70
  20. data/lib/tomoto/version.rb +1 -1
  21. data/lib/tomoto.rb +5 -1
  22. data/vendor/EigenRand/EigenRand/Core.h +10 -10
  23. data/vendor/EigenRand/EigenRand/Dists/Basic.h +208 -9
  24. data/vendor/EigenRand/EigenRand/Dists/Discrete.h +52 -31
  25. data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +9 -8
  26. data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +28 -21
  27. data/vendor/EigenRand/EigenRand/EigenRand +11 -6
  28. data/vendor/EigenRand/EigenRand/Macro.h +13 -7
  29. data/vendor/EigenRand/EigenRand/MorePacketMath.h +348 -740
  30. data/vendor/EigenRand/EigenRand/MvDists/Multinomial.h +5 -3
  31. data/vendor/EigenRand/EigenRand/MvDists/MvNormal.h +9 -3
  32. data/vendor/EigenRand/EigenRand/PacketFilter.h +11 -253
  33. data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +21 -47
  34. data/vendor/EigenRand/EigenRand/RandUtils.h +50 -344
  35. data/vendor/EigenRand/EigenRand/arch/AVX/MorePacketMath.h +619 -0
  36. data/vendor/EigenRand/EigenRand/arch/AVX/PacketFilter.h +149 -0
  37. data/vendor/EigenRand/EigenRand/arch/AVX/RandUtils.h +228 -0
  38. data/vendor/EigenRand/EigenRand/arch/NEON/MorePacketMath.h +473 -0
  39. data/vendor/EigenRand/EigenRand/arch/NEON/PacketFilter.h +142 -0
  40. data/vendor/EigenRand/EigenRand/arch/NEON/RandUtils.h +126 -0
  41. data/vendor/EigenRand/EigenRand/arch/SSE/MorePacketMath.h +501 -0
  42. data/vendor/EigenRand/EigenRand/arch/SSE/PacketFilter.h +133 -0
  43. data/vendor/EigenRand/EigenRand/arch/SSE/RandUtils.h +120 -0
  44. data/vendor/EigenRand/EigenRand/doc.h +24 -12
  45. data/vendor/EigenRand/README.md +57 -4
  46. data/vendor/eigen/COPYING.APACHE +203 -0
  47. data/vendor/eigen/COPYING.BSD +1 -1
  48. data/vendor/eigen/COPYING.MINPACK +51 -52
  49. data/vendor/eigen/Eigen/Cholesky +0 -1
  50. data/vendor/eigen/Eigen/Core +112 -265
  51. data/vendor/eigen/Eigen/Eigenvalues +2 -3
  52. data/vendor/eigen/Eigen/Geometry +5 -8
  53. data/vendor/eigen/Eigen/Householder +0 -1
  54. data/vendor/eigen/Eigen/Jacobi +0 -1
  55. data/vendor/eigen/Eigen/KLUSupport +41 -0
  56. data/vendor/eigen/Eigen/LU +2 -5
  57. data/vendor/eigen/Eigen/OrderingMethods +0 -3
  58. data/vendor/eigen/Eigen/PaStiXSupport +1 -0
  59. data/vendor/eigen/Eigen/PardisoSupport +0 -0
  60. data/vendor/eigen/Eigen/QR +2 -3
  61. data/vendor/eigen/Eigen/QtAlignedMalloc +0 -1
  62. data/vendor/eigen/Eigen/SVD +0 -1
  63. data/vendor/eigen/Eigen/Sparse +0 -2
  64. data/vendor/eigen/Eigen/SparseCholesky +0 -8
  65. data/vendor/eigen/Eigen/SparseLU +4 -0
  66. data/vendor/eigen/Eigen/SparseQR +0 -1
  67. data/vendor/eigen/Eigen/src/Cholesky/LDLT.h +42 -27
  68. data/vendor/eigen/Eigen/src/Cholesky/LLT.h +39 -23
  69. data/vendor/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +90 -47
  70. data/vendor/eigen/Eigen/src/Core/ArithmeticSequence.h +413 -0
  71. data/vendor/eigen/Eigen/src/Core/Array.h +99 -11
  72. data/vendor/eigen/Eigen/src/Core/ArrayBase.h +3 -3
  73. data/vendor/eigen/Eigen/src/Core/ArrayWrapper.h +21 -21
  74. data/vendor/eigen/Eigen/src/Core/Assign.h +1 -1
  75. data/vendor/eigen/Eigen/src/Core/AssignEvaluator.h +125 -50
  76. data/vendor/eigen/Eigen/src/Core/Assign_MKL.h +10 -10
  77. data/vendor/eigen/Eigen/src/Core/BandMatrix.h +16 -16
  78. data/vendor/eigen/Eigen/src/Core/Block.h +56 -60
  79. data/vendor/eigen/Eigen/src/Core/BooleanRedux.h +29 -31
  80. data/vendor/eigen/Eigen/src/Core/CommaInitializer.h +7 -3
  81. data/vendor/eigen/Eigen/src/Core/CoreEvaluators.h +325 -272
  82. data/vendor/eigen/Eigen/src/Core/CoreIterators.h +5 -0
  83. data/vendor/eigen/Eigen/src/Core/CwiseBinaryOp.h +21 -22
  84. data/vendor/eigen/Eigen/src/Core/CwiseNullaryOp.h +153 -18
  85. data/vendor/eigen/Eigen/src/Core/CwiseUnaryOp.h +6 -6
  86. data/vendor/eigen/Eigen/src/Core/CwiseUnaryView.h +14 -10
  87. data/vendor/eigen/Eigen/src/Core/DenseBase.h +132 -42
  88. data/vendor/eigen/Eigen/src/Core/DenseCoeffsBase.h +25 -21
  89. data/vendor/eigen/Eigen/src/Core/DenseStorage.h +153 -71
  90. data/vendor/eigen/Eigen/src/Core/Diagonal.h +21 -23
  91. data/vendor/eigen/Eigen/src/Core/DiagonalMatrix.h +50 -2
  92. data/vendor/eigen/Eigen/src/Core/DiagonalProduct.h +1 -1
  93. data/vendor/eigen/Eigen/src/Core/Dot.h +10 -10
  94. data/vendor/eigen/Eigen/src/Core/EigenBase.h +10 -9
  95. data/vendor/eigen/Eigen/src/Core/ForceAlignedAccess.h +8 -4
  96. data/vendor/eigen/Eigen/src/Core/Fuzzy.h +3 -3
  97. data/vendor/eigen/Eigen/src/Core/GeneralProduct.h +20 -10
  98. data/vendor/eigen/Eigen/src/Core/GenericPacketMath.h +599 -152
  99. data/vendor/eigen/Eigen/src/Core/GlobalFunctions.h +40 -33
  100. data/vendor/eigen/Eigen/src/Core/IO.h +40 -7
  101. data/vendor/eigen/Eigen/src/Core/IndexedView.h +237 -0
  102. data/vendor/eigen/Eigen/src/Core/Inverse.h +9 -10
  103. data/vendor/eigen/Eigen/src/Core/Map.h +7 -7
  104. data/vendor/eigen/Eigen/src/Core/MapBase.h +10 -3
  105. data/vendor/eigen/Eigen/src/Core/MathFunctions.h +767 -125
  106. data/vendor/eigen/Eigen/src/Core/MathFunctionsImpl.h +118 -19
  107. data/vendor/eigen/Eigen/src/Core/Matrix.h +131 -25
  108. data/vendor/eigen/Eigen/src/Core/MatrixBase.h +21 -3
  109. data/vendor/eigen/Eigen/src/Core/NestByValue.h +25 -50
  110. data/vendor/eigen/Eigen/src/Core/NoAlias.h +4 -3
  111. data/vendor/eigen/Eigen/src/Core/NumTraits.h +107 -20
  112. data/vendor/eigen/Eigen/src/Core/PartialReduxEvaluator.h +232 -0
  113. data/vendor/eigen/Eigen/src/Core/PermutationMatrix.h +3 -31
  114. data/vendor/eigen/Eigen/src/Core/PlainObjectBase.h +152 -59
  115. data/vendor/eigen/Eigen/src/Core/Product.h +30 -25
  116. data/vendor/eigen/Eigen/src/Core/ProductEvaluators.h +192 -125
  117. data/vendor/eigen/Eigen/src/Core/Random.h +37 -1
  118. data/vendor/eigen/Eigen/src/Core/Redux.h +180 -170
  119. data/vendor/eigen/Eigen/src/Core/Ref.h +121 -23
  120. data/vendor/eigen/Eigen/src/Core/Replicate.h +8 -8
  121. data/vendor/eigen/Eigen/src/Core/Reshaped.h +454 -0
  122. data/vendor/eigen/Eigen/src/Core/ReturnByValue.h +7 -5
  123. data/vendor/eigen/Eigen/src/Core/Reverse.h +18 -12
  124. data/vendor/eigen/Eigen/src/Core/Select.h +8 -6
  125. data/vendor/eigen/Eigen/src/Core/SelfAdjointView.h +33 -20
  126. data/vendor/eigen/Eigen/src/Core/Solve.h +14 -14
  127. data/vendor/eigen/Eigen/src/Core/SolveTriangular.h +16 -16
  128. data/vendor/eigen/Eigen/src/Core/SolverBase.h +41 -3
  129. data/vendor/eigen/Eigen/src/Core/StableNorm.h +100 -70
  130. data/vendor/eigen/Eigen/src/Core/StlIterators.h +463 -0
  131. data/vendor/eigen/Eigen/src/Core/Stride.h +9 -4
  132. data/vendor/eigen/Eigen/src/Core/Swap.h +5 -4
  133. data/vendor/eigen/Eigen/src/Core/Transpose.h +88 -27
  134. data/vendor/eigen/Eigen/src/Core/Transpositions.h +26 -47
  135. data/vendor/eigen/Eigen/src/Core/TriangularMatrix.h +93 -75
  136. data/vendor/eigen/Eigen/src/Core/VectorBlock.h +5 -5
  137. data/vendor/eigen/Eigen/src/Core/VectorwiseOp.h +159 -70
  138. data/vendor/eigen/Eigen/src/Core/Visitor.h +137 -29
  139. data/vendor/eigen/Eigen/src/Core/arch/AVX/Complex.h +50 -129
  140. data/vendor/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +126 -337
  141. data/vendor/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +1092 -155
  142. data/vendor/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +65 -1
  143. data/vendor/eigen/Eigen/src/Core/arch/AVX512/Complex.h +422 -0
  144. data/vendor/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +207 -236
  145. data/vendor/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1482 -495
  146. data/vendor/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +89 -0
  147. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +152 -165
  148. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +19 -251
  149. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2937 -0
  150. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +221 -0
  151. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +629 -0
  152. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2042 -392
  153. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Complex.h +235 -80
  154. data/vendor/eigen/Eigen/src/Core/arch/Default/BFloat16.h +700 -0
  155. data/vendor/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +102 -14
  156. data/vendor/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1649 -0
  157. data/vendor/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +110 -0
  158. data/vendor/eigen/Eigen/src/Core/arch/Default/Half.h +942 -0
  159. data/vendor/eigen/Eigen/src/Core/arch/Default/Settings.h +1 -1
  160. data/vendor/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +120 -0
  161. data/vendor/eigen/Eigen/src/Core/arch/{CUDA → GPU}/MathFunctions.h +16 -4
  162. data/vendor/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1685 -0
  163. data/vendor/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +80 -0
  164. data/vendor/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  165. data/vendor/eigen/Eigen/src/Core/arch/MSA/Complex.h +648 -0
  166. data/vendor/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +387 -0
  167. data/vendor/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1233 -0
  168. data/vendor/eigen/Eigen/src/Core/arch/NEON/Complex.h +313 -219
  169. data/vendor/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +183 -0
  170. data/vendor/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +54 -70
  171. data/vendor/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4376 -549
  172. data/vendor/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1419 -0
  173. data/vendor/eigen/Eigen/src/Core/arch/SSE/Complex.h +59 -179
  174. data/vendor/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +65 -428
  175. data/vendor/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +893 -283
  176. data/vendor/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +65 -0
  177. data/vendor/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +44 -0
  178. data/vendor/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +752 -0
  179. data/vendor/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +49 -0
  180. data/vendor/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +232 -0
  181. data/vendor/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +301 -0
  182. data/vendor/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +670 -0
  183. data/vendor/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +694 -0
  184. data/vendor/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +85 -0
  185. data/vendor/eigen/Eigen/src/Core/arch/ZVector/Complex.h +212 -183
  186. data/vendor/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +101 -5
  187. data/vendor/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +510 -395
  188. data/vendor/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +11 -2
  189. data/vendor/eigen/Eigen/src/Core/functors/BinaryFunctors.h +112 -46
  190. data/vendor/eigen/Eigen/src/Core/functors/NullaryFunctors.h +31 -30
  191. data/vendor/eigen/Eigen/src/Core/functors/StlFunctors.h +32 -2
  192. data/vendor/eigen/Eigen/src/Core/functors/UnaryFunctors.h +355 -16
  193. data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1075 -586
  194. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +49 -24
  195. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +41 -35
  196. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +6 -6
  197. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +4 -2
  198. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +382 -483
  199. data/vendor/eigen/Eigen/src/Core/products/Parallelizer.h +22 -5
  200. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +53 -30
  201. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +16 -8
  202. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +8 -6
  203. data/vendor/eigen/Eigen/src/Core/products/SelfadjointProduct.h +4 -4
  204. data/vendor/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +5 -4
  205. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +33 -27
  206. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +14 -12
  207. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +36 -34
  208. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +8 -4
  209. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverVector.h +13 -10
  210. data/vendor/eigen/Eigen/src/Core/util/BlasUtil.h +304 -119
  211. data/vendor/eigen/Eigen/src/Core/util/ConfigureVectorization.h +512 -0
  212. data/vendor/eigen/Eigen/src/Core/util/Constants.h +25 -9
  213. data/vendor/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +26 -3
  214. data/vendor/eigen/Eigen/src/Core/util/ForwardDeclarations.h +29 -9
  215. data/vendor/eigen/Eigen/src/Core/util/IndexedViewHelper.h +186 -0
  216. data/vendor/eigen/Eigen/src/Core/util/IntegralConstant.h +272 -0
  217. data/vendor/eigen/Eigen/src/Core/util/MKL_support.h +8 -1
  218. data/vendor/eigen/Eigen/src/Core/util/Macros.h +709 -246
  219. data/vendor/eigen/Eigen/src/Core/util/Memory.h +222 -52
  220. data/vendor/eigen/Eigen/src/Core/util/Meta.h +355 -77
  221. data/vendor/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +5 -1
  222. data/vendor/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  223. data/vendor/eigen/Eigen/src/Core/util/StaticAssert.h +8 -5
  224. data/vendor/eigen/Eigen/src/Core/util/SymbolicIndex.h +293 -0
  225. data/vendor/eigen/Eigen/src/Core/util/XprHelper.h +65 -30
  226. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +1 -1
  227. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +7 -4
  228. data/vendor/eigen/Eigen/src/Eigenvalues/EigenSolver.h +2 -2
  229. data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +1 -1
  230. data/vendor/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +2 -2
  231. data/vendor/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +2 -2
  232. data/vendor/eigen/Eigen/src/Eigenvalues/RealQZ.h +9 -6
  233. data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur.h +21 -9
  234. data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +77 -43
  235. data/vendor/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +20 -15
  236. data/vendor/eigen/Eigen/src/Geometry/AlignedBox.h +99 -5
  237. data/vendor/eigen/Eigen/src/Geometry/AngleAxis.h +4 -4
  238. data/vendor/eigen/Eigen/src/Geometry/EulerAngles.h +3 -3
  239. data/vendor/eigen/Eigen/src/Geometry/Homogeneous.h +15 -11
  240. data/vendor/eigen/Eigen/src/Geometry/Hyperplane.h +1 -1
  241. data/vendor/eigen/Eigen/src/Geometry/OrthoMethods.h +3 -2
  242. data/vendor/eigen/Eigen/src/Geometry/ParametrizedLine.h +39 -2
  243. data/vendor/eigen/Eigen/src/Geometry/Quaternion.h +70 -14
  244. data/vendor/eigen/Eigen/src/Geometry/Rotation2D.h +3 -3
  245. data/vendor/eigen/Eigen/src/Geometry/Scaling.h +23 -5
  246. data/vendor/eigen/Eigen/src/Geometry/Transform.h +88 -67
  247. data/vendor/eigen/Eigen/src/Geometry/Translation.h +6 -12
  248. data/vendor/eigen/Eigen/src/Geometry/Umeyama.h +1 -1
  249. data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +168 -0
  250. data/vendor/eigen/Eigen/src/Householder/BlockHouseholder.h +9 -2
  251. data/vendor/eigen/Eigen/src/Householder/Householder.h +8 -4
  252. data/vendor/eigen/Eigen/src/Householder/HouseholderSequence.h +123 -48
  253. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +15 -15
  254. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +7 -23
  255. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +5 -22
  256. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +41 -47
  257. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +51 -60
  258. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +70 -20
  259. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +2 -20
  260. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +11 -9
  261. data/vendor/eigen/Eigen/src/Jacobi/Jacobi.h +31 -10
  262. data/vendor/eigen/Eigen/src/KLUSupport/KLUSupport.h +358 -0
  263. data/vendor/eigen/Eigen/src/LU/Determinant.h +35 -19
  264. data/vendor/eigen/Eigen/src/LU/FullPivLU.h +29 -43
  265. data/vendor/eigen/Eigen/src/LU/InverseImpl.h +25 -8
  266. data/vendor/eigen/Eigen/src/LU/PartialPivLU.h +71 -58
  267. data/vendor/eigen/Eigen/src/LU/arch/InverseSize4.h +351 -0
  268. data/vendor/eigen/Eigen/src/OrderingMethods/Amd.h +7 -17
  269. data/vendor/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +297 -277
  270. data/vendor/eigen/Eigen/src/OrderingMethods/Ordering.h +6 -10
  271. data/vendor/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +1 -1
  272. data/vendor/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +9 -7
  273. data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR.h +41 -20
  274. data/vendor/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +100 -27
  275. data/vendor/eigen/Eigen/src/QR/FullPivHouseholderQR.h +59 -22
  276. data/vendor/eigen/Eigen/src/QR/HouseholderQR.h +48 -23
  277. data/vendor/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +25 -3
  278. data/vendor/eigen/Eigen/src/SVD/BDCSVD.h +183 -63
  279. data/vendor/eigen/Eigen/src/SVD/JacobiSVD.h +22 -14
  280. data/vendor/eigen/Eigen/src/SVD/SVDBase.h +83 -22
  281. data/vendor/eigen/Eigen/src/SVD/UpperBidiagonalization.h +3 -3
  282. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +17 -9
  283. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +12 -37
  284. data/vendor/eigen/Eigen/src/SparseCore/AmbiVector.h +3 -2
  285. data/vendor/eigen/Eigen/src/SparseCore/CompressedStorage.h +16 -0
  286. data/vendor/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +6 -6
  287. data/vendor/eigen/Eigen/src/SparseCore/SparseAssign.h +81 -27
  288. data/vendor/eigen/Eigen/src/SparseCore/SparseBlock.h +25 -57
  289. data/vendor/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +40 -11
  290. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +11 -15
  291. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +4 -2
  292. data/vendor/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +30 -8
  293. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrix.h +126 -11
  294. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +5 -12
  295. data/vendor/eigen/Eigen/src/SparseCore/SparseProduct.h +13 -1
  296. data/vendor/eigen/Eigen/src/SparseCore/SparseRef.h +7 -7
  297. data/vendor/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +5 -2
  298. data/vendor/eigen/Eigen/src/SparseCore/SparseUtil.h +8 -0
  299. data/vendor/eigen/Eigen/src/SparseCore/SparseVector.h +1 -1
  300. data/vendor/eigen/Eigen/src/SparseCore/SparseView.h +1 -0
  301. data/vendor/eigen/Eigen/src/SparseLU/SparseLU.h +162 -12
  302. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +1 -1
  303. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +76 -2
  304. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +2 -2
  305. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +1 -1
  306. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +1 -1
  307. data/vendor/eigen/Eigen/src/SparseQR/SparseQR.h +19 -6
  308. data/vendor/eigen/Eigen/src/StlSupport/StdDeque.h +2 -12
  309. data/vendor/eigen/Eigen/src/StlSupport/StdList.h +2 -2
  310. data/vendor/eigen/Eigen/src/StlSupport/StdVector.h +2 -2
  311. data/vendor/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +6 -8
  312. data/vendor/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +175 -39
  313. data/vendor/eigen/Eigen/src/misc/lapacke.h +5 -4
  314. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +28 -2
  315. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +155 -11
  316. data/vendor/eigen/Eigen/src/plugins/BlockMethods.h +626 -242
  317. data/vendor/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +14 -0
  318. data/vendor/eigen/Eigen/src/plugins/IndexedViewMethods.h +262 -0
  319. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +4 -4
  320. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +10 -0
  321. data/vendor/eigen/Eigen/src/plugins/ReshapedMethods.h +149 -0
  322. data/vendor/eigen/README.md +2 -0
  323. data/vendor/eigen/bench/btl/README +1 -1
  324. data/vendor/eigen/bench/tensors/README +6 -7
  325. data/vendor/eigen/ci/README.md +56 -0
  326. data/vendor/eigen/demos/mix_eigen_and_c/README +1 -1
  327. data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +213 -158
  328. data/vendor/eigen/unsupported/README.txt +1 -1
  329. data/vendor/tomotopy/README.kr.rst +78 -0
  330. data/vendor/tomotopy/README.rst +75 -0
  331. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +2 -2
  332. data/vendor/tomotopy/src/Labeling/Phraser.hpp +4 -4
  333. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +7 -3
  334. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +7 -3
  335. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +6 -3
  336. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +2 -2
  337. data/vendor/tomotopy/src/TopicModel/HDP.h +1 -0
  338. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +57 -6
  339. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +6 -3
  340. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +3 -2
  341. data/vendor/tomotopy/src/TopicModel/LDA.h +3 -3
  342. data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +5 -5
  343. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +50 -19
  344. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +6 -2
  345. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +3 -2
  346. data/vendor/tomotopy/src/TopicModel/PAModel.hpp +1 -1
  347. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +6 -2
  348. data/vendor/tomotopy/src/TopicModel/PT.h +3 -1
  349. data/vendor/tomotopy/src/TopicModel/PTModel.hpp +36 -3
  350. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +6 -3
  351. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +55 -26
  352. data/vendor/tomotopy/src/Utils/AliasMethod.hpp +5 -4
  353. data/vendor/tomotopy/src/Utils/Dictionary.h +2 -2
  354. data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +36 -1
  355. data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +1 -1
  356. data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +1 -1
  357. data/vendor/tomotopy/src/Utils/exception.h +6 -0
  358. data/vendor/tomotopy/src/Utils/math.h +2 -2
  359. data/vendor/tomotopy/src/Utils/sample.hpp +14 -12
  360. data/vendor/tomotopy/src/Utils/serializer.hpp +30 -5
  361. data/vendor/tomotopy/src/Utils/sse_gamma.h +0 -3
  362. metadata +64 -18
  363. data/vendor/eigen/Eigen/CMakeLists.txt +0 -19
  364. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -674
  365. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
  366. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
  367. data/vendor/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
  368. data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
  369. data/vendor/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
@@ -15,7 +15,13 @@ namespace Eigen {
15
15
 
16
16
  namespace internal {
17
17
 
18
- template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
18
+ enum GEBPPacketSizeType {
19
+ GEBPPacketFull = 0,
20
+ GEBPPacketHalf,
21
+ GEBPPacketQuarter
22
+ };
23
+
24
+ template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target, int _PacketSize=GEBPPacketFull>
19
25
  class gebp_traits;
20
26
 
21
27
 
@@ -25,16 +31,42 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff
25
31
  return a<=0 ? b : a;
26
32
  }
27
33
 
34
+ #if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
35
+ #define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
36
+ #else
37
+ #define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
38
+ #endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
39
+
40
+ #if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
41
+ #define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
42
+ #else
43
+ #define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
44
+ #endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
45
+
46
+ #if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
47
+ #define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
48
+ #else
49
+ #define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
50
+ #endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
51
+
28
52
  #if EIGEN_ARCH_i386_OR_x86_64
29
- const std::ptrdiff_t defaultL1CacheSize = 32*1024;
30
- const std::ptrdiff_t defaultL2CacheSize = 256*1024;
31
- const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
53
+ const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024);
54
+ const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024);
55
+ const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024);
56
+ #elif EIGEN_ARCH_PPC
57
+ const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024);
58
+ const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
59
+ const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024);
32
60
  #else
33
- const std::ptrdiff_t defaultL1CacheSize = 16*1024;
34
- const std::ptrdiff_t defaultL2CacheSize = 512*1024;
35
- const std::ptrdiff_t defaultL3CacheSize = 512*1024;
61
+ const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024);
62
+ const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
63
+ const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024);
36
64
  #endif
37
65
 
66
+ #undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
67
+ #undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
68
+ #undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
69
+
38
70
  /** \internal */
39
71
  struct CacheSizes {
40
72
  CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
@@ -50,7 +82,6 @@ struct CacheSizes {
50
82
  std::ptrdiff_t m_l3;
51
83
  };
52
84
 
53
-
54
85
  /** \internal */
55
86
  inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
56
87
  {
@@ -101,6 +132,16 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
101
132
  // at the register level. This small horizontal panel has to stay within L1 cache.
102
133
  std::ptrdiff_t l1, l2, l3;
103
134
  manage_caching_sizes(GetAction, &l1, &l2, &l3);
135
+ #ifdef EIGEN_VECTORIZE_AVX512
136
+ // We need to find a rationale for that, but without this adjustment,
137
+ // performance with AVX512 is pretty bad, like -20% slower.
138
+ // One reason is that with increasing packet-size, the blocking size k
139
+ // has to become pretty small if we want that 1 lhs panel fit within L1.
140
+ // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are:
141
+ // k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144.
142
+ // This is quite small for a good reuse of the accumulation registers.
143
+ l1 *= 4;
144
+ #endif
104
145
 
105
146
  if (num_threads > 1) {
106
147
  typedef typename Traits::ResScalar ResScalar;
@@ -115,7 +156,8 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
115
156
  // registers. However once the latency is hidden there is no point in
116
157
  // increasing the value of k, so we'll cap it at 320 (value determined
117
158
  // experimentally).
118
- const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
159
+ // To avoid that k vanishes, we make k_cache at least as big as kr
160
+ const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
119
161
  if (k_cache < k) {
120
162
  k = k_cache - (k_cache % kr);
121
163
  eigen_internal_assert(k > 0);
@@ -307,35 +349,60 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_
307
349
  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
308
350
  }
309
351
 
310
- #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
311
- #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C);
312
- #else
313
-
314
- // FIXME (a bit overkill maybe ?)
315
-
316
- template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
317
- EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
318
- {
319
- c = cj.pmadd(a,b,c);
320
- }
321
- };
322
-
323
- template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
324
- EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
325
- {
326
- t = b; t = cj.pmul(a,t); c = padd(c,t);
327
- }
328
- };
352
+ template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
353
+ struct RhsPanelHelper {
354
+ private:
355
+ static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken;
356
+ public:
357
+ typedef typename conditional<remaining_registers>=4, RhsPacketx4, RhsPacket>::type type;
358
+ };
329
359
 
330
- template<typename CJ, typename A, typename B, typename C, typename T>
331
- EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
332
- {
333
- gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
334
- }
360
+ template <typename Packet>
361
+ struct QuadPacket
362
+ {
363
+ Packet B_0, B1, B2, B3;
364
+ const Packet& get(const FixedInt<0>&) const { return B_0; }
365
+ const Packet& get(const FixedInt<1>&) const { return B1; }
366
+ const Packet& get(const FixedInt<2>&) const { return B2; }
367
+ const Packet& get(const FixedInt<3>&) const { return B3; }
368
+ };
335
369
 
336
- #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T);
337
- // #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T);
338
- #endif
370
+ template <int N, typename T1, typename T2, typename T3>
371
+ struct packet_conditional { typedef T3 type; };
372
+
373
+ template <typename T1, typename T2, typename T3>
374
+ struct packet_conditional<GEBPPacketFull, T1, T2, T3> { typedef T1 type; };
375
+
376
+ template <typename T1, typename T2, typename T3>
377
+ struct packet_conditional<GEBPPacketHalf, T1, T2, T3> { typedef T2 type; };
378
+
379
+ #define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \
380
+ typedef typename packet_conditional<packet_size, \
381
+ typename packet_traits<name ## Scalar>::type, \
382
+ typename packet_traits<name ## Scalar>::half, \
383
+ typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
384
+ prefix ## name ## Packet
385
+
386
+ #define PACKET_DECL_COND(name, packet_size) \
387
+ typedef typename packet_conditional<packet_size, \
388
+ typename packet_traits<name ## Scalar>::type, \
389
+ typename packet_traits<name ## Scalar>::half, \
390
+ typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
391
+ name ## Packet
392
+
393
+ #define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \
394
+ typedef typename packet_conditional<packet_size, \
395
+ typename packet_traits<Scalar>::type, \
396
+ typename packet_traits<Scalar>::half, \
397
+ typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
398
+ prefix ## ScalarPacket
399
+
400
+ #define PACKET_DECL_COND_SCALAR(packet_size) \
401
+ typedef typename packet_conditional<packet_size, \
402
+ typename packet_traits<Scalar>::type, \
403
+ typename packet_traits<Scalar>::half, \
404
+ typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
405
+ ScalarPacket
339
406
 
340
407
  /* Vectorization logic
341
408
  * real*real: unpack rhs to constant packets, ...
@@ -347,7 +414,7 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_
347
414
  * cplx*real : unpack rhs to constant packets, ...
348
415
  * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
349
416
  */
350
- template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
417
+ template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
351
418
  class gebp_traits
352
419
  {
353
420
  public:
@@ -355,13 +422,17 @@ public:
355
422
  typedef _RhsScalar RhsScalar;
356
423
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
357
424
 
425
+ PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
426
+ PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
427
+ PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
428
+
358
429
  enum {
359
430
  ConjLhs = _ConjLhs,
360
431
  ConjRhs = _ConjRhs,
361
- Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
362
- LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
363
- RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
364
- ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
432
+ Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
433
+ LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
434
+ RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
435
+ ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
365
436
 
366
437
  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
367
438
 
@@ -370,10 +441,12 @@ public:
370
441
 
371
442
  // register block size along the M direction (currently, this one cannot be modified)
372
443
  default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
373
- #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
374
- // we assume 16 registers
444
+ #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \
445
+ && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914))
446
+ // we assume 16 registers or more
375
447
  // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
376
448
  // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
449
+ // Bug 1515: MSVC prior to v19.14 leads to register spilling.
377
450
  mr = Vectorizable ? 3*LhsPacketSize : default_mr,
378
451
  #else
379
452
  mr = default_mr,
@@ -383,37 +456,41 @@ public:
383
456
  RhsProgress = 1
384
457
  };
385
458
 
386
- typedef typename packet_traits<LhsScalar>::type _LhsPacket;
387
- typedef typename packet_traits<RhsScalar>::type _RhsPacket;
388
- typedef typename packet_traits<ResScalar>::type _ResPacket;
389
459
 
390
460
  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
391
461
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
392
462
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
463
+ typedef LhsPacket LhsPacket4Packing;
393
464
 
465
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
394
466
  typedef ResPacket AccPacket;
395
467
 
396
468
  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
397
469
  {
398
470
  p = pset1<ResPacket>(ResScalar(0));
399
471
  }
400
-
401
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
402
- {
403
- pbroadcast4(b, b0, b1, b2, b3);
404
- }
405
-
406
- // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
407
- // {
408
- // pbroadcast2(b, b0, b1);
409
- // }
410
-
472
+
411
473
  template<typename RhsPacketType>
412
474
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
413
475
  {
414
476
  dest = pset1<RhsPacketType>(*b);
415
477
  }
416
-
478
+
479
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
480
+ {
481
+ pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
482
+ }
483
+
484
+ template<typename RhsPacketType>
485
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
486
+ {
487
+ loadRhs(b, dest);
488
+ }
489
+
490
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
491
+ {
492
+ }
493
+
417
494
  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
418
495
  {
419
496
  dest = ploadquad<RhsPacket>(b);
@@ -431,8 +508,8 @@ public:
431
508
  dest = ploadu<LhsPacketType>(a);
432
509
  }
433
510
 
434
- template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
435
- EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
511
+ template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
512
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
436
513
  {
437
514
  conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
438
515
  // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
@@ -447,6 +524,12 @@ public:
447
524
  #endif
448
525
  }
449
526
 
527
+ template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
528
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
529
+ {
530
+ madd(a, b.get(lane), c, tmp, lane);
531
+ }
532
+
450
533
  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
451
534
  {
452
535
  r = pmadd(c,alpha,r);
@@ -460,21 +543,25 @@ public:
460
543
 
461
544
  };
462
545
 
463
- template<typename RealScalar, bool _ConjLhs>
464
- class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
546
+ template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize>
547
+ class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize>
465
548
  {
466
549
  public:
467
550
  typedef std::complex<RealScalar> LhsScalar;
468
551
  typedef RealScalar RhsScalar;
469
552
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
470
553
 
554
+ PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
555
+ PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
556
+ PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
557
+
471
558
  enum {
472
559
  ConjLhs = _ConjLhs,
473
560
  ConjRhs = false,
474
- Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
475
- LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
476
- RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
477
- ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
561
+ Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
562
+ LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
563
+ RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
564
+ ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
478
565
 
479
566
  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
480
567
  nr = 4,
@@ -489,13 +576,12 @@ public:
489
576
  RhsProgress = 1
490
577
  };
491
578
 
492
- typedef typename packet_traits<LhsScalar>::type _LhsPacket;
493
- typedef typename packet_traits<RhsScalar>::type _RhsPacket;
494
- typedef typename packet_traits<ResScalar>::type _ResPacket;
495
-
496
579
  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
497
580
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
498
581
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
582
+ typedef LhsPacket LhsPacket4Packing;
583
+
584
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
499
585
 
500
586
  typedef ResPacket AccPacket;
501
587
 
@@ -504,42 +590,64 @@ public:
504
590
  p = pset1<ResPacket>(ResScalar(0));
505
591
  }
506
592
 
507
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
593
+ template<typename RhsPacketType>
594
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
508
595
  {
509
- dest = pset1<RhsPacket>(*b);
596
+ dest = pset1<RhsPacketType>(*b);
597
+ }
598
+
599
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
600
+ {
601
+ pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
510
602
  }
603
+
604
+ template<typename RhsPacketType>
605
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
606
+ {
607
+ loadRhs(b, dest);
608
+ }
609
+
610
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
611
+ {}
511
612
 
512
613
  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
513
614
  {
514
- dest = pset1<RhsPacket>(*b);
615
+ loadRhsQuad_impl(b,dest, typename conditional<RhsPacketSize==16,true_type,false_type>::type());
515
616
  }
516
617
 
517
- EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
618
+ EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const
518
619
  {
519
- dest = pload<LhsPacket>(a);
620
+ // FIXME we can do better!
621
+ // what we want here is a ploadeight
622
+ RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
623
+ dest = ploadquad<RhsPacket>(tmp);
520
624
  }
521
625
 
522
- EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
626
+ EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const
523
627
  {
524
- dest = ploadu<LhsPacket>(a);
628
+ eigen_internal_assert(RhsPacketSize<=8);
629
+ dest = pset1<RhsPacket>(*b);
525
630
  }
526
631
 
527
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
632
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
528
633
  {
529
- pbroadcast4(b, b0, b1, b2, b3);
634
+ dest = pload<LhsPacket>(a);
530
635
  }
531
-
532
- // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
533
- // {
534
- // pbroadcast2(b, b0, b1);
535
- // }
536
636
 
537
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
637
+ template<typename LhsPacketType>
638
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
639
+ {
640
+ dest = ploadu<LhsPacketType>(a);
641
+ }
642
+
643
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
644
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
538
645
  {
539
646
  madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
540
647
  }
541
648
 
542
- EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
649
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
650
+ EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
543
651
  {
544
652
  #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
545
653
  EIGEN_UNUSED_VARIABLE(tmp);
@@ -554,13 +662,20 @@ public:
554
662
  c += a * b;
555
663
  }
556
664
 
557
- EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
665
+ template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
666
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
558
667
  {
668
+ madd(a, b.get(lane), c, tmp, lane);
669
+ }
670
+
671
+ template <typename ResPacketType, typename AccPacketType>
672
+ EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
673
+ {
674
+ conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
559
675
  r = cj.pmadd(c,alpha,r);
560
676
  }
561
677
 
562
678
  protected:
563
- conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
564
679
  };
565
680
 
566
681
  template<typename Packet>
@@ -579,13 +694,57 @@ DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Pack
579
694
  return res;
580
695
  }
581
696
 
697
+ // note that for DoublePacket<RealPacket> the "4" in "downto4"
698
+ // corresponds to the number of complexes, so it means "8"
699
+ // in terms of real coefficients.
700
+
582
701
  template<typename Packet>
583
- const DoublePacket<Packet>& predux_downto4(const DoublePacket<Packet> &a)
702
+ const DoublePacket<Packet>&
703
+ predux_half_dowto4(const DoublePacket<Packet> &a,
704
+ typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0)
584
705
  {
585
706
  return a;
586
707
  }
587
708
 
588
- template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; };
709
+ template<typename Packet>
710
+ DoublePacket<typename unpacket_traits<Packet>::half>
711
+ predux_half_dowto4(const DoublePacket<Packet> &a,
712
+ typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0)
713
+ {
714
+ // yes, that's pretty hackish :(
715
+ DoublePacket<typename unpacket_traits<Packet>::half> res;
716
+ typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
717
+ typedef typename packet_traits<Cplx>::type CplxPacket;
718
+ res.first = predux_half_dowto4(CplxPacket(a.first)).v;
719
+ res.second = predux_half_dowto4(CplxPacket(a.second)).v;
720
+ return res;
721
+ }
722
+
723
+ // same here, "quad" actually means "8" in terms of real coefficients
724
+ template<typename Scalar, typename RealPacket>
725
+ void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
726
+ typename enable_if<unpacket_traits<RealPacket>::size<=8>::type* = 0)
727
+ {
728
+ dest.first = pset1<RealPacket>(numext::real(*b));
729
+ dest.second = pset1<RealPacket>(numext::imag(*b));
730
+ }
731
+
732
+ template<typename Scalar, typename RealPacket>
733
+ void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
734
+ typename enable_if<unpacket_traits<RealPacket>::size==16>::type* = 0)
735
+ {
736
+ // yes, that's pretty hackish too :(
737
+ typedef typename NumTraits<Scalar>::Real RealScalar;
738
+ RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
739
+ RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
740
+ dest.first = ploadquad<RealPacket>(r);
741
+ dest.second = ploadquad<RealPacket>(i);
742
+ }
743
+
744
+
745
+ template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > {
746
+ typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
747
+ };
589
748
  // template<typename Packet>
590
749
  // DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
591
750
  // {
@@ -595,8 +754,8 @@ template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typede
595
754
  // return res;
596
755
  // }
597
756
 
598
- template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
599
- class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
757
+ template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
758
+ class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize >
600
759
  {
601
760
  public:
602
761
  typedef std::complex<RealScalar> Scalar;
@@ -604,15 +763,21 @@ public:
604
763
  typedef std::complex<RealScalar> RhsScalar;
605
764
  typedef std::complex<RealScalar> ResScalar;
606
765
 
766
+ PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
767
+ PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
768
+ PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
769
+ PACKET_DECL_COND(Real, _PacketSize);
770
+ PACKET_DECL_COND_SCALAR(_PacketSize);
771
+
607
772
  enum {
608
773
  ConjLhs = _ConjLhs,
609
774
  ConjRhs = _ConjRhs,
610
- Vectorizable = packet_traits<RealScalar>::Vectorizable
611
- && packet_traits<Scalar>::Vectorizable,
612
- RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1,
613
- ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
614
- LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
615
- RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
775
+ Vectorizable = unpacket_traits<RealPacket>::vectorizable
776
+ && unpacket_traits<ScalarPacket>::vectorizable,
777
+ ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
778
+ LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
779
+ RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
780
+ RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
616
781
 
617
782
  // FIXME: should depend on NumberOfRegisters
618
783
  nr = 4,
@@ -622,14 +787,16 @@ public:
622
787
  RhsProgress = 1
623
788
  };
624
789
 
625
- typedef typename packet_traits<RealScalar>::type RealPacket;
626
- typedef typename packet_traits<Scalar>::type ScalarPacket;
627
- typedef DoublePacket<RealPacket> DoublePacketType;
790
+ typedef DoublePacket<RealPacket> DoublePacketType;
628
791
 
792
+ typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type LhsPacket4Packing;
629
793
  typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket;
630
794
  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
631
795
  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
632
796
  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
797
+
798
+ // this actually holds 8 packets!
799
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
633
800
 
634
801
  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
635
802
 
@@ -640,51 +807,49 @@ public:
640
807
  }
641
808
 
642
809
  // Scalar path
643
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
810
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const
644
811
  {
645
- dest = pset1<ResPacket>(*b);
812
+ dest = pset1<ScalarPacket>(*b);
646
813
  }
647
814
 
648
815
  // Vectorized path
649
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
816
+ template<typename RealPacketType>
817
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
650
818
  {
651
- dest.first = pset1<RealPacket>(real(*b));
652
- dest.second = pset1<RealPacket>(imag(*b));
819
+ dest.first = pset1<RealPacketType>(numext::real(*b));
820
+ dest.second = pset1<RealPacketType>(numext::imag(*b));
653
821
  }
654
-
655
- EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
822
+
823
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
656
824
  {
657
- loadRhs(b,dest);
825
+ loadRhs(b, dest.B_0);
826
+ loadRhs(b + 1, dest.B1);
827
+ loadRhs(b + 2, dest.B2);
828
+ loadRhs(b + 3, dest.B3);
658
829
  }
659
- EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
830
+
831
+ // Scalar path
832
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const
660
833
  {
661
- eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4);
662
- loadRhs(b,dest);
834
+ loadRhs(b, dest);
663
835
  }
664
-
665
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
836
+
837
+ // Vectorized path
838
+ template<typename RealPacketType>
839
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
666
840
  {
667
- // FIXME not sure that's the best way to implement it!
668
- loadRhs(b+0, b0);
669
- loadRhs(b+1, b1);
670
- loadRhs(b+2, b2);
671
- loadRhs(b+3, b3);
841
+ loadRhs(b, dest);
672
842
  }
843
+
844
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
673
845
 
674
- // Vectorized path
675
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
846
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
676
847
  {
677
- // FIXME not sure that's the best way to implement it!
678
- loadRhs(b+0, b0);
679
- loadRhs(b+1, b1);
848
+ loadRhs(b,dest);
680
849
  }
681
-
682
- // Scalar path
683
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
850
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
684
851
  {
685
- // FIXME not sure that's the best way to implement it!
686
- loadRhs(b+0, b0);
687
- loadRhs(b+1, b1);
852
+ loadQuadToDoublePacket(b,dest);
688
853
  }
689
854
 
690
855
  // nothing special here
@@ -693,47 +858,59 @@ public:
693
858
  dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
694
859
  }
695
860
 
696
- EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
861
+ template<typename LhsPacketType>
862
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
697
863
  {
698
- dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
864
+ dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
699
865
  }
700
866
 
701
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const
867
+ template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType>
868
+ EIGEN_STRONG_INLINE
869
+ typename enable_if<!is_same<RhsPacketType,RhsPacketx4>::value>::type
870
+ madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const
702
871
  {
703
872
  c.first = padd(pmul(a,b.first), c.first);
704
873
  c.second = padd(pmul(a,b.second),c.second);
705
874
  }
706
875
 
707
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
876
+ template<typename LaneIdType>
877
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const
708
878
  {
709
879
  c = cj.pmadd(a,b,c);
710
880
  }
881
+
882
+ template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
883
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
884
+ {
885
+ madd(a, b.get(lane), c, tmp, lane);
886
+ }
711
887
 
712
888
  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
713
889
 
714
- EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
890
+ template<typename RealPacketType, typename ResPacketType>
891
+ EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha, ResPacketType& r) const
715
892
  {
716
893
  // assemble c
717
- ResPacket tmp;
894
+ ResPacketType tmp;
718
895
  if((!ConjLhs)&&(!ConjRhs))
719
896
  {
720
- tmp = pcplxflip(pconj(ResPacket(c.second)));
721
- tmp = padd(ResPacket(c.first),tmp);
897
+ tmp = pcplxflip(pconj(ResPacketType(c.second)));
898
+ tmp = padd(ResPacketType(c.first),tmp);
722
899
  }
723
900
  else if((!ConjLhs)&&(ConjRhs))
724
901
  {
725
- tmp = pconj(pcplxflip(ResPacket(c.second)));
726
- tmp = padd(ResPacket(c.first),tmp);
902
+ tmp = pconj(pcplxflip(ResPacketType(c.second)));
903
+ tmp = padd(ResPacketType(c.first),tmp);
727
904
  }
728
905
  else if((ConjLhs)&&(!ConjRhs))
729
906
  {
730
- tmp = pcplxflip(ResPacket(c.second));
731
- tmp = padd(pconj(ResPacket(c.first)),tmp);
907
+ tmp = pcplxflip(ResPacketType(c.second));
908
+ tmp = padd(pconj(ResPacketType(c.first)),tmp);
732
909
  }
733
910
  else if((ConjLhs)&&(ConjRhs))
734
911
  {
735
- tmp = pcplxflip(ResPacket(c.second));
736
- tmp = psub(pconj(ResPacket(c.first)),tmp);
912
+ tmp = pcplxflip(ResPacketType(c.second));
913
+ tmp = psub(pconj(ResPacketType(c.first)),tmp);
737
914
  }
738
915
 
739
916
  r = pmadd(tmp,alpha,r);
@@ -743,8 +920,8 @@ protected:
743
920
  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
744
921
  };
745
922
 
746
- template<typename RealScalar, bool _ConjRhs>
747
- class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
923
+ template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize>
924
+ class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize >
748
925
  {
749
926
  public:
750
927
  typedef std::complex<RealScalar> Scalar;
@@ -752,14 +929,25 @@ public:
752
929
  typedef Scalar RhsScalar;
753
930
  typedef Scalar ResScalar;
754
931
 
932
+ PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
933
+ PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
934
+ PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
935
+ PACKET_DECL_COND_PREFIX(_, Real, _PacketSize);
936
+ PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize);
937
+
938
+ #undef PACKET_DECL_COND_SCALAR_PREFIX
939
+ #undef PACKET_DECL_COND_PREFIX
940
+ #undef PACKET_DECL_COND_SCALAR
941
+ #undef PACKET_DECL_COND
942
+
755
943
  enum {
756
944
  ConjLhs = false,
757
945
  ConjRhs = _ConjRhs,
758
- Vectorizable = packet_traits<RealScalar>::Vectorizable
759
- && packet_traits<Scalar>::Vectorizable,
760
- LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
761
- RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
762
- ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
946
+ Vectorizable = unpacket_traits<_RealPacket>::vectorizable
947
+ && unpacket_traits<_ScalarPacket>::vectorizable,
948
+ LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
949
+ RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
950
+ ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
763
951
 
764
952
  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
765
953
  // FIXME: should depend on NumberOfRegisters
@@ -770,14 +958,11 @@ public:
770
958
  RhsProgress = 1
771
959
  };
772
960
 
773
- typedef typename packet_traits<LhsScalar>::type _LhsPacket;
774
- typedef typename packet_traits<RhsScalar>::type _RhsPacket;
775
- typedef typename packet_traits<ResScalar>::type _ResPacket;
776
-
777
961
  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
778
962
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
779
963
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
780
-
964
+ typedef LhsPacket LhsPacket4Packing;
965
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
781
966
  typedef ResPacket AccPacket;
782
967
 
783
968
  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
@@ -785,22 +970,25 @@ public:
785
970
  p = pset1<ResPacket>(ResScalar(0));
786
971
  }
787
972
 
788
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
973
+ template<typename RhsPacketType>
974
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
789
975
  {
790
- dest = pset1<RhsPacket>(*b);
976
+ dest = pset1<RhsPacketType>(*b);
791
977
  }
792
-
793
- void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
978
+
979
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
794
980
  {
795
- pbroadcast4(b, b0, b1, b2, b3);
981
+ pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
796
982
  }
797
-
798
- // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
799
- // {
800
- // // FIXME not sure that's the best way to implement it!
801
- // b0 = pload1<RhsPacket>(b+0);
802
- // b1 = pload1<RhsPacket>(b+1);
803
- // }
983
+
984
+ template<typename RhsPacketType>
985
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
986
+ {
987
+ loadRhs(b, dest);
988
+ }
989
+
990
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
991
+ {}
804
992
 
805
993
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
806
994
  {
@@ -809,21 +997,23 @@ public:
809
997
 
810
998
  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
811
999
  {
812
- eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4);
813
- loadRhs(b,dest);
1000
+ dest = ploadquad<RhsPacket>(b);
814
1001
  }
815
1002
 
816
- EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
1003
+ template<typename LhsPacketType>
1004
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
817
1005
  {
818
- dest = ploaddup<LhsPacket>(a);
1006
+ dest = ploaddup<LhsPacketType>(a);
819
1007
  }
820
1008
 
821
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
1009
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
1010
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
822
1011
  {
823
1012
  madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
824
1013
  }
825
1014
 
826
- EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
1015
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
1016
+ EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
827
1017
  {
828
1018
  #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
829
1019
  EIGEN_UNUSED_VARIABLE(tmp);
@@ -839,16 +1029,24 @@ public:
839
1029
  c += a * b;
840
1030
  }
841
1031
 
842
- EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
1032
+ template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
1033
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
1034
+ {
1035
+ madd(a, b.get(lane), c, tmp, lane);
1036
+ }
1037
+
1038
+ template <typename ResPacketType, typename AccPacketType>
1039
+ EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
843
1040
  {
1041
+ conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
844
1042
  r = cj.pmadd(alpha,c,r);
845
1043
  }
846
1044
 
847
1045
  protected:
848
- conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
1046
+
849
1047
  };
850
1048
 
851
- /* optimized GEneral packed Block * packed Panel product kernel
1049
+ /* optimized General packed Block * packed Panel product kernel
852
1050
  *
853
1051
  * Mixing type logic: C += A * B
854
1052
  * | A | B | comments
@@ -858,26 +1056,47 @@ protected:
858
1056
  template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
859
1057
  struct gebp_kernel
860
1058
  {
861
- typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
1059
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
1060
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketHalf> HalfTraits;
1061
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketQuarter> QuarterTraits;
1062
+
862
1063
  typedef typename Traits::ResScalar ResScalar;
863
1064
  typedef typename Traits::LhsPacket LhsPacket;
864
1065
  typedef typename Traits::RhsPacket RhsPacket;
865
1066
  typedef typename Traits::ResPacket ResPacket;
866
1067
  typedef typename Traits::AccPacket AccPacket;
1068
+ typedef typename Traits::RhsPacketx4 RhsPacketx4;
1069
+
1070
+ typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
1071
+
1072
+ typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
867
1073
 
868
- typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
869
1074
  typedef typename SwappedTraits::ResScalar SResScalar;
870
1075
  typedef typename SwappedTraits::LhsPacket SLhsPacket;
871
1076
  typedef typename SwappedTraits::RhsPacket SRhsPacket;
872
1077
  typedef typename SwappedTraits::ResPacket SResPacket;
873
1078
  typedef typename SwappedTraits::AccPacket SAccPacket;
874
1079
 
1080
+ typedef typename HalfTraits::LhsPacket LhsPacketHalf;
1081
+ typedef typename HalfTraits::RhsPacket RhsPacketHalf;
1082
+ typedef typename HalfTraits::ResPacket ResPacketHalf;
1083
+ typedef typename HalfTraits::AccPacket AccPacketHalf;
1084
+
1085
+ typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
1086
+ typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
1087
+ typedef typename QuarterTraits::ResPacket ResPacketQuarter;
1088
+ typedef typename QuarterTraits::AccPacket AccPacketQuarter;
1089
+
875
1090
  typedef typename DataMapper::LinearMapper LinearMapper;
876
1091
 
877
1092
  enum {
878
1093
  Vectorizable = Traits::Vectorizable,
879
1094
  LhsProgress = Traits::LhsProgress,
1095
+ LhsProgressHalf = HalfTraits::LhsProgress,
1096
+ LhsProgressQuarter = QuarterTraits::LhsProgress,
880
1097
  RhsProgress = Traits::RhsProgress,
1098
+ RhsProgressHalf = HalfTraits::RhsProgress,
1099
+ RhsProgressQuarter = QuarterTraits::RhsProgress,
881
1100
  ResPacketSize = Traits::ResPacketSize
882
1101
  };
883
1102
 
@@ -887,6 +1106,299 @@ struct gebp_kernel
887
1106
  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
888
1107
  };
889
1108
 
1109
+ template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs,
1110
+ int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress>
1111
+ struct last_row_process_16_packets
1112
+ {
1113
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
1114
+ typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
1115
+
1116
+ typedef typename Traits::ResScalar ResScalar;
1117
+ typedef typename SwappedTraits::LhsPacket SLhsPacket;
1118
+ typedef typename SwappedTraits::RhsPacket SRhsPacket;
1119
+ typedef typename SwappedTraits::ResPacket SResPacket;
1120
+ typedef typename SwappedTraits::AccPacket SAccPacket;
1121
+
1122
+ EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
1123
+ const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
1124
+ ResScalar alpha, SAccPacket &C0)
1125
+ {
1126
+ EIGEN_UNUSED_VARIABLE(res);
1127
+ EIGEN_UNUSED_VARIABLE(straits);
1128
+ EIGEN_UNUSED_VARIABLE(blA);
1129
+ EIGEN_UNUSED_VARIABLE(blB);
1130
+ EIGEN_UNUSED_VARIABLE(depth);
1131
+ EIGEN_UNUSED_VARIABLE(endk);
1132
+ EIGEN_UNUSED_VARIABLE(i);
1133
+ EIGEN_UNUSED_VARIABLE(j2);
1134
+ EIGEN_UNUSED_VARIABLE(alpha);
1135
+ EIGEN_UNUSED_VARIABLE(C0);
1136
+ }
1137
+ };
1138
+
1139
+
1140
+ template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
1141
+ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> {
1142
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
1143
+ typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
1144
+
1145
+ typedef typename Traits::ResScalar ResScalar;
1146
+ typedef typename SwappedTraits::LhsPacket SLhsPacket;
1147
+ typedef typename SwappedTraits::RhsPacket SRhsPacket;
1148
+ typedef typename SwappedTraits::ResPacket SResPacket;
1149
+ typedef typename SwappedTraits::AccPacket SAccPacket;
1150
+
1151
+ EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
1152
+ const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
1153
+ ResScalar alpha, SAccPacket &C0)
1154
+ {
1155
+ typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
1156
+ typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
1157
+ typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
1158
+ typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter;
1159
+
1160
+ SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
1161
+ SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);
1162
+
1163
+ if (depth - endk > 0)
1164
+ {
1165
+ // We have to handle the last row(s) of the rhs, which
1166
+ // correspond to a quarter-packet
1167
+ SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));
1168
+
1169
+ for (Index kk = endk; kk < depth; kk++)
1170
+ {
1171
+ SLhsPacketQuarter a0;
1172
+ SRhsPacketQuarter b0;
1173
+ straits.loadLhsUnaligned(blB, a0);
1174
+ straits.loadRhs(blA, b0);
1175
+ straits.madd(a0,b0,c0,b0, fix<0>);
1176
+ blB += SwappedTraits::LhsProgress/4;
1177
+ blA += 1;
1178
+ }
1179
+ straits.acc(c0, alphav, R);
1180
+ }
1181
+ else
1182
+ {
1183
+ straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
1184
+ }
1185
+ res.scatterPacket(i, j2, R);
1186
+ }
1187
+ };
1188
+
1189
+ template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
1190
+ struct lhs_process_one_packet
1191
+ {
1192
+ typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
1193
+
1194
+ EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
1195
+ {
1196
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
1197
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
1198
+ traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
1199
+ traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
1200
+ traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
1201
+ traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
1202
+ traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
1203
+ traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
1204
+ #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
1205
+ __asm__ ("" : "+x,m" (*A0));
1206
+ #endif
1207
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
1208
+ }
1209
+
1210
+ EIGEN_STRONG_INLINE void operator()(
1211
+ const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
1212
+ Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB,
1213
+ int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
1214
+ {
1215
+ GEBPTraits traits;
1216
+
1217
+ // loops on each largest micro horizontal panel of lhs
1218
+ // (LhsProgress x depth)
1219
+ for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
1220
+ {
1221
+ // loops on each largest micro vertical panel of rhs (depth * nr)
1222
+ for(Index j2=0; j2<packet_cols4; j2+=nr)
1223
+ {
1224
+ // We select a LhsProgress x nr micro block of res
1225
+ // which is entirely stored into 1 x nr registers.
1226
+
1227
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
1228
+ prefetch(&blA[0]);
1229
+
1230
+ // gets res block as register
1231
+ AccPacket C0, C1, C2, C3;
1232
+ traits.initAcc(C0);
1233
+ traits.initAcc(C1);
1234
+ traits.initAcc(C2);
1235
+ traits.initAcc(C3);
1236
+ // To improve instruction pipelining, let's double the accumulation registers:
1237
+ // even k will accumulate in C*, while odd k will accumulate in D*.
1238
+ // This trick is crucial to get good performance with FMA, otherwise it is
1239
+ // actually faster to perform separated MUL+ADD because of a naturally
1240
+ // better instruction-level parallelism.
1241
+ AccPacket D0, D1, D2, D3;
1242
+ traits.initAcc(D0);
1243
+ traits.initAcc(D1);
1244
+ traits.initAcc(D2);
1245
+ traits.initAcc(D3);
1246
+
1247
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1248
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1249
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1250
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1251
+
1252
+ r0.prefetch(prefetch_res_offset);
1253
+ r1.prefetch(prefetch_res_offset);
1254
+ r2.prefetch(prefetch_res_offset);
1255
+ r3.prefetch(prefetch_res_offset);
1256
+
1257
+ // performs "inner" products
1258
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1259
+ prefetch(&blB[0]);
1260
+ LhsPacket A0, A1;
1261
+
1262
+ for(Index k=0; k<peeled_kc; k+=pk)
1263
+ {
1264
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
1265
+ RhsPacketx4 rhs_panel;
1266
+ RhsPacket T0;
1267
+
1268
+ internal::prefetch(blB+(48+0));
1269
+ peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1270
+ peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1271
+ peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1272
+ peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1273
+ internal::prefetch(blB+(48+16));
1274
+ peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1275
+ peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1276
+ peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1277
+ peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1278
+
1279
+ blB += pk*4*RhsProgress;
1280
+ blA += pk*LhsProgress;
1281
+
1282
+ EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
1283
+ }
1284
+ C0 = padd(C0,D0);
1285
+ C1 = padd(C1,D1);
1286
+ C2 = padd(C2,D2);
1287
+ C3 = padd(C3,D3);
1288
+
1289
+ // process remaining peeled loop
1290
+ for(Index k=peeled_kc; k<depth; k++)
1291
+ {
1292
+ RhsPacketx4 rhs_panel;
1293
+ RhsPacket T0;
1294
+ peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1295
+ blB += 4*RhsProgress;
1296
+ blA += LhsProgress;
1297
+ }
1298
+
1299
+ ResPacket R0, R1;
1300
+ ResPacket alphav = pset1<ResPacket>(alpha);
1301
+
1302
+ R0 = r0.template loadPacket<ResPacket>(0);
1303
+ R1 = r1.template loadPacket<ResPacket>(0);
1304
+ traits.acc(C0, alphav, R0);
1305
+ traits.acc(C1, alphav, R1);
1306
+ r0.storePacket(0, R0);
1307
+ r1.storePacket(0, R1);
1308
+
1309
+ R0 = r2.template loadPacket<ResPacket>(0);
1310
+ R1 = r3.template loadPacket<ResPacket>(0);
1311
+ traits.acc(C2, alphav, R0);
1312
+ traits.acc(C3, alphav, R1);
1313
+ r2.storePacket(0, R0);
1314
+ r3.storePacket(0, R1);
1315
+ }
1316
+
1317
+ // Deal with remaining columns of the rhs
1318
+ for(Index j2=packet_cols4; j2<cols; j2++)
1319
+ {
1320
+ // One column at a time
1321
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
1322
+ prefetch(&blA[0]);
1323
+
1324
+ // gets res block as register
1325
+ AccPacket C0;
1326
+ traits.initAcc(C0);
1327
+
1328
+ LinearMapper r0 = res.getLinearMapper(i, j2);
1329
+
1330
+ // performs "inner" products
1331
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1332
+ LhsPacket A0;
1333
+
1334
+ for(Index k= 0; k<peeled_kc; k+=pk)
1335
+ {
1336
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
1337
+ RhsPacket B_0;
1338
+
1339
+ #define EIGEN_GEBGP_ONESTEP(K) \
1340
+ do { \
1341
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
1342
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1343
+ /* FIXME: why unaligned???? */ \
1344
+ traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
1345
+ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1346
+ traits.madd(A0, B_0, C0, B_0, fix<0>); \
1347
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
1348
+ } while(false);
1349
+
1350
+ EIGEN_GEBGP_ONESTEP(0);
1351
+ EIGEN_GEBGP_ONESTEP(1);
1352
+ EIGEN_GEBGP_ONESTEP(2);
1353
+ EIGEN_GEBGP_ONESTEP(3);
1354
+ EIGEN_GEBGP_ONESTEP(4);
1355
+ EIGEN_GEBGP_ONESTEP(5);
1356
+ EIGEN_GEBGP_ONESTEP(6);
1357
+ EIGEN_GEBGP_ONESTEP(7);
1358
+
1359
+ blB += pk*RhsProgress;
1360
+ blA += pk*LhsProgress;
1361
+
1362
+ EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
1363
+ }
1364
+
1365
+ // process remaining peeled loop
1366
+ for(Index k=peeled_kc; k<depth; k++)
1367
+ {
1368
+ RhsPacket B_0;
1369
+ EIGEN_GEBGP_ONESTEP(0);
1370
+ blB += RhsProgress;
1371
+ blA += LhsProgress;
1372
+ }
1373
+ #undef EIGEN_GEBGP_ONESTEP
1374
+ ResPacket R0;
1375
+ ResPacket alphav = pset1<ResPacket>(alpha);
1376
+ R0 = r0.template loadPacket<ResPacket>(0);
1377
+ traits.acc(C0, alphav, R0);
1378
+ r0.storePacket(0, R0);
1379
+ }
1380
+ }
1381
+ }
1382
+ };
1383
+
1384
+ template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
1385
+ struct lhs_process_fraction_of_packet : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
1386
+ {
1387
+
1388
+ EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
1389
+ {
1390
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
1391
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
1392
+ traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
1393
+ traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
1394
+ traits.madd(*A0, *B_0, *C0, *B_0);
1395
+ traits.madd(*A0, *B1, *C1, *B1);
1396
+ traits.madd(*A0, *B2, *C2, *B2);
1397
+ traits.madd(*A0, *B3, *C3, *B3);
1398
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
1399
+ }
1400
+ };
1401
+
890
1402
  template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
891
1403
  EIGEN_DONT_INLINE
892
1404
  void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
@@ -903,10 +1415,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
903
1415
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
904
1416
  const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
905
1417
  const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
906
- const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
1418
+ const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
1419
+ const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
1420
+ const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
907
1421
  enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
908
1422
  const Index peeled_kc = depth & ~(pk-1);
909
- const Index prefetch_res_offset = 32/sizeof(ResScalar);
1423
+ const int prefetch_res_offset = 32/sizeof(ResScalar);
910
1424
  // const Index depth2 = depth & ~1;
911
1425
 
912
1426
  //---------- Process 3 * LhsProgress rows at once ----------
@@ -964,36 +1478,48 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
964
1478
  for(Index k=0; k<peeled_kc; k+=pk)
965
1479
  {
966
1480
  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
967
- RhsPacket B_0, T0;
1481
+ // 15 registers are taken (12 for acc, 3 for lhs).
1482
+ RhsPanel15 rhs_panel;
1483
+ RhsPacket T0;
968
1484
  LhsPacket A2;
969
-
970
- #define EIGEN_GEBP_ONESTEP(K) \
971
- do { \
972
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
1485
+ #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
1486
+ // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
1487
+ // without this workaround A0, A1, and A2 are loaded in the same register,
1488
+ // which is not good for pipelining
1489
+ #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
1490
+ #else
1491
+ #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
1492
+ #endif
1493
+ #define EIGEN_GEBP_ONESTEP(K) \
1494
+ do { \
1495
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
973
1496
  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
974
- internal::prefetch(blA+(3*K+16)*LhsProgress); \
975
- if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
976
- traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
977
- traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
978
- traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
979
- traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
980
- traits.madd(A0, B_0, C0, T0); \
981
- traits.madd(A1, B_0, C4, T0); \
982
- traits.madd(A2, B_0, C8, B_0); \
983
- traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
984
- traits.madd(A0, B_0, C1, T0); \
985
- traits.madd(A1, B_0, C5, T0); \
986
- traits.madd(A2, B_0, C9, B_0); \
987
- traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
988
- traits.madd(A0, B_0, C2, T0); \
989
- traits.madd(A1, B_0, C6, T0); \
990
- traits.madd(A2, B_0, C10, B_0); \
991
- traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
992
- traits.madd(A0, B_0, C3 , T0); \
993
- traits.madd(A1, B_0, C7, T0); \
994
- traits.madd(A2, B_0, C11, B_0); \
995
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
996
- } while(false)
1497
+ internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
1498
+ if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
1499
+ internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
1500
+ } /* Bug 953 */ \
1501
+ traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1502
+ traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1503
+ traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1504
+ EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
1505
+ traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \
1506
+ traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1507
+ traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1508
+ traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
1509
+ traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \
1510
+ traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1511
+ traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1512
+ traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
1513
+ traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \
1514
+ traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1515
+ traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1516
+ traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
1517
+ traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \
1518
+ traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1519
+ traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1520
+ traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
1521
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
1522
+ } while (false)
997
1523
 
998
1524
  internal::prefetch(blB);
999
1525
  EIGEN_GEBP_ONESTEP(0);
@@ -1013,7 +1539,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1013
1539
  // process remaining peeled loop
1014
1540
  for(Index k=peeled_kc; k<depth; k++)
1015
1541
  {
1016
- RhsPacket B_0, T0;
1542
+ RhsPanel15 rhs_panel;
1543
+ RhsPacket T0;
1017
1544
  LhsPacket A2;
1018
1545
  EIGEN_GEBP_ONESTEP(0);
1019
1546
  blB += 4*RhsProgress;
@@ -1025,9 +1552,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1025
1552
  ResPacket R0, R1, R2;
1026
1553
  ResPacket alphav = pset1<ResPacket>(alpha);
1027
1554
 
1028
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1029
- R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1030
- R2 = r0.loadPacket(2 * Traits::ResPacketSize);
1555
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1556
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1557
+ R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1031
1558
  traits.acc(C0, alphav, R0);
1032
1559
  traits.acc(C4, alphav, R1);
1033
1560
  traits.acc(C8, alphav, R2);
@@ -1035,9 +1562,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1035
1562
  r0.storePacket(1 * Traits::ResPacketSize, R1);
1036
1563
  r0.storePacket(2 * Traits::ResPacketSize, R2);
1037
1564
 
1038
- R0 = r1.loadPacket(0 * Traits::ResPacketSize);
1039
- R1 = r1.loadPacket(1 * Traits::ResPacketSize);
1040
- R2 = r1.loadPacket(2 * Traits::ResPacketSize);
1565
+ R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1566
+ R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1567
+ R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1041
1568
  traits.acc(C1, alphav, R0);
1042
1569
  traits.acc(C5, alphav, R1);
1043
1570
  traits.acc(C9, alphav, R2);
@@ -1045,9 +1572,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1045
1572
  r1.storePacket(1 * Traits::ResPacketSize, R1);
1046
1573
  r1.storePacket(2 * Traits::ResPacketSize, R2);
1047
1574
 
1048
- R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1049
- R1 = r2.loadPacket(1 * Traits::ResPacketSize);
1050
- R2 = r2.loadPacket(2 * Traits::ResPacketSize);
1575
+ R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1576
+ R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1577
+ R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1051
1578
  traits.acc(C2, alphav, R0);
1052
1579
  traits.acc(C6, alphav, R1);
1053
1580
  traits.acc(C10, alphav, R2);
@@ -1055,9 +1582,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1055
1582
  r2.storePacket(1 * Traits::ResPacketSize, R1);
1056
1583
  r2.storePacket(2 * Traits::ResPacketSize, R2);
1057
1584
 
1058
- R0 = r3.loadPacket(0 * Traits::ResPacketSize);
1059
- R1 = r3.loadPacket(1 * Traits::ResPacketSize);
1060
- R2 = r3.loadPacket(2 * Traits::ResPacketSize);
1585
+ R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1586
+ R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1587
+ R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1061
1588
  traits.acc(C3, alphav, R0);
1062
1589
  traits.acc(C7, alphav, R1);
1063
1590
  traits.acc(C11, alphav, R2);
@@ -1093,20 +1620,20 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1093
1620
  {
1094
1621
  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
1095
1622
  RhsPacket B_0;
1096
- #define EIGEN_GEBGP_ONESTEP(K) \
1097
- do { \
1098
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1623
+ #define EIGEN_GEBGP_ONESTEP(K) \
1624
+ do { \
1625
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1099
1626
  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1100
- traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
1101
- traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
1102
- traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
1103
- traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1104
- traits.madd(A0, B_0, C0, B_0); \
1105
- traits.madd(A1, B_0, C4, B_0); \
1106
- traits.madd(A2, B_0, C8, B_0); \
1107
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1108
- } while(false)
1109
-
1627
+ traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1628
+ traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1629
+ traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1630
+ traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
1631
+ traits.madd(A0, B_0, C0, B_0, fix<0>); \
1632
+ traits.madd(A1, B_0, C4, B_0, fix<0>); \
1633
+ traits.madd(A2, B_0, C8, B_0, fix<0>); \
1634
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1635
+ } while (false)
1636
+
1110
1637
  EIGEN_GEBGP_ONESTEP(0);
1111
1638
  EIGEN_GEBGP_ONESTEP(1);
1112
1639
  EIGEN_GEBGP_ONESTEP(2);
@@ -1116,8 +1643,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1116
1643
  EIGEN_GEBGP_ONESTEP(6);
1117
1644
  EIGEN_GEBGP_ONESTEP(7);
1118
1645
 
1119
- blB += pk*RhsProgress;
1120
- blA += pk*3*Traits::LhsProgress;
1646
+ blB += int(pk) * int(RhsProgress);
1647
+ blA += int(pk) * 3 * int(Traits::LhsProgress);
1121
1648
 
1122
1649
  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
1123
1650
  }
@@ -1134,9 +1661,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1134
1661
  ResPacket R0, R1, R2;
1135
1662
  ResPacket alphav = pset1<ResPacket>(alpha);
1136
1663
 
1137
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1138
- R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1139
- R2 = r0.loadPacket(2 * Traits::ResPacketSize);
1664
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1665
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1666
+ R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1140
1667
  traits.acc(C0, alphav, R0);
1141
1668
  traits.acc(C4, alphav, R1);
1142
1669
  traits.acc(C8, alphav, R2);
@@ -1195,7 +1722,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1195
1722
  for(Index k=0; k<peeled_kc; k+=pk)
1196
1723
  {
1197
1724
  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
1198
- RhsPacket B_0, B1, B2, B3, T0;
1725
+ RhsPacketx4 rhs_panel;
1726
+ RhsPacket T0;
1199
1727
 
1200
1728
  // NOTE: the begin/end asm comments below work around bug 935!
1201
1729
  // but they are not enough for gcc>=6 without FMA (bug 1637)
@@ -1204,24 +1732,24 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1204
1732
  #else
1205
1733
  #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
1206
1734
  #endif
1207
- #define EIGEN_GEBGP_ONESTEP(K) \
1208
- do { \
1209
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
1210
- traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1211
- traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1212
- traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1213
- traits.madd(A0, B_0, C0, T0); \
1214
- traits.madd(A1, B_0, C4, B_0); \
1215
- traits.madd(A0, B1, C1, T0); \
1216
- traits.madd(A1, B1, C5, B1); \
1217
- traits.madd(A0, B2, C2, T0); \
1218
- traits.madd(A1, B2, C6, B2); \
1219
- traits.madd(A0, B3, C3, T0); \
1220
- traits.madd(A1, B3, C7, B3); \
1221
- EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
1222
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
1223
- } while(false)
1224
-
1735
+ #define EIGEN_GEBGP_ONESTEP(K) \
1736
+ do { \
1737
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
1738
+ traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
1739
+ traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
1740
+ traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
1741
+ traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1742
+ traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1743
+ traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1744
+ traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1745
+ traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1746
+ traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1747
+ traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1748
+ traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1749
+ EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
1750
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
1751
+ } while (false)
1752
+
1225
1753
  internal::prefetch(blB+(48+0));
1226
1754
  EIGEN_GEBGP_ONESTEP(0);
1227
1755
  EIGEN_GEBGP_ONESTEP(1);
@@ -1241,7 +1769,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1241
1769
  // process remaining peeled loop
1242
1770
  for(Index k=peeled_kc; k<depth; k++)
1243
1771
  {
1244
- RhsPacket B_0, B1, B2, B3, T0;
1772
+ RhsPacketx4 rhs_panel;
1773
+ RhsPacket T0;
1245
1774
  EIGEN_GEBGP_ONESTEP(0);
1246
1775
  blB += 4*RhsProgress;
1247
1776
  blA += 2*Traits::LhsProgress;
@@ -1251,10 +1780,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1251
1780
  ResPacket R0, R1, R2, R3;
1252
1781
  ResPacket alphav = pset1<ResPacket>(alpha);
1253
1782
 
1254
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1255
- R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1256
- R2 = r1.loadPacket(0 * Traits::ResPacketSize);
1257
- R3 = r1.loadPacket(1 * Traits::ResPacketSize);
1783
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1784
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1785
+ R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1786
+ R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1258
1787
  traits.acc(C0, alphav, R0);
1259
1788
  traits.acc(C4, alphav, R1);
1260
1789
  traits.acc(C1, alphav, R2);
@@ -1264,10 +1793,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1264
1793
  r1.storePacket(0 * Traits::ResPacketSize, R2);
1265
1794
  r1.storePacket(1 * Traits::ResPacketSize, R3);
1266
1795
 
1267
- R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1268
- R1 = r2.loadPacket(1 * Traits::ResPacketSize);
1269
- R2 = r3.loadPacket(0 * Traits::ResPacketSize);
1270
- R3 = r3.loadPacket(1 * Traits::ResPacketSize);
1796
+ R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1797
+ R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1798
+ R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1799
+ R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1271
1800
  traits.acc(C2, alphav, R0);
1272
1801
  traits.acc(C6, alphav, R1);
1273
1802
  traits.acc(C3, alphav, R2);
@@ -1312,8 +1841,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1312
1841
  traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1313
1842
  traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1314
1843
  traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1315
- traits.madd(A0, B_0, C0, B1); \
1316
- traits.madd(A1, B_0, C4, B_0); \
1844
+ traits.madd(A0, B_0, C0, B1, fix<0>); \
1845
+ traits.madd(A1, B_0, C4, B_0, fix<0>); \
1317
1846
  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
1318
1847
  } while(false)
1319
1848
 
@@ -1326,8 +1855,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1326
1855
  EIGEN_GEBGP_ONESTEP(6);
1327
1856
  EIGEN_GEBGP_ONESTEP(7);
1328
1857
 
1329
- blB += pk*RhsProgress;
1330
- blA += pk*2*Traits::LhsProgress;
1858
+ blB += int(pk) * int(RhsProgress);
1859
+ blA += int(pk) * 2 * int(Traits::LhsProgress);
1331
1860
 
1332
1861
  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
1333
1862
  }
@@ -1344,8 +1873,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1344
1873
  ResPacket R0, R1;
1345
1874
  ResPacket alphav = pset1<ResPacket>(alpha);
1346
1875
 
1347
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1348
- R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1876
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1877
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1349
1878
  traits.acc(C0, alphav, R0);
1350
1879
  traits.acc(C4, alphav, R1);
1351
1880
  r0.storePacket(0 * Traits::ResPacketSize, R0);
@@ -1357,186 +1886,43 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1357
1886
  //---------- Process 1 * LhsProgress rows at once ----------
1358
1887
  if(mr>=1*Traits::LhsProgress)
1359
1888
  {
1360
- // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
1361
- for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
1362
- {
1363
- // loops on each largest micro vertical panel of rhs (depth * nr)
1364
- for(Index j2=0; j2<packet_cols4; j2+=nr)
1365
- {
1366
- // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely
1367
- // stored into 1 x nr registers.
1368
-
1369
- const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
1370
- prefetch(&blA[0]);
1371
-
1372
- // gets res block as register
1373
- AccPacket C0, C1, C2, C3;
1374
- traits.initAcc(C0);
1375
- traits.initAcc(C1);
1376
- traits.initAcc(C2);
1377
- traits.initAcc(C3);
1378
-
1379
- LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1380
- LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1381
- LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1382
- LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1383
-
1384
- r0.prefetch(prefetch_res_offset);
1385
- r1.prefetch(prefetch_res_offset);
1386
- r2.prefetch(prefetch_res_offset);
1387
- r3.prefetch(prefetch_res_offset);
1388
-
1389
- // performs "inner" products
1390
- const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1391
- prefetch(&blB[0]);
1392
- LhsPacket A0;
1393
-
1394
- for(Index k=0; k<peeled_kc; k+=pk)
1395
- {
1396
- EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
1397
- RhsPacket B_0, B1, B2, B3;
1398
-
1399
- #define EIGEN_GEBGP_ONESTEP(K) \
1400
- do { \
1401
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
1402
- EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1403
- traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1404
- traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1405
- traits.madd(A0, B_0, C0, B_0); \
1406
- traits.madd(A0, B1, C1, B1); \
1407
- traits.madd(A0, B2, C2, B2); \
1408
- traits.madd(A0, B3, C3, B3); \
1409
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
1410
- } while(false)
1411
-
1412
- internal::prefetch(blB+(48+0));
1413
- EIGEN_GEBGP_ONESTEP(0);
1414
- EIGEN_GEBGP_ONESTEP(1);
1415
- EIGEN_GEBGP_ONESTEP(2);
1416
- EIGEN_GEBGP_ONESTEP(3);
1417
- internal::prefetch(blB+(48+16));
1418
- EIGEN_GEBGP_ONESTEP(4);
1419
- EIGEN_GEBGP_ONESTEP(5);
1420
- EIGEN_GEBGP_ONESTEP(6);
1421
- EIGEN_GEBGP_ONESTEP(7);
1422
-
1423
- blB += pk*4*RhsProgress;
1424
- blA += pk*1*LhsProgress;
1425
-
1426
- EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
1427
- }
1428
- // process remaining peeled loop
1429
- for(Index k=peeled_kc; k<depth; k++)
1430
- {
1431
- RhsPacket B_0, B1, B2, B3;
1432
- EIGEN_GEBGP_ONESTEP(0);
1433
- blB += 4*RhsProgress;
1434
- blA += 1*LhsProgress;
1435
- }
1436
- #undef EIGEN_GEBGP_ONESTEP
1437
-
1438
- ResPacket R0, R1;
1439
- ResPacket alphav = pset1<ResPacket>(alpha);
1440
-
1441
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1442
- R1 = r1.loadPacket(0 * Traits::ResPacketSize);
1443
- traits.acc(C0, alphav, R0);
1444
- traits.acc(C1, alphav, R1);
1445
- r0.storePacket(0 * Traits::ResPacketSize, R0);
1446
- r1.storePacket(0 * Traits::ResPacketSize, R1);
1447
-
1448
- R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1449
- R1 = r3.loadPacket(0 * Traits::ResPacketSize);
1450
- traits.acc(C2, alphav, R0);
1451
- traits.acc(C3, alphav, R1);
1452
- r2.storePacket(0 * Traits::ResPacketSize, R0);
1453
- r3.storePacket(0 * Traits::ResPacketSize, R1);
1454
- }
1455
-
1456
- // Deal with remaining columns of the rhs
1457
- for(Index j2=packet_cols4; j2<cols; j2++)
1458
- {
1459
- // One column at a time
1460
- const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
1461
- prefetch(&blA[0]);
1462
-
1463
- // gets res block as register
1464
- AccPacket C0;
1465
- traits.initAcc(C0);
1466
-
1467
- LinearMapper r0 = res.getLinearMapper(i, j2);
1468
-
1469
- // performs "inner" products
1470
- const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1471
- LhsPacket A0;
1472
-
1473
- for(Index k=0; k<peeled_kc; k+=pk)
1474
- {
1475
- EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
1476
- RhsPacket B_0;
1477
-
1478
- #define EIGEN_GEBGP_ONESTEP(K) \
1479
- do { \
1480
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
1481
- EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1482
- traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1483
- traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1484
- traits.madd(A0, B_0, C0, B_0); \
1485
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
1486
- } while(false);
1487
-
1488
- EIGEN_GEBGP_ONESTEP(0);
1489
- EIGEN_GEBGP_ONESTEP(1);
1490
- EIGEN_GEBGP_ONESTEP(2);
1491
- EIGEN_GEBGP_ONESTEP(3);
1492
- EIGEN_GEBGP_ONESTEP(4);
1493
- EIGEN_GEBGP_ONESTEP(5);
1494
- EIGEN_GEBGP_ONESTEP(6);
1495
- EIGEN_GEBGP_ONESTEP(7);
1496
-
1497
- blB += pk*RhsProgress;
1498
- blA += pk*1*Traits::LhsProgress;
1499
-
1500
- EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
1501
- }
1502
-
1503
- // process remaining peeled loop
1504
- for(Index k=peeled_kc; k<depth; k++)
1505
- {
1506
- RhsPacket B_0;
1507
- EIGEN_GEBGP_ONESTEP(0);
1508
- blB += RhsProgress;
1509
- blA += 1*Traits::LhsProgress;
1510
- }
1511
- #undef EIGEN_GEBGP_ONESTEP
1512
- ResPacket R0;
1513
- ResPacket alphav = pset1<ResPacket>(alpha);
1514
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1515
- traits.acc(C0, alphav, R0);
1516
- r0.storePacket(0 * Traits::ResPacketSize, R0);
1517
- }
1518
- }
1889
+ lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
1890
+ p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1891
+ }
1892
+ //---------- Process LhsProgressHalf rows at once ----------
1893
+ if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
1894
+ {
1895
+ lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
1896
+ p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1897
+ }
1898
+ //---------- Process LhsProgressQuarter rows at once ----------
1899
+ if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
1900
+ {
1901
+ lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
1902
+ p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1519
1903
  }
1520
1904
  //---------- Process remaining rows, 1 at once ----------
1521
- if(peeled_mc1<rows)
1905
+ if(peeled_mc_quarter<rows)
1522
1906
  {
1523
1907
  // loop on each panel of the rhs
1524
1908
  for(Index j2=0; j2<packet_cols4; j2+=nr)
1525
1909
  {
1526
1910
  // loop on each row of the lhs (1*LhsProgress x depth)
1527
- for(Index i=peeled_mc1; i<rows; i+=1)
1911
+ for(Index i=peeled_mc_quarter; i<rows; i+=1)
1528
1912
  {
1529
1913
  const LhsScalar* blA = &blockA[i*strideA+offsetA];
1530
1914
  prefetch(&blA[0]);
1531
1915
  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1532
1916
 
1533
- // The following piece of code wont work for 512 bit registers
1534
- // Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
1535
- // as nr (which is currently 4) for the return type.
1917
+ // If LhsProgress is 8 or 16, it assumes that there is a
1918
+ // half or quarter packet, respectively, of the same size as
1919
+ // nr (which is currently 4) for the return type.
1536
1920
  const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
1921
+ const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
1537
1922
  if ((SwappedTraits::LhsProgress % 4) == 0 &&
1538
- (SwappedTraits::LhsProgress <= 8) &&
1539
- (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr))
1923
+ (SwappedTraits::LhsProgress<=16) &&
1924
+ (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) &&
1925
+ (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
1540
1926
  {
1541
1927
  SAccPacket C0, C1, C2, C3;
1542
1928
  straits.initAcc(C0);
@@ -1559,15 +1945,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1559
1945
 
1560
1946
  straits.loadRhsQuad(blA+0*spk, B_0);
1561
1947
  straits.loadRhsQuad(blA+1*spk, B_1);
1562
- straits.madd(A0,B_0,C0,B_0);
1563
- straits.madd(A1,B_1,C1,B_1);
1948
+ straits.madd(A0,B_0,C0,B_0, fix<0>);
1949
+ straits.madd(A1,B_1,C1,B_1, fix<0>);
1564
1950
 
1565
1951
  straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
1566
1952
  straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
1567
1953
  straits.loadRhsQuad(blA+2*spk, B_0);
1568
1954
  straits.loadRhsQuad(blA+3*spk, B_1);
1569
- straits.madd(A0,B_0,C2,B_0);
1570
- straits.madd(A1,B_1,C3,B_1);
1955
+ straits.madd(A0,B_0,C2,B_0, fix<0>);
1956
+ straits.madd(A1,B_1,C3,B_1, fix<0>);
1571
1957
 
1572
1958
  blB += 4*SwappedTraits::LhsProgress;
1573
1959
  blA += 4*spk;
@@ -1580,7 +1966,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1580
1966
 
1581
1967
  straits.loadLhsUnaligned(blB, A0);
1582
1968
  straits.loadRhsQuad(blA, B_0);
1583
- straits.madd(A0,B_0,C0,B_0);
1969
+ straits.madd(A0,B_0,C0,B_0, fix<0>);
1584
1970
 
1585
1971
  blB += SwappedTraits::LhsProgress;
1586
1972
  blA += spk;
@@ -1590,7 +1976,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1590
1976
  // Special case where we have to first reduce the accumulation register C0
1591
1977
  typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
1592
1978
  typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
1593
- typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
1979
+ typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
1594
1980
  typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
1595
1981
 
1596
1982
  SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
@@ -1603,16 +1989,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1603
1989
  SRhsPacketHalf b0;
1604
1990
  straits.loadLhsUnaligned(blB, a0);
1605
1991
  straits.loadRhs(blA, b0);
1606
- SAccPacketHalf c0 = predux_downto4(C0);
1607
- straits.madd(a0,b0,c0,b0);
1992
+ SAccPacketHalf c0 = predux_half_dowto4(C0);
1993
+ straits.madd(a0,b0,c0,b0, fix<0>);
1608
1994
  straits.acc(c0, alphav, R);
1609
1995
  }
1610
1996
  else
1611
1997
  {
1612
- straits.acc(predux_downto4(C0), alphav, R);
1998
+ straits.acc(predux_half_dowto4(C0), alphav, R);
1613
1999
  }
1614
2000
  res.scatterPacket(i, j2, R);
1615
2001
  }
2002
+ else if (SwappedTraits::LhsProgress==16)
2003
+ {
2004
+ // Special case where we have to first reduce the
2005
+ // accumulation register C0. We specialize the block in
2006
+ // template form, so that LhsProgress < 16 paths don't
2007
+ // fail to compile
2008
+ last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
2009
+ p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0);
2010
+ }
1616
2011
  else
1617
2012
  {
1618
2013
  SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
@@ -1635,14 +2030,14 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1635
2030
 
1636
2031
  B_0 = blB[0];
1637
2032
  B_1 = blB[1];
1638
- CJMADD(cj,A0,B_0,C0, B_0);
1639
- CJMADD(cj,A0,B_1,C1, B_1);
1640
-
2033
+ C0 = cj.pmadd(A0,B_0,C0);
2034
+ C1 = cj.pmadd(A0,B_1,C1);
2035
+
1641
2036
  B_0 = blB[2];
1642
2037
  B_1 = blB[3];
1643
- CJMADD(cj,A0,B_0,C2, B_0);
1644
- CJMADD(cj,A0,B_1,C3, B_1);
1645
-
2038
+ C2 = cj.pmadd(A0,B_0,C2);
2039
+ C3 = cj.pmadd(A0,B_1,C3);
2040
+
1646
2041
  blB += 4;
1647
2042
  }
1648
2043
  res(i, j2 + 0) += alpha * C0;
@@ -1656,7 +2051,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1656
2051
  for(Index j2=packet_cols4; j2<cols; j2++)
1657
2052
  {
1658
2053
  // loop on each row of the lhs (1*LhsProgress x depth)
1659
- for(Index i=peeled_mc1; i<rows; i+=1)
2054
+ for(Index i=peeled_mc_quarter; i<rows; i+=1)
1660
2055
  {
1661
2056
  const LhsScalar* blA = &blockA[i*strideA+offsetA];
1662
2057
  prefetch(&blA[0]);
@@ -1667,7 +2062,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1667
2062
  {
1668
2063
  LhsScalar A0 = blA[k];
1669
2064
  RhsScalar B_0 = blB[k];
1670
- CJMADD(cj, A0, B_0, C0, B_0);
2065
+ C0 = cj.pmadd(A0, B_0, C0);
1671
2066
  }
1672
2067
  res(i, j2) += alpha * C0;
1673
2068
  }
@@ -1676,8 +2071,6 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1676
2071
  }
1677
2072
 
1678
2073
 
1679
- #undef CJMADD
1680
-
1681
2074
  // pack a block of the lhs
1682
2075
  // The traversal is as follow (mr==4):
1683
2076
  // 0 4 8 12 ...
@@ -1692,19 +2085,24 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1692
2085
  //
1693
2086
  // 32 33 34 35 ...
1694
2087
  // 36 36 38 39 ...
1695
- template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1696
- struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
2088
+ template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2089
+ struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
1697
2090
  {
1698
2091
  typedef typename DataMapper::LinearMapper LinearMapper;
1699
2092
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
1700
2093
  };
1701
2094
 
1702
- template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1703
- EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
2095
+ template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2096
+ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
1704
2097
  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
1705
2098
  {
1706
- typedef typename packet_traits<Scalar>::type Packet;
1707
- enum { PacketSize = packet_traits<Scalar>::size };
2099
+ typedef typename unpacket_traits<Packet>::half HalfPacket;
2100
+ typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2101
+ enum { PacketSize = unpacket_traits<Packet>::size,
2102
+ HalfPacketSize = unpacket_traits<HalfPacket>::size,
2103
+ QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2104
+ HasHalf = (int)HalfPacketSize < (int)PacketSize,
2105
+ HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
1708
2106
 
1709
2107
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
1710
2108
  EIGEN_UNUSED_VARIABLE(stride);
@@ -1716,9 +2114,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
1716
2114
 
1717
2115
  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
1718
2116
  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
1719
- const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
1720
- const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
1721
- : Pack2>1 ? (rows/Pack2)*Pack2 : 0;
2117
+ const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
2118
+ const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
2119
+ const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
2120
+ const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
2121
+ const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
2122
+ : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;
1722
2123
 
1723
2124
  Index i=0;
1724
2125
 
@@ -1732,9 +2133,9 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
1732
2133
  for(Index k=0; k<depth; k++)
1733
2134
  {
1734
2135
  Packet A, B, C;
1735
- A = lhs.loadPacket(i+0*PacketSize, k);
1736
- B = lhs.loadPacket(i+1*PacketSize, k);
1737
- C = lhs.loadPacket(i+2*PacketSize, k);
2136
+ A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2137
+ B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
2138
+ C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
1738
2139
  pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
1739
2140
  pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
1740
2141
  pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
@@ -1752,8 +2153,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
1752
2153
  for(Index k=0; k<depth; k++)
1753
2154
  {
1754
2155
  Packet A, B;
1755
- A = lhs.loadPacket(i+0*PacketSize, k);
1756
- B = lhs.loadPacket(i+1*PacketSize, k);
2156
+ A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2157
+ B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
1757
2158
  pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
1758
2159
  pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
1759
2160
  }
@@ -1770,27 +2171,67 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
1770
2171
  for(Index k=0; k<depth; k++)
1771
2172
  {
1772
2173
  Packet A;
1773
- A = lhs.loadPacket(i+0*PacketSize, k);
2174
+ A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
1774
2175
  pstore(blockA+count, cj.pconj(A));
1775
2176
  count+=PacketSize;
1776
2177
  }
1777
2178
  if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
1778
2179
  }
1779
2180
  }
1780
- // Pack scalars
2181
+ // Pack half packets
2182
+ if(HasHalf && Pack1>=HalfPacketSize)
2183
+ {
2184
+ for(; i<peeled_mc_half; i+=HalfPacketSize)
2185
+ {
2186
+ if(PanelMode) count += (HalfPacketSize) * offset;
2187
+
2188
+ for(Index k=0; k<depth; k++)
2189
+ {
2190
+ HalfPacket A;
2191
+ A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
2192
+ pstoreu(blockA+count, cj.pconj(A));
2193
+ count+=HalfPacketSize;
2194
+ }
2195
+ if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
2196
+ }
2197
+ }
2198
+ // Pack quarter packets
2199
+ if(HasQuarter && Pack1>=QuarterPacketSize)
2200
+ {
2201
+ for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
2202
+ {
2203
+ if(PanelMode) count += (QuarterPacketSize) * offset;
2204
+
2205
+ for(Index k=0; k<depth; k++)
2206
+ {
2207
+ QuarterPacket A;
2208
+ A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
2209
+ pstoreu(blockA+count, cj.pconj(A));
2210
+ count+=QuarterPacketSize;
2211
+ }
2212
+ if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
2213
+ }
2214
+ }
2215
+ // Pack2 may be *smaller* than PacketSize—that happens for
2216
+ // products like real * complex, where we have to go half the
2217
+ // progress on the lhs in order to duplicate those operands to
2218
+ // address both real & imaginary parts on the rhs. This portion will
2219
+ // pack those half ones until they match the number expected on the
2220
+ // last peeling loop at this point (for the rhs).
1781
2221
  if(Pack2<PacketSize && Pack2>1)
1782
2222
  {
1783
- for(; i<peeled_mc0; i+=Pack2)
2223
+ for(; i<peeled_mc0; i+=last_lhs_progress)
1784
2224
  {
1785
- if(PanelMode) count += Pack2 * offset;
2225
+ if(PanelMode) count += last_lhs_progress * offset;
1786
2226
 
1787
2227
  for(Index k=0; k<depth; k++)
1788
- for(Index w=0; w<Pack2; w++)
2228
+ for(Index w=0; w<last_lhs_progress; w++)
1789
2229
  blockA[count++] = cj(lhs(i+w, k));
1790
2230
 
1791
- if(PanelMode) count += Pack2 * (stride-offset-depth);
2231
+ if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
1792
2232
  }
1793
2233
  }
2234
+ // Pack scalars
1794
2235
  for(; i<rows; i++)
1795
2236
  {
1796
2237
  if(PanelMode) count += offset;
@@ -1800,19 +2241,24 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
1800
2241
  }
1801
2242
  }
1802
2243
 
1803
- template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1804
- struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
2244
+ template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2245
+ struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
1805
2246
  {
1806
2247
  typedef typename DataMapper::LinearMapper LinearMapper;
1807
2248
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
1808
2249
  };
1809
2250
 
1810
- template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1811
- EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
2251
+ template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2252
+ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
1812
2253
  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
1813
2254
  {
1814
- typedef typename packet_traits<Scalar>::type Packet;
1815
- enum { PacketSize = packet_traits<Scalar>::size };
2255
+ typedef typename unpacket_traits<Packet>::half HalfPacket;
2256
+ typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2257
+ enum { PacketSize = unpacket_traits<Packet>::size,
2258
+ HalfPacketSize = unpacket_traits<HalfPacket>::size,
2259
+ QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2260
+ HasHalf = (int)HalfPacketSize < (int)PacketSize,
2261
+ HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
1816
2262
 
1817
2263
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
1818
2264
  EIGEN_UNUSED_VARIABLE(stride);
@@ -1820,37 +2266,51 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Ro
1820
2266
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
1821
2267
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
1822
2268
  Index count = 0;
2269
+ bool gone_half = false, gone_quarter = false, gone_last = false;
1823
2270
 
1824
- // const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
1825
- // const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
1826
- // const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
1827
-
1828
- int pack = Pack1;
1829
2271
  Index i = 0;
2272
+ int pack = Pack1;
2273
+ int psize = PacketSize;
1830
2274
  while(pack>0)
1831
2275
  {
1832
2276
  Index remaining_rows = rows-i;
1833
- Index peeled_mc = i+(remaining_rows/pack)*pack;
2277
+ Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
2278
+ Index starting_pos = i;
1834
2279
  for(; i<peeled_mc; i+=pack)
1835
2280
  {
1836
2281
  if(PanelMode) count += pack * offset;
1837
2282
 
1838
- const Index peeled_k = (depth/PacketSize)*PacketSize;
1839
2283
  Index k=0;
1840
- if(pack>=PacketSize)
2284
+ if(pack>=psize && psize >= QuarterPacketSize)
1841
2285
  {
1842
- for(; k<peeled_k; k+=PacketSize)
2286
+ const Index peeled_k = (depth/psize)*psize;
2287
+ for(; k<peeled_k; k+=psize)
1843
2288
  {
1844
- for (Index m = 0; m < pack; m += PacketSize)
2289
+ for (Index m = 0; m < pack; m += psize)
1845
2290
  {
1846
- PacketBlock<Packet> kernel;
1847
- for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
1848
- ptranspose(kernel);
1849
- for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
2291
+ if (psize == PacketSize) {
2292
+ PacketBlock<Packet> kernel;
2293
+ for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
2294
+ ptranspose(kernel);
2295
+ for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
2296
+ } else if (HasHalf && psize == HalfPacketSize) {
2297
+ gone_half = true;
2298
+ PacketBlock<HalfPacket> kernel_half;
2299
+ for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
2300
+ ptranspose(kernel_half);
2301
+ for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
2302
+ } else if (HasQuarter && psize == QuarterPacketSize) {
2303
+ gone_quarter = true;
2304
+ PacketBlock<QuarterPacket> kernel_quarter;
2305
+ for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
2306
+ ptranspose(kernel_quarter);
2307
+ for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
2308
+ }
1850
2309
  }
1851
- count += PacketSize*pack;
2310
+ count += psize*pack;
1852
2311
  }
1853
2312
  }
2313
+
1854
2314
  for(; k<depth; k++)
1855
2315
  {
1856
2316
  Index w=0;
@@ -1873,9 +2333,28 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Ro
1873
2333
  if(PanelMode) count += pack * (stride-offset-depth);
1874
2334
  }
1875
2335
 
1876
- pack -= PacketSize;
1877
- if(pack<Pack2 && (pack+PacketSize)!=Pack2)
1878
- pack = Pack2;
2336
+ pack -= psize;
2337
+ Index left = rows - i;
2338
+ if (pack <= 0) {
2339
+ if (!gone_last &&
2340
+ (starting_pos == i || left >= psize/2 || left >= psize/4) &&
2341
+ ((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
2342
+ (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
2343
+ psize /= 2;
2344
+ pack = psize;
2345
+ continue;
2346
+ }
2347
+ // Pack2 may be *smaller* than PacketSize—that happens for
2348
+ // products like real * complex, where we have to go half the
2349
+ // progress on the lhs in order to duplicate those operands to
2350
+ // address both real & imaginary parts on the rhs. This portion will
2351
+ // pack those half ones until they match the number expected on the
2352
+ // last peeling loop at this point (for the rhs).
2353
+ if (Pack2 < PacketSize && !gone_last) {
2354
+ gone_last = true;
2355
+ psize = pack = left & ~1;
2356
+ }
2357
+ }
1879
2358
  }
1880
2359
 
1881
2360
  for(; i<rows; i++)
@@ -1931,7 +2410,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
1931
2410
  // const Scalar* b6 = &rhs[(j2+6)*rhsStride];
1932
2411
  // const Scalar* b7 = &rhs[(j2+7)*rhsStride];
1933
2412
  // Index k=0;
1934
- // if(PacketSize==8) // TODO enbale vectorized transposition for PacketSize==4
2413
+ // if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
1935
2414
  // {
1936
2415
  // for(; k<peeled_k; k+=PacketSize) {
1937
2416
  // PacketBlock<Packet> kernel;
@@ -1978,10 +2457,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
1978
2457
  {
1979
2458
  for(; k<peeled_k; k+=PacketSize) {
1980
2459
  PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
1981
- kernel.packet[0] = dm0.loadPacket(k);
1982
- kernel.packet[1%PacketSize] = dm1.loadPacket(k);
1983
- kernel.packet[2%PacketSize] = dm2.loadPacket(k);
1984
- kernel.packet[3%PacketSize] = dm3.loadPacket(k);
2460
+ kernel.packet[0 ] = dm0.template loadPacket<Packet>(k);
2461
+ kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
2462
+ kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
2463
+ kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
1985
2464
  ptranspose(kernel);
1986
2465
  pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
1987
2466
  pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
@@ -2022,94 +2501,104 @@ template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conj
2022
2501
  struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2023
2502
  {
2024
2503
  typedef typename packet_traits<Scalar>::type Packet;
2504
+ typedef typename unpacket_traits<Packet>::half HalfPacket;
2505
+ typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2025
2506
  typedef typename DataMapper::LinearMapper LinearMapper;
2026
- enum { PacketSize = packet_traits<Scalar>::size };
2027
- EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2028
- };
2029
-
2030
- template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2031
- EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2032
- ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2033
- {
2034
- EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
2035
- EIGEN_UNUSED_VARIABLE(stride);
2036
- EIGEN_UNUSED_VARIABLE(offset);
2037
- eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2038
- conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2039
- Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2040
- Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2041
- Index count = 0;
2042
-
2043
- // if(nr>=8)
2044
- // {
2045
- // for(Index j2=0; j2<packet_cols8; j2+=8)
2046
- // {
2047
- // // skip what we have before
2048
- // if(PanelMode) count += 8 * offset;
2049
- // for(Index k=0; k<depth; k++)
2050
- // {
2051
- // if (PacketSize==8) {
2052
- // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2053
- // pstoreu(blockB+count, cj.pconj(A));
2054
- // } else if (PacketSize==4) {
2055
- // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2056
- // Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
2057
- // pstoreu(blockB+count, cj.pconj(A));
2058
- // pstoreu(blockB+count+PacketSize, cj.pconj(B));
2059
- // } else {
2060
- // const Scalar* b0 = &rhs[k*rhsStride + j2];
2061
- // blockB[count+0] = cj(b0[0]);
2062
- // blockB[count+1] = cj(b0[1]);
2063
- // blockB[count+2] = cj(b0[2]);
2064
- // blockB[count+3] = cj(b0[3]);
2065
- // blockB[count+4] = cj(b0[4]);
2066
- // blockB[count+5] = cj(b0[5]);
2067
- // blockB[count+6] = cj(b0[6]);
2068
- // blockB[count+7] = cj(b0[7]);
2069
- // }
2070
- // count += 8;
2071
- // }
2072
- // // skip what we have after
2073
- // if(PanelMode) count += 8 * (stride-offset-depth);
2074
- // }
2075
- // }
2076
- if(nr>=4)
2507
+ enum { PacketSize = packet_traits<Scalar>::size,
2508
+ HalfPacketSize = unpacket_traits<HalfPacket>::size,
2509
+ QuarterPacketSize = unpacket_traits<QuarterPacket>::size};
2510
+ EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0)
2077
2511
  {
2078
- for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2512
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
2513
+ EIGEN_UNUSED_VARIABLE(stride);
2514
+ EIGEN_UNUSED_VARIABLE(offset);
2515
+ eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2516
+ const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
2517
+ const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
2518
+ conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2519
+ Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2520
+ Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2521
+ Index count = 0;
2522
+
2523
+ // if(nr>=8)
2524
+ // {
2525
+ // for(Index j2=0; j2<packet_cols8; j2+=8)
2526
+ // {
2527
+ // // skip what we have before
2528
+ // if(PanelMode) count += 8 * offset;
2529
+ // for(Index k=0; k<depth; k++)
2530
+ // {
2531
+ // if (PacketSize==8) {
2532
+ // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2533
+ // pstoreu(blockB+count, cj.pconj(A));
2534
+ // } else if (PacketSize==4) {
2535
+ // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2536
+ // Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
2537
+ // pstoreu(blockB+count, cj.pconj(A));
2538
+ // pstoreu(blockB+count+PacketSize, cj.pconj(B));
2539
+ // } else {
2540
+ // const Scalar* b0 = &rhs[k*rhsStride + j2];
2541
+ // blockB[count+0] = cj(b0[0]);
2542
+ // blockB[count+1] = cj(b0[1]);
2543
+ // blockB[count+2] = cj(b0[2]);
2544
+ // blockB[count+3] = cj(b0[3]);
2545
+ // blockB[count+4] = cj(b0[4]);
2546
+ // blockB[count+5] = cj(b0[5]);
2547
+ // blockB[count+6] = cj(b0[6]);
2548
+ // blockB[count+7] = cj(b0[7]);
2549
+ // }
2550
+ // count += 8;
2551
+ // }
2552
+ // // skip what we have after
2553
+ // if(PanelMode) count += 8 * (stride-offset-depth);
2554
+ // }
2555
+ // }
2556
+ if(nr>=4)
2079
2557
  {
2080
- // skip what we have before
2081
- if(PanelMode) count += 4 * offset;
2082
- for(Index k=0; k<depth; k++)
2558
+ for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2083
2559
  {
2084
- if (PacketSize==4) {
2085
- Packet A = rhs.loadPacket(k, j2);
2086
- pstoreu(blockB+count, cj.pconj(A));
2087
- count += PacketSize;
2088
- } else {
2089
- const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
2090
- blockB[count+0] = cj(dm0(0));
2091
- blockB[count+1] = cj(dm0(1));
2092
- blockB[count+2] = cj(dm0(2));
2093
- blockB[count+3] = cj(dm0(3));
2094
- count += 4;
2560
+ // skip what we have before
2561
+ if(PanelMode) count += 4 * offset;
2562
+ for(Index k=0; k<depth; k++)
2563
+ {
2564
+ if (PacketSize==4) {
2565
+ Packet A = rhs.template loadPacket<Packet>(k, j2);
2566
+ pstoreu(blockB+count, cj.pconj(A));
2567
+ count += PacketSize;
2568
+ } else if (HasHalf && HalfPacketSize==4) {
2569
+ HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
2570
+ pstoreu(blockB+count, cj.pconj(A));
2571
+ count += HalfPacketSize;
2572
+ } else if (HasQuarter && QuarterPacketSize==4) {
2573
+ QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
2574
+ pstoreu(blockB+count, cj.pconj(A));
2575
+ count += QuarterPacketSize;
2576
+ } else {
2577
+ const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
2578
+ blockB[count+0] = cj(dm0(0));
2579
+ blockB[count+1] = cj(dm0(1));
2580
+ blockB[count+2] = cj(dm0(2));
2581
+ blockB[count+3] = cj(dm0(3));
2582
+ count += 4;
2583
+ }
2095
2584
  }
2585
+ // skip what we have after
2586
+ if(PanelMode) count += 4 * (stride-offset-depth);
2096
2587
  }
2097
- // skip what we have after
2098
- if(PanelMode) count += 4 * (stride-offset-depth);
2099
2588
  }
2100
- }
2101
- // copy the remaining columns one at a time (nr==1)
2102
- for(Index j2=packet_cols4; j2<cols; ++j2)
2103
- {
2104
- if(PanelMode) count += offset;
2105
- for(Index k=0; k<depth; k++)
2589
+ // copy the remaining columns one at a time (nr==1)
2590
+ for(Index j2=packet_cols4; j2<cols; ++j2)
2106
2591
  {
2107
- blockB[count] = cj(rhs(k, j2));
2108
- count += 1;
2592
+ if(PanelMode) count += offset;
2593
+ for(Index k=0; k<depth; k++)
2594
+ {
2595
+ blockB[count] = cj(rhs(k, j2));
2596
+ count += 1;
2597
+ }
2598
+ if(PanelMode) count += stride-offset-depth;
2109
2599
  }
2110
- if(PanelMode) count += stride-offset-depth;
2111
2600
  }
2112
- }
2601
+ };
2113
2602
 
2114
2603
  } // end namespace internal
2115
2604