@smake/eigen 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (283)
  1. package/README.md +1 -1
  2. package/eigen/COPYING.APACHE +203 -0
  3. package/eigen/COPYING.BSD +1 -1
  4. package/eigen/COPYING.MINPACK +51 -52
  5. package/eigen/Eigen/Cholesky +0 -1
  6. package/eigen/Eigen/Core +108 -266
  7. package/eigen/Eigen/Eigenvalues +0 -1
  8. package/eigen/Eigen/Geometry +3 -6
  9. package/eigen/Eigen/Householder +0 -1
  10. package/eigen/Eigen/Jacobi +0 -1
  11. package/eigen/Eigen/KLUSupport +41 -0
  12. package/eigen/Eigen/LU +2 -5
  13. package/eigen/Eigen/OrderingMethods +0 -3
  14. package/eigen/Eigen/PaStiXSupport +1 -0
  15. package/eigen/Eigen/PardisoSupport +0 -0
  16. package/eigen/Eigen/QR +0 -1
  17. package/eigen/Eigen/QtAlignedMalloc +0 -1
  18. package/eigen/Eigen/SVD +0 -1
  19. package/eigen/Eigen/Sparse +0 -2
  20. package/eigen/Eigen/SparseCholesky +0 -8
  21. package/eigen/Eigen/SparseLU +4 -0
  22. package/eigen/Eigen/src/Cholesky/LDLT.h +42 -27
  23. package/eigen/Eigen/src/Cholesky/LLT.h +39 -23
  24. package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +90 -47
  25. package/eigen/Eigen/src/Core/ArithmeticSequence.h +413 -0
  26. package/eigen/Eigen/src/Core/Array.h +99 -11
  27. package/eigen/Eigen/src/Core/ArrayBase.h +1 -1
  28. package/eigen/Eigen/src/Core/ArrayWrapper.h +21 -21
  29. package/eigen/Eigen/src/Core/Assign.h +1 -1
  30. package/eigen/Eigen/src/Core/AssignEvaluator.h +125 -50
  31. package/eigen/Eigen/src/Core/Assign_MKL.h +10 -10
  32. package/eigen/Eigen/src/Core/BandMatrix.h +16 -16
  33. package/eigen/Eigen/src/Core/Block.h +56 -60
  34. package/eigen/Eigen/src/Core/BooleanRedux.h +29 -31
  35. package/eigen/Eigen/src/Core/CommaInitializer.h +7 -3
  36. package/eigen/Eigen/src/Core/CoreEvaluators.h +325 -272
  37. package/eigen/Eigen/src/Core/CoreIterators.h +5 -0
  38. package/eigen/Eigen/src/Core/CwiseBinaryOp.h +21 -22
  39. package/eigen/Eigen/src/Core/CwiseNullaryOp.h +153 -18
  40. package/eigen/Eigen/src/Core/CwiseUnaryOp.h +6 -6
  41. package/eigen/Eigen/src/Core/CwiseUnaryView.h +12 -10
  42. package/eigen/Eigen/src/Core/DenseBase.h +128 -39
  43. package/eigen/Eigen/src/Core/DenseCoeffsBase.h +25 -21
  44. package/eigen/Eigen/src/Core/DenseStorage.h +150 -68
  45. package/eigen/Eigen/src/Core/Diagonal.h +21 -23
  46. package/eigen/Eigen/src/Core/DiagonalMatrix.h +50 -2
  47. package/eigen/Eigen/src/Core/DiagonalProduct.h +1 -1
  48. package/eigen/Eigen/src/Core/Dot.h +10 -10
  49. package/eigen/Eigen/src/Core/EigenBase.h +10 -9
  50. package/eigen/Eigen/src/Core/ForceAlignedAccess.h +8 -4
  51. package/eigen/Eigen/src/Core/Fuzzy.h +3 -3
  52. package/eigen/Eigen/src/Core/GeneralProduct.h +20 -10
  53. package/eigen/Eigen/src/Core/GenericPacketMath.h +597 -147
  54. package/eigen/Eigen/src/Core/GlobalFunctions.h +40 -33
  55. package/eigen/Eigen/src/Core/IO.h +40 -7
  56. package/eigen/Eigen/src/Core/IndexedView.h +237 -0
  57. package/eigen/Eigen/src/Core/Inverse.h +9 -10
  58. package/eigen/Eigen/src/Core/Map.h +7 -7
  59. package/eigen/Eigen/src/Core/MapBase.h +5 -3
  60. package/eigen/Eigen/src/Core/MathFunctions.h +756 -120
  61. package/eigen/Eigen/src/Core/MathFunctionsImpl.h +118 -19
  62. package/eigen/Eigen/src/Core/Matrix.h +131 -25
  63. package/eigen/Eigen/src/Core/MatrixBase.h +19 -2
  64. package/eigen/Eigen/src/Core/NestByValue.h +25 -50
  65. package/eigen/Eigen/src/Core/NoAlias.h +4 -3
  66. package/eigen/Eigen/src/Core/NumTraits.h +107 -20
  67. package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +232 -0
  68. package/eigen/Eigen/src/Core/PermutationMatrix.h +3 -3
  69. package/eigen/Eigen/src/Core/PlainObjectBase.h +145 -54
  70. package/eigen/Eigen/src/Core/Product.h +30 -25
  71. package/eigen/Eigen/src/Core/ProductEvaluators.h +183 -142
  72. package/eigen/Eigen/src/Core/Random.h +37 -1
  73. package/eigen/Eigen/src/Core/Redux.h +180 -170
  74. package/eigen/Eigen/src/Core/Ref.h +118 -21
  75. package/eigen/Eigen/src/Core/Replicate.h +8 -8
  76. package/eigen/Eigen/src/Core/Reshaped.h +454 -0
  77. package/eigen/Eigen/src/Core/ReturnByValue.h +7 -5
  78. package/eigen/Eigen/src/Core/Reverse.h +18 -12
  79. package/eigen/Eigen/src/Core/Select.h +8 -6
  80. package/eigen/Eigen/src/Core/SelfAdjointView.h +33 -20
  81. package/eigen/Eigen/src/Core/Solve.h +14 -14
  82. package/eigen/Eigen/src/Core/SolveTriangular.h +13 -13
  83. package/eigen/Eigen/src/Core/SolverBase.h +41 -3
  84. package/eigen/Eigen/src/Core/StableNorm.h +100 -70
  85. package/eigen/Eigen/src/Core/StlIterators.h +463 -0
  86. package/eigen/Eigen/src/Core/Stride.h +9 -4
  87. package/eigen/Eigen/src/Core/Swap.h +5 -4
  88. package/eigen/Eigen/src/Core/Transpose.h +86 -27
  89. package/eigen/Eigen/src/Core/Transpositions.h +26 -8
  90. package/eigen/Eigen/src/Core/TriangularMatrix.h +88 -72
  91. package/eigen/Eigen/src/Core/VectorBlock.h +5 -5
  92. package/eigen/Eigen/src/Core/VectorwiseOp.h +159 -70
  93. package/eigen/Eigen/src/Core/Visitor.h +137 -29
  94. package/eigen/Eigen/src/Core/arch/AVX/Complex.h +50 -129
  95. package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +126 -337
  96. package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +1092 -155
  97. package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +65 -1
  98. package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +422 -0
  99. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +186 -213
  100. package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1250 -252
  101. package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +89 -0
  102. package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +152 -165
  103. package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +19 -251
  104. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2937 -0
  105. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +221 -0
  106. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +629 -0
  107. package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2042 -392
  108. package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +235 -80
  109. package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +700 -0
  110. package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +102 -14
  111. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1649 -0
  112. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +110 -0
  113. package/eigen/Eigen/src/Core/arch/Default/Half.h +942 -0
  114. package/eigen/Eigen/src/Core/arch/Default/Settings.h +1 -1
  115. package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +120 -0
  116. package/eigen/Eigen/src/Core/arch/{CUDA → GPU}/MathFunctions.h +16 -4
  117. package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1685 -0
  118. package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +80 -0
  119. package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  120. package/eigen/Eigen/src/Core/arch/MSA/Complex.h +648 -0
  121. package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +387 -0
  122. package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1233 -0
  123. package/eigen/Eigen/src/Core/arch/NEON/Complex.h +313 -219
  124. package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +183 -0
  125. package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +54 -70
  126. package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4376 -549
  127. package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1419 -0
  128. package/eigen/Eigen/src/Core/arch/SSE/Complex.h +59 -179
  129. package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +65 -428
  130. package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +893 -283
  131. package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +65 -0
  132. package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +44 -0
  133. package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +752 -0
  134. package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +49 -0
  135. package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +232 -0
  136. package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +301 -0
  137. package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +670 -0
  138. package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +694 -0
  139. package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +85 -0
  140. package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +212 -183
  141. package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +101 -5
  142. package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +510 -395
  143. package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +11 -2
  144. package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +112 -46
  145. package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +31 -30
  146. package/eigen/Eigen/src/Core/functors/StlFunctors.h +32 -2
  147. package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +354 -15
  148. package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1073 -585
  149. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +29 -7
  150. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +4 -4
  151. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +1 -1
  152. package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +382 -483
  153. package/eigen/Eigen/src/Core/products/Parallelizer.h +23 -9
  154. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +23 -6
  155. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +8 -6
  156. package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +2 -2
  157. package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +5 -4
  158. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +3 -3
  159. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +5 -3
  160. package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +13 -10
  161. package/eigen/Eigen/src/Core/util/BlasUtil.h +208 -124
  162. package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +512 -0
  163. package/eigen/Eigen/src/Core/util/Constants.h +25 -9
  164. package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +14 -2
  165. package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +28 -4
  166. package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +186 -0
  167. package/eigen/Eigen/src/Core/util/IntegralConstant.h +272 -0
  168. package/eigen/Eigen/src/Core/util/MKL_support.h +8 -1
  169. package/eigen/Eigen/src/Core/util/Macros.h +661 -250
  170. package/eigen/Eigen/src/Core/util/Memory.h +222 -52
  171. package/eigen/Eigen/src/Core/util/Meta.h +349 -105
  172. package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  173. package/eigen/Eigen/src/Core/util/StaticAssert.h +8 -5
  174. package/eigen/Eigen/src/Core/util/SymbolicIndex.h +293 -0
  175. package/eigen/Eigen/src/Core/util/XprHelper.h +48 -30
  176. package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +1 -1
  177. package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +1 -1
  178. package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +2 -2
  179. package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +1 -1
  180. package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +2 -2
  181. package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +2 -2
  182. package/eigen/Eigen/src/Eigenvalues/RealQZ.h +9 -6
  183. package/eigen/Eigen/src/Eigenvalues/RealSchur.h +10 -5
  184. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +75 -42
  185. package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +20 -15
  186. package/eigen/Eigen/src/Geometry/AlignedBox.h +99 -5
  187. package/eigen/Eigen/src/Geometry/AngleAxis.h +4 -4
  188. package/eigen/Eigen/src/Geometry/EulerAngles.h +3 -3
  189. package/eigen/Eigen/src/Geometry/Homogeneous.h +15 -11
  190. package/eigen/Eigen/src/Geometry/Hyperplane.h +1 -1
  191. package/eigen/Eigen/src/Geometry/OrthoMethods.h +3 -2
  192. package/eigen/Eigen/src/Geometry/ParametrizedLine.h +39 -2
  193. package/eigen/Eigen/src/Geometry/Quaternion.h +52 -14
  194. package/eigen/Eigen/src/Geometry/Rotation2D.h +3 -3
  195. package/eigen/Eigen/src/Geometry/Scaling.h +22 -4
  196. package/eigen/Eigen/src/Geometry/Transform.h +86 -65
  197. package/eigen/Eigen/src/Geometry/Translation.h +6 -6
  198. package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +168 -0
  199. package/eigen/Eigen/src/Householder/BlockHouseholder.h +9 -2
  200. package/eigen/Eigen/src/Householder/Householder.h +8 -4
  201. package/eigen/Eigen/src/Householder/HouseholderSequence.h +123 -48
  202. package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +15 -15
  203. package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +7 -23
  204. package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +5 -22
  205. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +41 -47
  206. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +51 -60
  207. package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +70 -20
  208. package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +2 -20
  209. package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +11 -9
  210. package/eigen/Eigen/src/Jacobi/Jacobi.h +31 -10
  211. package/eigen/Eigen/src/KLUSupport/KLUSupport.h +358 -0
  212. package/eigen/Eigen/src/LU/Determinant.h +35 -19
  213. package/eigen/Eigen/src/LU/FullPivLU.h +29 -43
  214. package/eigen/Eigen/src/LU/InverseImpl.h +25 -8
  215. package/eigen/Eigen/src/LU/PartialPivLU.h +67 -57
  216. package/eigen/Eigen/src/LU/arch/InverseSize4.h +351 -0
  217. package/eigen/Eigen/src/OrderingMethods/Amd.h +7 -17
  218. package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +297 -277
  219. package/eigen/Eigen/src/OrderingMethods/Ordering.h +6 -10
  220. package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +1 -1
  221. package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +10 -9
  222. package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +41 -20
  223. package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +100 -27
  224. package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +59 -22
  225. package/eigen/Eigen/src/QR/HouseholderQR.h +48 -23
  226. package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +25 -3
  227. package/eigen/Eigen/src/SVD/BDCSVD.h +137 -48
  228. package/eigen/Eigen/src/SVD/JacobiSVD.h +22 -14
  229. package/eigen/Eigen/src/SVD/SVDBase.h +82 -21
  230. package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +3 -3
  231. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +16 -8
  232. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +11 -36
  233. package/eigen/Eigen/src/SparseCore/CompressedStorage.h +16 -0
  234. package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +6 -6
  235. package/eigen/Eigen/src/SparseCore/SparseAssign.h +81 -27
  236. package/eigen/Eigen/src/SparseCore/SparseBlock.h +25 -57
  237. package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +40 -11
  238. package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +11 -15
  239. package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +2 -2
  240. package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +30 -8
  241. package/eigen/Eigen/src/SparseCore/SparseMatrix.h +124 -10
  242. package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +5 -12
  243. package/eigen/Eigen/src/SparseCore/SparseProduct.h +13 -1
  244. package/eigen/Eigen/src/SparseCore/SparseRef.h +7 -7
  245. package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +3 -0
  246. package/eigen/Eigen/src/SparseCore/SparseUtil.h +8 -0
  247. package/eigen/Eigen/src/SparseCore/SparseVector.h +1 -1
  248. package/eigen/Eigen/src/SparseLU/SparseLU.h +160 -10
  249. package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +1 -1
  250. package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +76 -2
  251. package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +2 -2
  252. package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +1 -1
  253. package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +1 -1
  254. package/eigen/Eigen/src/SparseQR/SparseQR.h +19 -6
  255. package/eigen/Eigen/src/StlSupport/StdDeque.h +2 -14
  256. package/eigen/Eigen/src/StlSupport/StdList.h +2 -2
  257. package/eigen/Eigen/src/StlSupport/StdVector.h +2 -2
  258. package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +6 -8
  259. package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +175 -39
  260. package/eigen/Eigen/src/misc/lapacke.h +5 -4
  261. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +27 -1
  262. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +155 -11
  263. package/eigen/Eigen/src/plugins/BlockMethods.h +626 -242
  264. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +14 -0
  265. package/eigen/Eigen/src/plugins/IndexedViewMethods.h +262 -0
  266. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +4 -4
  267. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +10 -0
  268. package/eigen/Eigen/src/plugins/ReshapedMethods.h +149 -0
  269. package/eigen/README.md +2 -0
  270. package/lib/LibEigen.d.ts +4 -0
  271. package/lib/LibEigen.js +14 -0
  272. package/lib/index.d.ts +1 -1
  273. package/lib/index.js +7 -3
  274. package/package.json +2 -10
  275. package/eigen/Eigen/CMakeLists.txt +0 -19
  276. package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
  277. package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
  278. package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
  279. package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
  280. package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
  281. package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
  282. package/lib/eigen.d.ts +0 -2
  283. package/lib/eigen.js +0 -15
@@ -15,7 +15,13 @@ namespace Eigen {
15
15
 
16
16
  namespace internal {
17
17
 
18
- template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
18
+ enum GEBPPacketSizeType {
19
+ GEBPPacketFull = 0,
20
+ GEBPPacketHalf,
21
+ GEBPPacketQuarter
22
+ };
23
+
24
+ template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target, int _PacketSize=GEBPPacketFull>
19
25
  class gebp_traits;
20
26
 
21
27
 
@@ -25,16 +31,42 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff
25
31
  return a<=0 ? b : a;
26
32
  }
27
33
 
34
+ #if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
35
+ #define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
36
+ #else
37
+ #define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
38
+ #endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
39
+
40
+ #if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
41
+ #define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
42
+ #else
43
+ #define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
44
+ #endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
45
+
46
+ #if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
47
+ #define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
48
+ #else
49
+ #define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
50
+ #endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
51
+
28
52
  #if EIGEN_ARCH_i386_OR_x86_64
29
- const std::ptrdiff_t defaultL1CacheSize = 32*1024;
30
- const std::ptrdiff_t defaultL2CacheSize = 256*1024;
31
- const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
53
+ const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024);
54
+ const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024);
55
+ const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024);
56
+ #elif EIGEN_ARCH_PPC
57
+ const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024);
58
+ const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
59
+ const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024);
32
60
  #else
33
- const std::ptrdiff_t defaultL1CacheSize = 16*1024;
34
- const std::ptrdiff_t defaultL2CacheSize = 512*1024;
35
- const std::ptrdiff_t defaultL3CacheSize = 512*1024;
61
+ const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024);
62
+ const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
63
+ const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024);
36
64
  #endif
37
65
 
66
+ #undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
67
+ #undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
68
+ #undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
69
+
38
70
  /** \internal */
39
71
  struct CacheSizes {
40
72
  CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
@@ -50,7 +82,6 @@ struct CacheSizes {
50
82
  std::ptrdiff_t m_l3;
51
83
  };
52
84
 
53
-
54
85
  /** \internal */
55
86
  inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
56
87
  {
@@ -101,6 +132,16 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
101
132
  // at the register level. This small horizontal panel has to stay within L1 cache.
102
133
  std::ptrdiff_t l1, l2, l3;
103
134
  manage_caching_sizes(GetAction, &l1, &l2, &l3);
135
+ #ifdef EIGEN_VECTORIZE_AVX512
136
+ // We need to find a rationale for that, but without this adjustment,
137
+ // performance with AVX512 is pretty bad, like -20% slower.
138
+ // One reason is that with increasing packet-size, the blocking size k
139
+ // has to become pretty small if we want that 1 lhs panel fit within L1.
140
+ // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are:
141
+ // k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144.
142
+ // This is quite small for a good reuse of the accumulation registers.
143
+ l1 *= 4;
144
+ #endif
104
145
 
105
146
  if (num_threads > 1) {
106
147
  typedef typename Traits::ResScalar ResScalar;
@@ -308,35 +349,60 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_
308
349
  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
309
350
  }
310
351
 
311
- #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
312
- #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C);
313
- #else
314
-
315
- // FIXME (a bit overkill maybe ?)
316
-
317
- template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
318
- EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
319
- {
320
- c = cj.pmadd(a,b,c);
321
- }
322
- };
323
-
324
- template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
325
- EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
326
- {
327
- t = b; t = cj.pmul(a,t); c = padd(c,t);
328
- }
329
- };
352
+ template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
353
+ struct RhsPanelHelper {
354
+ private:
355
+ static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken;
356
+ public:
357
+ typedef typename conditional<remaining_registers>=4, RhsPacketx4, RhsPacket>::type type;
358
+ };
330
359
 
331
- template<typename CJ, typename A, typename B, typename C, typename T>
332
- EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
333
- {
334
- gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
335
- }
360
+ template <typename Packet>
361
+ struct QuadPacket
362
+ {
363
+ Packet B_0, B1, B2, B3;
364
+ const Packet& get(const FixedInt<0>&) const { return B_0; }
365
+ const Packet& get(const FixedInt<1>&) const { return B1; }
366
+ const Packet& get(const FixedInt<2>&) const { return B2; }
367
+ const Packet& get(const FixedInt<3>&) const { return B3; }
368
+ };
336
369
 
337
- #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T);
338
- // #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T);
339
- #endif
370
+ template <int N, typename T1, typename T2, typename T3>
371
+ struct packet_conditional { typedef T3 type; };
372
+
373
+ template <typename T1, typename T2, typename T3>
374
+ struct packet_conditional<GEBPPacketFull, T1, T2, T3> { typedef T1 type; };
375
+
376
+ template <typename T1, typename T2, typename T3>
377
+ struct packet_conditional<GEBPPacketHalf, T1, T2, T3> { typedef T2 type; };
378
+
379
+ #define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \
380
+ typedef typename packet_conditional<packet_size, \
381
+ typename packet_traits<name ## Scalar>::type, \
382
+ typename packet_traits<name ## Scalar>::half, \
383
+ typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
384
+ prefix ## name ## Packet
385
+
386
+ #define PACKET_DECL_COND(name, packet_size) \
387
+ typedef typename packet_conditional<packet_size, \
388
+ typename packet_traits<name ## Scalar>::type, \
389
+ typename packet_traits<name ## Scalar>::half, \
390
+ typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
391
+ name ## Packet
392
+
393
+ #define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \
394
+ typedef typename packet_conditional<packet_size, \
395
+ typename packet_traits<Scalar>::type, \
396
+ typename packet_traits<Scalar>::half, \
397
+ typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
398
+ prefix ## ScalarPacket
399
+
400
+ #define PACKET_DECL_COND_SCALAR(packet_size) \
401
+ typedef typename packet_conditional<packet_size, \
402
+ typename packet_traits<Scalar>::type, \
403
+ typename packet_traits<Scalar>::half, \
404
+ typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
405
+ ScalarPacket
340
406
 
341
407
  /* Vectorization logic
342
408
  * real*real: unpack rhs to constant packets, ...
@@ -348,7 +414,7 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_
348
414
  * cplx*real : unpack rhs to constant packets, ...
349
415
  * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
350
416
  */
351
- template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
417
+ template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
352
418
  class gebp_traits
353
419
  {
354
420
  public:
@@ -356,13 +422,17 @@ public:
356
422
  typedef _RhsScalar RhsScalar;
357
423
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
358
424
 
425
+ PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
426
+ PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
427
+ PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
428
+
359
429
  enum {
360
430
  ConjLhs = _ConjLhs,
361
431
  ConjRhs = _ConjRhs,
362
- Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
363
- LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
364
- RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
365
- ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
432
+ Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
433
+ LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
434
+ RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
435
+ ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
366
436
 
367
437
  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
368
438
 
@@ -371,10 +441,12 @@ public:
371
441
 
372
442
  // register block size along the M direction (currently, this one cannot be modified)
373
443
  default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
374
- #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
375
- // we assume 16 registers
444
+ #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \
445
+ && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914))
446
+ // we assume 16 registers or more
376
447
  // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
377
448
  // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
449
+ // Bug 1515: MSVC prior to v19.14 yields to register spilling.
378
450
  mr = Vectorizable ? 3*LhsPacketSize : default_mr,
379
451
  #else
380
452
  mr = default_mr,
@@ -384,37 +456,41 @@ public:
384
456
  RhsProgress = 1
385
457
  };
386
458
 
387
- typedef typename packet_traits<LhsScalar>::type _LhsPacket;
388
- typedef typename packet_traits<RhsScalar>::type _RhsPacket;
389
- typedef typename packet_traits<ResScalar>::type _ResPacket;
390
459
 
391
460
  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
392
461
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
393
462
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
463
+ typedef LhsPacket LhsPacket4Packing;
394
464
 
465
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
395
466
  typedef ResPacket AccPacket;
396
467
 
397
468
  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
398
469
  {
399
470
  p = pset1<ResPacket>(ResScalar(0));
400
471
  }
401
-
402
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
403
- {
404
- pbroadcast4(b, b0, b1, b2, b3);
405
- }
406
-
407
- // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
408
- // {
409
- // pbroadcast2(b, b0, b1);
410
- // }
411
-
472
+
412
473
  template<typename RhsPacketType>
413
474
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
414
475
  {
415
476
  dest = pset1<RhsPacketType>(*b);
416
477
  }
417
-
478
+
479
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
480
+ {
481
+ pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
482
+ }
483
+
484
+ template<typename RhsPacketType>
485
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
486
+ {
487
+ loadRhs(b, dest);
488
+ }
489
+
490
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
491
+ {
492
+ }
493
+
418
494
  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
419
495
  {
420
496
  dest = ploadquad<RhsPacket>(b);
@@ -432,8 +508,8 @@ public:
432
508
  dest = ploadu<LhsPacketType>(a);
433
509
  }
434
510
 
435
- template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
436
- EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
511
+ template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
512
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
437
513
  {
438
514
  conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
439
515
  // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
@@ -448,6 +524,12 @@ public:
448
524
  #endif
449
525
  }
450
526
 
527
+ template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
528
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
529
+ {
530
+ madd(a, b.get(lane), c, tmp, lane);
531
+ }
532
+
451
533
  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
452
534
  {
453
535
  r = pmadd(c,alpha,r);
@@ -461,21 +543,25 @@ public:
461
543
 
462
544
  };
463
545
 
464
- template<typename RealScalar, bool _ConjLhs>
465
- class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
546
+ template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize>
547
+ class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize>
466
548
  {
467
549
  public:
468
550
  typedef std::complex<RealScalar> LhsScalar;
469
551
  typedef RealScalar RhsScalar;
470
552
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
471
553
 
554
+ PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
555
+ PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
556
+ PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
557
+
472
558
  enum {
473
559
  ConjLhs = _ConjLhs,
474
560
  ConjRhs = false,
475
- Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
476
- LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
477
- RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
478
- ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
561
+ Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
562
+ LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
563
+ RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
564
+ ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
479
565
 
480
566
  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
481
567
  nr = 4,
@@ -490,13 +576,12 @@ public:
490
576
  RhsProgress = 1
491
577
  };
492
578
 
493
- typedef typename packet_traits<LhsScalar>::type _LhsPacket;
494
- typedef typename packet_traits<RhsScalar>::type _RhsPacket;
495
- typedef typename packet_traits<ResScalar>::type _ResPacket;
496
-
497
579
  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
498
580
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
499
581
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
582
+ typedef LhsPacket LhsPacket4Packing;
583
+
584
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
500
585
 
501
586
  typedef ResPacket AccPacket;
502
587
 
@@ -505,42 +590,64 @@ public:
505
590
  p = pset1<ResPacket>(ResScalar(0));
506
591
  }
507
592
 
508
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
593
+ template<typename RhsPacketType>
594
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
509
595
  {
510
- dest = pset1<RhsPacket>(*b);
596
+ dest = pset1<RhsPacketType>(*b);
597
+ }
598
+
599
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
600
+ {
601
+ pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
511
602
  }
603
+
604
+ template<typename RhsPacketType>
605
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
606
+ {
607
+ loadRhs(b, dest);
608
+ }
609
+
610
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
611
+ {}
512
612
 
513
613
  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
514
614
  {
515
- dest = pset1<RhsPacket>(*b);
615
+ loadRhsQuad_impl(b,dest, typename conditional<RhsPacketSize==16,true_type,false_type>::type());
516
616
  }
517
617
 
518
- EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
618
+ EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const
519
619
  {
520
- dest = pload<LhsPacket>(a);
620
+ // FIXME we can do better!
621
+ // what we want here is a ploadheight
622
+ RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
623
+ dest = ploadquad<RhsPacket>(tmp);
521
624
  }
522
625
 
523
- EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
626
+ EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const
524
627
  {
525
- dest = ploadu<LhsPacket>(a);
628
+ eigen_internal_assert(RhsPacketSize<=8);
629
+ dest = pset1<RhsPacket>(*b);
526
630
  }
527
631
 
528
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
632
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
529
633
  {
530
- pbroadcast4(b, b0, b1, b2, b3);
634
+ dest = pload<LhsPacket>(a);
531
635
  }
532
-
533
- // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
534
- // {
535
- // pbroadcast2(b, b0, b1);
536
- // }
537
636
 
538
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
637
+ template<typename LhsPacketType>
638
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
639
+ {
640
+ dest = ploadu<LhsPacketType>(a);
641
+ }
642
+
643
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
644
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
539
645
  {
540
646
  madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
541
647
  }
542
648
 
543
- EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
649
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
650
+ EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
544
651
  {
545
652
  #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
546
653
  EIGEN_UNUSED_VARIABLE(tmp);
@@ -555,13 +662,20 @@ public:
555
662
  c += a * b;
556
663
  }
557
664
 
558
- EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
665
+ template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
666
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
559
667
  {
668
+ madd(a, b.get(lane), c, tmp, lane);
669
+ }
670
+
671
+ template <typename ResPacketType, typename AccPacketType>
672
+ EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
673
+ {
674
+ conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
560
675
  r = cj.pmadd(c,alpha,r);
561
676
  }
562
677
 
563
678
  protected:
564
- conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
565
679
  };
566
680
 
567
681
  template<typename Packet>
@@ -580,13 +694,57 @@ DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Pack
580
694
  return res;
581
695
  }
582
696
 
697
+ // note that for DoublePacket<RealPacket> the "4" in "downto4"
698
+ // corresponds to the number of complexes, so it means "8"
699
+ // it terms of real coefficients.
700
+
583
701
  template<typename Packet>
584
- const DoublePacket<Packet>& predux_downto4(const DoublePacket<Packet> &a)
702
+ const DoublePacket<Packet>&
703
+ predux_half_dowto4(const DoublePacket<Packet> &a,
704
+ typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0)
585
705
  {
586
706
  return a;
587
707
  }
588
708
 
589
- template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; };
709
+ template<typename Packet>
710
+ DoublePacket<typename unpacket_traits<Packet>::half>
711
+ predux_half_dowto4(const DoublePacket<Packet> &a,
712
+ typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0)
713
+ {
714
+ // yes, that's pretty hackish :(
715
+ DoublePacket<typename unpacket_traits<Packet>::half> res;
716
+ typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
717
+ typedef typename packet_traits<Cplx>::type CplxPacket;
718
+ res.first = predux_half_dowto4(CplxPacket(a.first)).v;
719
+ res.second = predux_half_dowto4(CplxPacket(a.second)).v;
720
+ return res;
721
+ }
722
+
723
+ // same here, "quad" actually means "8" in terms of real coefficients
724
+ template<typename Scalar, typename RealPacket>
725
+ void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
726
+ typename enable_if<unpacket_traits<RealPacket>::size<=8>::type* = 0)
727
+ {
728
+ dest.first = pset1<RealPacket>(numext::real(*b));
729
+ dest.second = pset1<RealPacket>(numext::imag(*b));
730
+ }
731
+
732
+ template<typename Scalar, typename RealPacket>
733
+ void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
734
+ typename enable_if<unpacket_traits<RealPacket>::size==16>::type* = 0)
735
+ {
736
+ // yes, that's pretty hackish too :(
737
+ typedef typename NumTraits<Scalar>::Real RealScalar;
738
+ RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
739
+ RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
740
+ dest.first = ploadquad<RealPacket>(r);
741
+ dest.second = ploadquad<RealPacket>(i);
742
+ }
743
+
744
+
745
+ template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > {
746
+ typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
747
+ };
590
748
  // template<typename Packet>
591
749
  // DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
592
750
  // {
@@ -596,8 +754,8 @@ template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typede
596
754
  // return res;
597
755
  // }
598
756
 
599
- template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
600
- class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
757
+ template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
758
+ class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize >
601
759
  {
602
760
  public:
603
761
  typedef std::complex<RealScalar> Scalar;
@@ -605,15 +763,21 @@ public:
605
763
  typedef std::complex<RealScalar> RhsScalar;
606
764
  typedef std::complex<RealScalar> ResScalar;
607
765
 
766
+ PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
767
+ PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
768
+ PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
769
+ PACKET_DECL_COND(Real, _PacketSize);
770
+ PACKET_DECL_COND_SCALAR(_PacketSize);
771
+
608
772
  enum {
609
773
  ConjLhs = _ConjLhs,
610
774
  ConjRhs = _ConjRhs,
611
- Vectorizable = packet_traits<RealScalar>::Vectorizable
612
- && packet_traits<Scalar>::Vectorizable,
613
- RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1,
614
- ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
615
- LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
616
- RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
775
+ Vectorizable = unpacket_traits<RealPacket>::vectorizable
776
+ && unpacket_traits<ScalarPacket>::vectorizable,
777
+ ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
778
+ LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
779
+ RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
780
+ RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
617
781
 
618
782
  // FIXME: should depend on NumberOfRegisters
619
783
  nr = 4,
@@ -623,14 +787,16 @@ public:
623
787
  RhsProgress = 1
624
788
  };
625
789
 
626
- typedef typename packet_traits<RealScalar>::type RealPacket;
627
- typedef typename packet_traits<Scalar>::type ScalarPacket;
628
- typedef DoublePacket<RealPacket> DoublePacketType;
790
+ typedef DoublePacket<RealPacket> DoublePacketType;
629
791
 
792
+ typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type LhsPacket4Packing;
630
793
  typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket;
631
794
  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
632
795
  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
633
796
  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
797
+
798
+ // this actualy holds 8 packets!
799
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
634
800
 
635
801
  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
636
802
 
@@ -641,51 +807,49 @@ public:
641
807
  }
642
808
 
643
809
  // Scalar path
644
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
810
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const
645
811
  {
646
- dest = pset1<ResPacket>(*b);
812
+ dest = pset1<ScalarPacket>(*b);
647
813
  }
648
814
 
649
815
  // Vectorized path
650
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
816
+ template<typename RealPacketType>
817
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
651
818
  {
652
- dest.first = pset1<RealPacket>(numext::real(*b));
653
- dest.second = pset1<RealPacket>(numext::imag(*b));
819
+ dest.first = pset1<RealPacketType>(numext::real(*b));
820
+ dest.second = pset1<RealPacketType>(numext::imag(*b));
654
821
  }
655
-
656
- EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
822
+
823
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
657
824
  {
658
- loadRhs(b,dest);
825
+ loadRhs(b, dest.B_0);
826
+ loadRhs(b + 1, dest.B1);
827
+ loadRhs(b + 2, dest.B2);
828
+ loadRhs(b + 3, dest.B3);
659
829
  }
660
- EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
830
+
831
+ // Scalar path
832
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const
661
833
  {
662
- eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4);
663
- loadRhs(b,dest);
834
+ loadRhs(b, dest);
664
835
  }
665
-
666
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
836
+
837
+ // Vectorized path
838
+ template<typename RealPacketType>
839
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
667
840
  {
668
- // FIXME not sure that's the best way to implement it!
669
- loadRhs(b+0, b0);
670
- loadRhs(b+1, b1);
671
- loadRhs(b+2, b2);
672
- loadRhs(b+3, b3);
841
+ loadRhs(b, dest);
673
842
  }
843
+
844
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
674
845
 
675
- // Vectorized path
676
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
846
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
677
847
  {
678
- // FIXME not sure that's the best way to implement it!
679
- loadRhs(b+0, b0);
680
- loadRhs(b+1, b1);
848
+ loadRhs(b,dest);
681
849
  }
682
-
683
- // Scalar path
684
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
850
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
685
851
  {
686
- // FIXME not sure that's the best way to implement it!
687
- loadRhs(b+0, b0);
688
- loadRhs(b+1, b1);
852
+ loadQuadToDoublePacket(b,dest);
689
853
  }
690
854
 
691
855
  // nothing special here
@@ -694,47 +858,59 @@ public:
694
858
  dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
695
859
  }
696
860
 
697
- EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
861
+ template<typename LhsPacketType>
862
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
698
863
  {
699
- dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
864
+ dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
700
865
  }
701
866
 
702
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const
867
+ template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType>
868
+ EIGEN_STRONG_INLINE
869
+ typename enable_if<!is_same<RhsPacketType,RhsPacketx4>::value>::type
870
+ madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const
703
871
  {
704
872
  c.first = padd(pmul(a,b.first), c.first);
705
873
  c.second = padd(pmul(a,b.second),c.second);
706
874
  }
707
875
 
708
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
876
+ template<typename LaneIdType>
877
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const
709
878
  {
710
879
  c = cj.pmadd(a,b,c);
711
880
  }
881
+
882
+ template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
883
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
884
+ {
885
+ madd(a, b.get(lane), c, tmp, lane);
886
+ }
712
887
 
713
888
  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
714
889
 
715
- EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
890
+ template<typename RealPacketType, typename ResPacketType>
891
+ EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha, ResPacketType& r) const
716
892
  {
717
893
  // assemble c
718
- ResPacket tmp;
894
+ ResPacketType tmp;
719
895
  if((!ConjLhs)&&(!ConjRhs))
720
896
  {
721
- tmp = pcplxflip(pconj(ResPacket(c.second)));
722
- tmp = padd(ResPacket(c.first),tmp);
897
+ tmp = pcplxflip(pconj(ResPacketType(c.second)));
898
+ tmp = padd(ResPacketType(c.first),tmp);
723
899
  }
724
900
  else if((!ConjLhs)&&(ConjRhs))
725
901
  {
726
- tmp = pconj(pcplxflip(ResPacket(c.second)));
727
- tmp = padd(ResPacket(c.first),tmp);
902
+ tmp = pconj(pcplxflip(ResPacketType(c.second)));
903
+ tmp = padd(ResPacketType(c.first),tmp);
728
904
  }
729
905
  else if((ConjLhs)&&(!ConjRhs))
730
906
  {
731
- tmp = pcplxflip(ResPacket(c.second));
732
- tmp = padd(pconj(ResPacket(c.first)),tmp);
907
+ tmp = pcplxflip(ResPacketType(c.second));
908
+ tmp = padd(pconj(ResPacketType(c.first)),tmp);
733
909
  }
734
910
  else if((ConjLhs)&&(ConjRhs))
735
911
  {
736
- tmp = pcplxflip(ResPacket(c.second));
737
- tmp = psub(pconj(ResPacket(c.first)),tmp);
912
+ tmp = pcplxflip(ResPacketType(c.second));
913
+ tmp = psub(pconj(ResPacketType(c.first)),tmp);
738
914
  }
739
915
 
740
916
  r = pmadd(tmp,alpha,r);
@@ -744,8 +920,8 @@ protected:
744
920
  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
745
921
  };
746
922
 
747
- template<typename RealScalar, bool _ConjRhs>
748
- class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
923
+ template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize>
924
+ class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize >
749
925
  {
750
926
  public:
751
927
  typedef std::complex<RealScalar> Scalar;
@@ -753,14 +929,25 @@ public:
753
929
  typedef Scalar RhsScalar;
754
930
  typedef Scalar ResScalar;
755
931
 
932
+ PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
933
+ PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
934
+ PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
935
+ PACKET_DECL_COND_PREFIX(_, Real, _PacketSize);
936
+ PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize);
937
+
938
+ #undef PACKET_DECL_COND_SCALAR_PREFIX
939
+ #undef PACKET_DECL_COND_PREFIX
940
+ #undef PACKET_DECL_COND_SCALAR
941
+ #undef PACKET_DECL_COND
942
+
756
943
  enum {
757
944
  ConjLhs = false,
758
945
  ConjRhs = _ConjRhs,
759
- Vectorizable = packet_traits<RealScalar>::Vectorizable
760
- && packet_traits<Scalar>::Vectorizable,
761
- LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
762
- RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
763
- ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
946
+ Vectorizable = unpacket_traits<_RealPacket>::vectorizable
947
+ && unpacket_traits<_ScalarPacket>::vectorizable,
948
+ LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
949
+ RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
950
+ ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
764
951
 
765
952
  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
766
953
  // FIXME: should depend on NumberOfRegisters
@@ -771,14 +958,11 @@ public:
771
958
  RhsProgress = 1
772
959
  };
773
960
 
774
- typedef typename packet_traits<LhsScalar>::type _LhsPacket;
775
- typedef typename packet_traits<RhsScalar>::type _RhsPacket;
776
- typedef typename packet_traits<ResScalar>::type _ResPacket;
777
-
778
961
  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
779
962
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
780
963
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
781
-
964
+ typedef LhsPacket LhsPacket4Packing;
965
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
782
966
  typedef ResPacket AccPacket;
783
967
 
784
968
  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
@@ -786,22 +970,25 @@ public:
786
970
  p = pset1<ResPacket>(ResScalar(0));
787
971
  }
788
972
 
789
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
973
+ template<typename RhsPacketType>
974
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
790
975
  {
791
- dest = pset1<RhsPacket>(*b);
976
+ dest = pset1<RhsPacketType>(*b);
792
977
  }
793
-
794
- void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
978
+
979
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
795
980
  {
796
- pbroadcast4(b, b0, b1, b2, b3);
981
+ pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
797
982
  }
798
-
799
- // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
800
- // {
801
- // // FIXME not sure that's the best way to implement it!
802
- // b0 = pload1<RhsPacket>(b+0);
803
- // b1 = pload1<RhsPacket>(b+1);
804
- // }
983
+
984
+ template<typename RhsPacketType>
985
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
986
+ {
987
+ loadRhs(b, dest);
988
+ }
989
+
990
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
991
+ {}
805
992
 
806
993
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
807
994
  {
@@ -810,21 +997,23 @@ public:
810
997
 
811
998
  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
812
999
  {
813
- eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4);
814
- loadRhs(b,dest);
1000
+ dest = ploadquad<RhsPacket>(b);
815
1001
  }
816
1002
 
817
- EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
1003
+ template<typename LhsPacketType>
1004
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
818
1005
  {
819
- dest = ploaddup<LhsPacket>(a);
1006
+ dest = ploaddup<LhsPacketType>(a);
820
1007
  }
821
1008
 
822
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
1009
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
1010
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
823
1011
  {
824
1012
  madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
825
1013
  }
826
1014
 
827
- EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
1015
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
1016
+ EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
828
1017
  {
829
1018
  #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
830
1019
  EIGEN_UNUSED_VARIABLE(tmp);
@@ -840,16 +1029,24 @@ public:
840
1029
  c += a * b;
841
1030
  }
842
1031
 
843
- EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
1032
+ template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
1033
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
1034
+ {
1035
+ madd(a, b.get(lane), c, tmp, lane);
1036
+ }
1037
+
1038
+ template <typename ResPacketType, typename AccPacketType>
1039
+ EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
844
1040
  {
1041
+ conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
845
1042
  r = cj.pmadd(alpha,c,r);
846
1043
  }
847
1044
 
848
1045
  protected:
849
- conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
1046
+
850
1047
  };
851
1048
 
852
- /* optimized GEneral packed Block * packed Panel product kernel
1049
+ /* optimized General packed Block * packed Panel product kernel
853
1050
  *
854
1051
  * Mixing type logic: C += A * B
855
1052
  * | A | B | comments
@@ -859,26 +1056,47 @@ protected:
859
1056
  template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
860
1057
  struct gebp_kernel
861
1058
  {
862
- typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
1059
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
1060
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketHalf> HalfTraits;
1061
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketQuarter> QuarterTraits;
1062
+
863
1063
  typedef typename Traits::ResScalar ResScalar;
864
1064
  typedef typename Traits::LhsPacket LhsPacket;
865
1065
  typedef typename Traits::RhsPacket RhsPacket;
866
1066
  typedef typename Traits::ResPacket ResPacket;
867
1067
  typedef typename Traits::AccPacket AccPacket;
1068
+ typedef typename Traits::RhsPacketx4 RhsPacketx4;
1069
+
1070
+ typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
1071
+
1072
+ typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
868
1073
 
869
- typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
870
1074
  typedef typename SwappedTraits::ResScalar SResScalar;
871
1075
  typedef typename SwappedTraits::LhsPacket SLhsPacket;
872
1076
  typedef typename SwappedTraits::RhsPacket SRhsPacket;
873
1077
  typedef typename SwappedTraits::ResPacket SResPacket;
874
1078
  typedef typename SwappedTraits::AccPacket SAccPacket;
875
1079
 
1080
+ typedef typename HalfTraits::LhsPacket LhsPacketHalf;
1081
+ typedef typename HalfTraits::RhsPacket RhsPacketHalf;
1082
+ typedef typename HalfTraits::ResPacket ResPacketHalf;
1083
+ typedef typename HalfTraits::AccPacket AccPacketHalf;
1084
+
1085
+ typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
1086
+ typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
1087
+ typedef typename QuarterTraits::ResPacket ResPacketQuarter;
1088
+ typedef typename QuarterTraits::AccPacket AccPacketQuarter;
1089
+
876
1090
  typedef typename DataMapper::LinearMapper LinearMapper;
877
1091
 
878
1092
  enum {
879
1093
  Vectorizable = Traits::Vectorizable,
880
1094
  LhsProgress = Traits::LhsProgress,
1095
+ LhsProgressHalf = HalfTraits::LhsProgress,
1096
+ LhsProgressQuarter = QuarterTraits::LhsProgress,
881
1097
  RhsProgress = Traits::RhsProgress,
1098
+ RhsProgressHalf = HalfTraits::RhsProgress,
1099
+ RhsProgressQuarter = QuarterTraits::RhsProgress,
882
1100
  ResPacketSize = Traits::ResPacketSize
883
1101
  };
884
1102
 
@@ -888,6 +1106,299 @@ struct gebp_kernel
888
1106
  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
889
1107
  };
890
1108
 
1109
+ template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs,
1110
+ int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress>
1111
+ struct last_row_process_16_packets
1112
+ {
1113
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
1114
+ typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
1115
+
1116
+ typedef typename Traits::ResScalar ResScalar;
1117
+ typedef typename SwappedTraits::LhsPacket SLhsPacket;
1118
+ typedef typename SwappedTraits::RhsPacket SRhsPacket;
1119
+ typedef typename SwappedTraits::ResPacket SResPacket;
1120
+ typedef typename SwappedTraits::AccPacket SAccPacket;
1121
+
1122
+ EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
1123
+ const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
1124
+ ResScalar alpha, SAccPacket &C0)
1125
+ {
1126
+ EIGEN_UNUSED_VARIABLE(res);
1127
+ EIGEN_UNUSED_VARIABLE(straits);
1128
+ EIGEN_UNUSED_VARIABLE(blA);
1129
+ EIGEN_UNUSED_VARIABLE(blB);
1130
+ EIGEN_UNUSED_VARIABLE(depth);
1131
+ EIGEN_UNUSED_VARIABLE(endk);
1132
+ EIGEN_UNUSED_VARIABLE(i);
1133
+ EIGEN_UNUSED_VARIABLE(j2);
1134
+ EIGEN_UNUSED_VARIABLE(alpha);
1135
+ EIGEN_UNUSED_VARIABLE(C0);
1136
+ }
1137
+ };
1138
+
1139
+
1140
+ template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
1141
+ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> {
1142
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
1143
+ typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
1144
+
1145
+ typedef typename Traits::ResScalar ResScalar;
1146
+ typedef typename SwappedTraits::LhsPacket SLhsPacket;
1147
+ typedef typename SwappedTraits::RhsPacket SRhsPacket;
1148
+ typedef typename SwappedTraits::ResPacket SResPacket;
1149
+ typedef typename SwappedTraits::AccPacket SAccPacket;
1150
+
1151
+ EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
1152
+ const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
1153
+ ResScalar alpha, SAccPacket &C0)
1154
+ {
1155
+ typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
1156
+ typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
1157
+ typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
1158
+ typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter;
1159
+
1160
+ SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
1161
+ SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);
1162
+
1163
+ if (depth - endk > 0)
1164
+ {
1165
+ // We have to handle the last row(s) of the rhs, which
1166
+ // correspond to a quarter-packet
1167
+ SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));
1168
+
1169
+ for (Index kk = endk; kk < depth; kk++)
1170
+ {
1171
+ SLhsPacketQuarter a0;
1172
+ SRhsPacketQuarter b0;
1173
+ straits.loadLhsUnaligned(blB, a0);
1174
+ straits.loadRhs(blA, b0);
1175
+ straits.madd(a0,b0,c0,b0, fix<0>);
1176
+ blB += SwappedTraits::LhsProgress/4;
1177
+ blA += 1;
1178
+ }
1179
+ straits.acc(c0, alphav, R);
1180
+ }
1181
+ else
1182
+ {
1183
+ straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
1184
+ }
1185
+ res.scatterPacket(i, j2, R);
1186
+ }
1187
+ };
1188
+
1189
+ template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
1190
+ struct lhs_process_one_packet
1191
+ {
1192
+ typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
1193
+
1194
+ EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
1195
+ {
1196
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
1197
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
1198
+ traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
1199
+ traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
1200
+ traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
1201
+ traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
1202
+ traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
1203
+ traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
1204
+ #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
1205
+ __asm__ ("" : "+x,m" (*A0));
1206
+ #endif
1207
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
1208
+ }
1209
+
1210
+ EIGEN_STRONG_INLINE void operator()(
1211
+ const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
1212
+ Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB,
1213
+ int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
1214
+ {
1215
+ GEBPTraits traits;
1216
+
1217
+ // loops on each largest micro horizontal panel of lhs
1218
+ // (LhsProgress x depth)
1219
+ for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
1220
+ {
1221
+ // loops on each largest micro vertical panel of rhs (depth * nr)
1222
+ for(Index j2=0; j2<packet_cols4; j2+=nr)
1223
+ {
1224
+ // We select a LhsProgress x nr micro block of res
1225
+ // which is entirely stored into 1 x nr registers.
1226
+
1227
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
1228
+ prefetch(&blA[0]);
1229
+
1230
+ // gets res block as register
1231
+ AccPacket C0, C1, C2, C3;
1232
+ traits.initAcc(C0);
1233
+ traits.initAcc(C1);
1234
+ traits.initAcc(C2);
1235
+ traits.initAcc(C3);
1236
+ // To improve instruction pipelining, let's double the accumulation registers:
1237
+ // even k will accumulate in C*, while odd k will accumulate in D*.
1238
+ // This trick is crucial to get good performance with FMA, otherwise it is
1239
+ // actually faster to perform separated MUL+ADD because of a naturally
1240
+ // better instruction-level parallelism.
1241
+ AccPacket D0, D1, D2, D3;
1242
+ traits.initAcc(D0);
1243
+ traits.initAcc(D1);
1244
+ traits.initAcc(D2);
1245
+ traits.initAcc(D3);
1246
+
1247
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1248
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1249
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1250
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1251
+
1252
+ r0.prefetch(prefetch_res_offset);
1253
+ r1.prefetch(prefetch_res_offset);
1254
+ r2.prefetch(prefetch_res_offset);
1255
+ r3.prefetch(prefetch_res_offset);
1256
+
1257
+ // performs "inner" products
1258
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1259
+ prefetch(&blB[0]);
1260
+ LhsPacket A0, A1;
1261
+
1262
+ for(Index k=0; k<peeled_kc; k+=pk)
1263
+ {
1264
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
1265
+ RhsPacketx4 rhs_panel;
1266
+ RhsPacket T0;
1267
+
1268
+ internal::prefetch(blB+(48+0));
1269
+ peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1270
+ peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1271
+ peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1272
+ peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1273
+ internal::prefetch(blB+(48+16));
1274
+ peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1275
+ peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1276
+ peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1277
+ peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1278
+
1279
+ blB += pk*4*RhsProgress;
1280
+ blA += pk*LhsProgress;
1281
+
1282
+ EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
1283
+ }
1284
+ C0 = padd(C0,D0);
1285
+ C1 = padd(C1,D1);
1286
+ C2 = padd(C2,D2);
1287
+ C3 = padd(C3,D3);
1288
+
1289
+ // process the remaining depth iterations not covered by the peeled loop
1290
+ for(Index k=peeled_kc; k<depth; k++)
1291
+ {
1292
+ RhsPacketx4 rhs_panel;
1293
+ RhsPacket T0;
1294
+ peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1295
+ blB += 4*RhsProgress;
1296
+ blA += LhsProgress;
1297
+ }
1298
+
1299
+ ResPacket R0, R1;
1300
+ ResPacket alphav = pset1<ResPacket>(alpha);
1301
+
1302
+ R0 = r0.template loadPacket<ResPacket>(0);
1303
+ R1 = r1.template loadPacket<ResPacket>(0);
1304
+ traits.acc(C0, alphav, R0);
1305
+ traits.acc(C1, alphav, R1);
1306
+ r0.storePacket(0, R0);
1307
+ r1.storePacket(0, R1);
1308
+
1309
+ R0 = r2.template loadPacket<ResPacket>(0);
1310
+ R1 = r3.template loadPacket<ResPacket>(0);
1311
+ traits.acc(C2, alphav, R0);
1312
+ traits.acc(C3, alphav, R1);
1313
+ r2.storePacket(0, R0);
1314
+ r3.storePacket(0, R1);
1315
+ }
1316
+
1317
+ // Deal with remaining columns of the rhs
1318
+ for(Index j2=packet_cols4; j2<cols; j2++)
1319
+ {
1320
+ // One column at a time
1321
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
1322
+ prefetch(&blA[0]);
1323
+
1324
+ // gets res block as register
1325
+ AccPacket C0;
1326
+ traits.initAcc(C0);
1327
+
1328
+ LinearMapper r0 = res.getLinearMapper(i, j2);
1329
+
1330
+ // performs "inner" products
1331
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1332
+ LhsPacket A0;
1333
+
1334
+ for(Index k= 0; k<peeled_kc; k+=pk)
1335
+ {
1336
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
1337
+ RhsPacket B_0;
1338
+
1339
+ #define EIGEN_GEBGP_ONESTEP(K) \
1340
+ do { \
1341
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
1342
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1343
+ /* FIXME: why unaligned???? */ \
1344
+ traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
1345
+ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1346
+ traits.madd(A0, B_0, C0, B_0, fix<0>); \
1347
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
1348
+ } while(false);
1349
+
1350
+ EIGEN_GEBGP_ONESTEP(0);
1351
+ EIGEN_GEBGP_ONESTEP(1);
1352
+ EIGEN_GEBGP_ONESTEP(2);
1353
+ EIGEN_GEBGP_ONESTEP(3);
1354
+ EIGEN_GEBGP_ONESTEP(4);
1355
+ EIGEN_GEBGP_ONESTEP(5);
1356
+ EIGEN_GEBGP_ONESTEP(6);
1357
+ EIGEN_GEBGP_ONESTEP(7);
1358
+
1359
+ blB += pk*RhsProgress;
1360
+ blA += pk*LhsProgress;
1361
+
1362
+ EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
1363
+ }
1364
+
1365
+ // process the remaining depth iterations not covered by the peeled loop
1366
+ for(Index k=peeled_kc; k<depth; k++)
1367
+ {
1368
+ RhsPacket B_0;
1369
+ EIGEN_GEBGP_ONESTEP(0);
1370
+ blB += RhsProgress;
1371
+ blA += LhsProgress;
1372
+ }
1373
+ #undef EIGEN_GEBGP_ONESTEP
1374
+ ResPacket R0;
1375
+ ResPacket alphav = pset1<ResPacket>(alpha);
1376
+ R0 = r0.template loadPacket<ResPacket>(0);
1377
+ traits.acc(C0, alphav, R0);
1378
+ r0.storePacket(0, R0);
1379
+ }
1380
+ }
1381
+ }
1382
+ };
1383
+
1384
+ template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
1385
+ struct lhs_process_fraction_of_packet : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
1386
+ {
1387
+
1388
+ EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
1389
+ {
1390
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
1391
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
1392
+ traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
1393
+ traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
1394
+ traits.madd(*A0, *B_0, *C0, *B_0);
1395
+ traits.madd(*A0, *B1, *C1, *B1);
1396
+ traits.madd(*A0, *B2, *C2, *B2);
1397
+ traits.madd(*A0, *B3, *C3, *B3);
1398
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
1399
+ }
1400
+ };
1401
+
891
1402
  template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
892
1403
  EIGEN_DONT_INLINE
893
1404
  void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
@@ -904,10 +1415,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
904
1415
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
905
1416
  const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
906
1417
  const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
907
- const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
1418
+ const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
1419
+ const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
1420
+ const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
908
1421
  enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
909
1422
  const Index peeled_kc = depth & ~(pk-1);
910
- const Index prefetch_res_offset = 32/sizeof(ResScalar);
1423
+ const int prefetch_res_offset = 32/sizeof(ResScalar);
911
1424
  // const Index depth2 = depth & ~1;
912
1425
 
913
1426
  //---------- Process 3 * LhsProgress rows at once ----------
@@ -965,36 +1478,48 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
965
1478
  for(Index k=0; k<peeled_kc; k+=pk)
966
1479
  {
967
1480
  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
968
- RhsPacket B_0, T0;
1481
+ // 15 registers are taken (12 for acc, 3 for lhs).
1482
+ RhsPanel15 rhs_panel;
1483
+ RhsPacket T0;
969
1484
  LhsPacket A2;
970
-
971
- #define EIGEN_GEBP_ONESTEP(K) \
972
- do { \
973
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
1485
+ #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
1486
+ // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
1487
+ // without this workaround A0, A1, and A2 are loaded in the same register,
1488
+ // which is not good for pipelining
1489
+ #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
1490
+ #else
1491
+ #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
1492
+ #endif
1493
+ #define EIGEN_GEBP_ONESTEP(K) \
1494
+ do { \
1495
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
974
1496
  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
975
- internal::prefetch(blA+(3*K+16)*LhsProgress); \
976
- if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
977
- traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
978
- traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
979
- traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
980
- traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
981
- traits.madd(A0, B_0, C0, T0); \
982
- traits.madd(A1, B_0, C4, T0); \
983
- traits.madd(A2, B_0, C8, B_0); \
984
- traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
985
- traits.madd(A0, B_0, C1, T0); \
986
- traits.madd(A1, B_0, C5, T0); \
987
- traits.madd(A2, B_0, C9, B_0); \
988
- traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
989
- traits.madd(A0, B_0, C2, T0); \
990
- traits.madd(A1, B_0, C6, T0); \
991
- traits.madd(A2, B_0, C10, B_0); \
992
- traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
993
- traits.madd(A0, B_0, C3 , T0); \
994
- traits.madd(A1, B_0, C7, T0); \
995
- traits.madd(A2, B_0, C11, B_0); \
996
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
997
- } while(false)
1497
+ internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
1498
+ if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
1499
+ internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
1500
+ } /* Bug 953 */ \
1501
+ traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1502
+ traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1503
+ traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1504
+ EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
1505
+ traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \
1506
+ traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1507
+ traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1508
+ traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
1509
+ traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \
1510
+ traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1511
+ traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1512
+ traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
1513
+ traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \
1514
+ traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1515
+ traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1516
+ traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
1517
+ traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \
1518
+ traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1519
+ traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1520
+ traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
1521
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
1522
+ } while (false)
998
1523
 
999
1524
  internal::prefetch(blB);
1000
1525
  EIGEN_GEBP_ONESTEP(0);
@@ -1014,7 +1539,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1014
1539
  // process remaining peeled loop
1015
1540
  for(Index k=peeled_kc; k<depth; k++)
1016
1541
  {
1017
- RhsPacket B_0, T0;
1542
+ RhsPanel15 rhs_panel;
1543
+ RhsPacket T0;
1018
1544
  LhsPacket A2;
1019
1545
  EIGEN_GEBP_ONESTEP(0);
1020
1546
  blB += 4*RhsProgress;
@@ -1026,9 +1552,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1026
1552
  ResPacket R0, R1, R2;
1027
1553
  ResPacket alphav = pset1<ResPacket>(alpha);
1028
1554
 
1029
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1030
- R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1031
- R2 = r0.loadPacket(2 * Traits::ResPacketSize);
1555
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1556
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1557
+ R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1032
1558
  traits.acc(C0, alphav, R0);
1033
1559
  traits.acc(C4, alphav, R1);
1034
1560
  traits.acc(C8, alphav, R2);
@@ -1036,9 +1562,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1036
1562
  r0.storePacket(1 * Traits::ResPacketSize, R1);
1037
1563
  r0.storePacket(2 * Traits::ResPacketSize, R2);
1038
1564
 
1039
- R0 = r1.loadPacket(0 * Traits::ResPacketSize);
1040
- R1 = r1.loadPacket(1 * Traits::ResPacketSize);
1041
- R2 = r1.loadPacket(2 * Traits::ResPacketSize);
1565
+ R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1566
+ R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1567
+ R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1042
1568
  traits.acc(C1, alphav, R0);
1043
1569
  traits.acc(C5, alphav, R1);
1044
1570
  traits.acc(C9, alphav, R2);
@@ -1046,9 +1572,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1046
1572
  r1.storePacket(1 * Traits::ResPacketSize, R1);
1047
1573
  r1.storePacket(2 * Traits::ResPacketSize, R2);
1048
1574
 
1049
- R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1050
- R1 = r2.loadPacket(1 * Traits::ResPacketSize);
1051
- R2 = r2.loadPacket(2 * Traits::ResPacketSize);
1575
+ R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1576
+ R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1577
+ R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1052
1578
  traits.acc(C2, alphav, R0);
1053
1579
  traits.acc(C6, alphav, R1);
1054
1580
  traits.acc(C10, alphav, R2);
@@ -1056,9 +1582,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1056
1582
  r2.storePacket(1 * Traits::ResPacketSize, R1);
1057
1583
  r2.storePacket(2 * Traits::ResPacketSize, R2);
1058
1584
 
1059
- R0 = r3.loadPacket(0 * Traits::ResPacketSize);
1060
- R1 = r3.loadPacket(1 * Traits::ResPacketSize);
1061
- R2 = r3.loadPacket(2 * Traits::ResPacketSize);
1585
+ R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1586
+ R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1587
+ R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1062
1588
  traits.acc(C3, alphav, R0);
1063
1589
  traits.acc(C7, alphav, R1);
1064
1590
  traits.acc(C11, alphav, R2);
@@ -1094,20 +1620,20 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1094
1620
  {
1095
1621
  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
1096
1622
  RhsPacket B_0;
1097
- #define EIGEN_GEBGP_ONESTEP(K) \
1098
- do { \
1099
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1623
+ #define EIGEN_GEBGP_ONESTEP(K) \
1624
+ do { \
1625
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1100
1626
  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1101
- traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
1102
- traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
1103
- traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
1104
- traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1105
- traits.madd(A0, B_0, C0, B_0); \
1106
- traits.madd(A1, B_0, C4, B_0); \
1107
- traits.madd(A2, B_0, C8, B_0); \
1108
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1109
- } while(false)
1110
-
1627
+ traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1628
+ traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1629
+ traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1630
+ traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
1631
+ traits.madd(A0, B_0, C0, B_0, fix<0>); \
1632
+ traits.madd(A1, B_0, C4, B_0, fix<0>); \
1633
+ traits.madd(A2, B_0, C8, B_0, fix<0>); \
1634
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1635
+ } while (false)
1636
+
1111
1637
  EIGEN_GEBGP_ONESTEP(0);
1112
1638
  EIGEN_GEBGP_ONESTEP(1);
1113
1639
  EIGEN_GEBGP_ONESTEP(2);
@@ -1117,8 +1643,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1117
1643
  EIGEN_GEBGP_ONESTEP(6);
1118
1644
  EIGEN_GEBGP_ONESTEP(7);
1119
1645
 
1120
- blB += pk*RhsProgress;
1121
- blA += pk*3*Traits::LhsProgress;
1646
+ blB += int(pk) * int(RhsProgress);
1647
+ blA += int(pk) * 3 * int(Traits::LhsProgress);
1122
1648
 
1123
1649
  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
1124
1650
  }
@@ -1135,9 +1661,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1135
1661
  ResPacket R0, R1, R2;
1136
1662
  ResPacket alphav = pset1<ResPacket>(alpha);
1137
1663
 
1138
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1139
- R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1140
- R2 = r0.loadPacket(2 * Traits::ResPacketSize);
1664
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1665
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1666
+ R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1141
1667
  traits.acc(C0, alphav, R0);
1142
1668
  traits.acc(C4, alphav, R1);
1143
1669
  traits.acc(C8, alphav, R2);
@@ -1196,7 +1722,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1196
1722
  for(Index k=0; k<peeled_kc; k+=pk)
1197
1723
  {
1198
1724
  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
1199
- RhsPacket B_0, B1, B2, B3, T0;
1725
+ RhsPacketx4 rhs_panel;
1726
+ RhsPacket T0;
1200
1727
 
1201
1728
  // NOTE: the begin/end asm comments below work around bug 935!
1202
1729
  // but they are not enough for gcc>=6 without FMA (bug 1637)
@@ -1205,24 +1732,24 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1205
1732
  #else
1206
1733
  #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
1207
1734
  #endif
1208
- #define EIGEN_GEBGP_ONESTEP(K) \
1209
- do { \
1210
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
1211
- traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1212
- traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1213
- traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1214
- traits.madd(A0, B_0, C0, T0); \
1215
- traits.madd(A1, B_0, C4, B_0); \
1216
- traits.madd(A0, B1, C1, T0); \
1217
- traits.madd(A1, B1, C5, B1); \
1218
- traits.madd(A0, B2, C2, T0); \
1219
- traits.madd(A1, B2, C6, B2); \
1220
- traits.madd(A0, B3, C3, T0); \
1221
- traits.madd(A1, B3, C7, B3); \
1222
- EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
1223
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
1224
- } while(false)
1225
-
1735
+ #define EIGEN_GEBGP_ONESTEP(K) \
1736
+ do { \
1737
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
1738
+ traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
1739
+ traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
1740
+ traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
1741
+ traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1742
+ traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1743
+ traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1744
+ traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1745
+ traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1746
+ traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1747
+ traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1748
+ traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1749
+ EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
1750
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
1751
+ } while (false)
1752
+
1226
1753
  internal::prefetch(blB+(48+0));
1227
1754
  EIGEN_GEBGP_ONESTEP(0);
1228
1755
  EIGEN_GEBGP_ONESTEP(1);
@@ -1242,7 +1769,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1242
1769
  // process remaining peeled loop
1243
1770
  for(Index k=peeled_kc; k<depth; k++)
1244
1771
  {
1245
- RhsPacket B_0, B1, B2, B3, T0;
1772
+ RhsPacketx4 rhs_panel;
1773
+ RhsPacket T0;
1246
1774
  EIGEN_GEBGP_ONESTEP(0);
1247
1775
  blB += 4*RhsProgress;
1248
1776
  blA += 2*Traits::LhsProgress;
@@ -1252,10 +1780,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1252
1780
  ResPacket R0, R1, R2, R3;
1253
1781
  ResPacket alphav = pset1<ResPacket>(alpha);
1254
1782
 
1255
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1256
- R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1257
- R2 = r1.loadPacket(0 * Traits::ResPacketSize);
1258
- R3 = r1.loadPacket(1 * Traits::ResPacketSize);
1783
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1784
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1785
+ R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1786
+ R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1259
1787
  traits.acc(C0, alphav, R0);
1260
1788
  traits.acc(C4, alphav, R1);
1261
1789
  traits.acc(C1, alphav, R2);
@@ -1265,10 +1793,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1265
1793
  r1.storePacket(0 * Traits::ResPacketSize, R2);
1266
1794
  r1.storePacket(1 * Traits::ResPacketSize, R3);
1267
1795
 
1268
- R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1269
- R1 = r2.loadPacket(1 * Traits::ResPacketSize);
1270
- R2 = r3.loadPacket(0 * Traits::ResPacketSize);
1271
- R3 = r3.loadPacket(1 * Traits::ResPacketSize);
1796
+ R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1797
+ R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1798
+ R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1799
+ R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1272
1800
  traits.acc(C2, alphav, R0);
1273
1801
  traits.acc(C6, alphav, R1);
1274
1802
  traits.acc(C3, alphav, R2);
@@ -1313,8 +1841,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1313
1841
  traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1314
1842
  traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1315
1843
  traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1316
- traits.madd(A0, B_0, C0, B1); \
1317
- traits.madd(A1, B_0, C4, B_0); \
1844
+ traits.madd(A0, B_0, C0, B1, fix<0>); \
1845
+ traits.madd(A1, B_0, C4, B_0, fix<0>); \
1318
1846
  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
1319
1847
  } while(false)
1320
1848
 
@@ -1327,8 +1855,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1327
1855
  EIGEN_GEBGP_ONESTEP(6);
1328
1856
  EIGEN_GEBGP_ONESTEP(7);
1329
1857
 
1330
- blB += pk*RhsProgress;
1331
- blA += pk*2*Traits::LhsProgress;
1858
+ blB += int(pk) * int(RhsProgress);
1859
+ blA += int(pk) * 2 * int(Traits::LhsProgress);
1332
1860
 
1333
1861
  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
1334
1862
  }
@@ -1345,8 +1873,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1345
1873
  ResPacket R0, R1;
1346
1874
  ResPacket alphav = pset1<ResPacket>(alpha);
1347
1875
 
1348
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1349
- R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1876
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1877
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1350
1878
  traits.acc(C0, alphav, R0);
1351
1879
  traits.acc(C4, alphav, R1);
1352
1880
  r0.storePacket(0 * Traits::ResPacketSize, R0);
@@ -1358,186 +1886,43 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1358
1886
  //---------- Process 1 * LhsProgress rows at once ----------
1359
1887
  if(mr>=1*Traits::LhsProgress)
1360
1888
  {
1361
- // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
1362
- for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
1363
- {
1364
- // loops on each largest micro vertical panel of rhs (depth * nr)
1365
- for(Index j2=0; j2<packet_cols4; j2+=nr)
1366
- {
1367
- // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely
1368
- // stored into 1 x nr registers.
1369
-
1370
- const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
1371
- prefetch(&blA[0]);
1372
-
1373
- // gets res block as register
1374
- AccPacket C0, C1, C2, C3;
1375
- traits.initAcc(C0);
1376
- traits.initAcc(C1);
1377
- traits.initAcc(C2);
1378
- traits.initAcc(C3);
1379
-
1380
- LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1381
- LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1382
- LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1383
- LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1384
-
1385
- r0.prefetch(prefetch_res_offset);
1386
- r1.prefetch(prefetch_res_offset);
1387
- r2.prefetch(prefetch_res_offset);
1388
- r3.prefetch(prefetch_res_offset);
1389
-
1390
- // performs "inner" products
1391
- const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1392
- prefetch(&blB[0]);
1393
- LhsPacket A0;
1394
-
1395
- for(Index k=0; k<peeled_kc; k+=pk)
1396
- {
1397
- EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
1398
- RhsPacket B_0, B1, B2, B3;
1399
-
1400
- #define EIGEN_GEBGP_ONESTEP(K) \
1401
- do { \
1402
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
1403
- EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1404
- traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1405
- traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1406
- traits.madd(A0, B_0, C0, B_0); \
1407
- traits.madd(A0, B1, C1, B1); \
1408
- traits.madd(A0, B2, C2, B2); \
1409
- traits.madd(A0, B3, C3, B3); \
1410
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
1411
- } while(false)
1412
-
1413
- internal::prefetch(blB+(48+0));
1414
- EIGEN_GEBGP_ONESTEP(0);
1415
- EIGEN_GEBGP_ONESTEP(1);
1416
- EIGEN_GEBGP_ONESTEP(2);
1417
- EIGEN_GEBGP_ONESTEP(3);
1418
- internal::prefetch(blB+(48+16));
1419
- EIGEN_GEBGP_ONESTEP(4);
1420
- EIGEN_GEBGP_ONESTEP(5);
1421
- EIGEN_GEBGP_ONESTEP(6);
1422
- EIGEN_GEBGP_ONESTEP(7);
1423
-
1424
- blB += pk*4*RhsProgress;
1425
- blA += pk*1*LhsProgress;
1426
-
1427
- EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
1428
- }
1429
- // process remaining peeled loop
1430
- for(Index k=peeled_kc; k<depth; k++)
1431
- {
1432
- RhsPacket B_0, B1, B2, B3;
1433
- EIGEN_GEBGP_ONESTEP(0);
1434
- blB += 4*RhsProgress;
1435
- blA += 1*LhsProgress;
1436
- }
1437
- #undef EIGEN_GEBGP_ONESTEP
1438
-
1439
- ResPacket R0, R1;
1440
- ResPacket alphav = pset1<ResPacket>(alpha);
1441
-
1442
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1443
- R1 = r1.loadPacket(0 * Traits::ResPacketSize);
1444
- traits.acc(C0, alphav, R0);
1445
- traits.acc(C1, alphav, R1);
1446
- r0.storePacket(0 * Traits::ResPacketSize, R0);
1447
- r1.storePacket(0 * Traits::ResPacketSize, R1);
1448
-
1449
- R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1450
- R1 = r3.loadPacket(0 * Traits::ResPacketSize);
1451
- traits.acc(C2, alphav, R0);
1452
- traits.acc(C3, alphav, R1);
1453
- r2.storePacket(0 * Traits::ResPacketSize, R0);
1454
- r3.storePacket(0 * Traits::ResPacketSize, R1);
1455
- }
1456
-
1457
- // Deal with remaining columns of the rhs
1458
- for(Index j2=packet_cols4; j2<cols; j2++)
1459
- {
1460
- // One column at a time
1461
- const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
1462
- prefetch(&blA[0]);
1463
-
1464
- // gets res block as register
1465
- AccPacket C0;
1466
- traits.initAcc(C0);
1467
-
1468
- LinearMapper r0 = res.getLinearMapper(i, j2);
1469
-
1470
- // performs "inner" products
1471
- const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1472
- LhsPacket A0;
1473
-
1474
- for(Index k=0; k<peeled_kc; k+=pk)
1475
- {
1476
- EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
1477
- RhsPacket B_0;
1478
-
1479
- #define EIGEN_GEBGP_ONESTEP(K) \
1480
- do { \
1481
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
1482
- EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1483
- traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1484
- traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1485
- traits.madd(A0, B_0, C0, B_0); \
1486
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
1487
- } while(false);
1488
-
1489
- EIGEN_GEBGP_ONESTEP(0);
1490
- EIGEN_GEBGP_ONESTEP(1);
1491
- EIGEN_GEBGP_ONESTEP(2);
1492
- EIGEN_GEBGP_ONESTEP(3);
1493
- EIGEN_GEBGP_ONESTEP(4);
1494
- EIGEN_GEBGP_ONESTEP(5);
1495
- EIGEN_GEBGP_ONESTEP(6);
1496
- EIGEN_GEBGP_ONESTEP(7);
1497
-
1498
- blB += pk*RhsProgress;
1499
- blA += pk*1*Traits::LhsProgress;
1500
-
1501
- EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
1502
- }
1503
-
1504
- // process remaining peeled loop
1505
- for(Index k=peeled_kc; k<depth; k++)
1506
- {
1507
- RhsPacket B_0;
1508
- EIGEN_GEBGP_ONESTEP(0);
1509
- blB += RhsProgress;
1510
- blA += 1*Traits::LhsProgress;
1511
- }
1512
- #undef EIGEN_GEBGP_ONESTEP
1513
- ResPacket R0;
1514
- ResPacket alphav = pset1<ResPacket>(alpha);
1515
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1516
- traits.acc(C0, alphav, R0);
1517
- r0.storePacket(0 * Traits::ResPacketSize, R0);
1518
- }
1519
- }
1889
+ lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
1890
+ p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1891
+ }
1892
+ //---------- Process LhsProgressHalf rows at once ----------
1893
+ if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
1894
+ {
1895
+ lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
1896
+ p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1897
+ }
1898
+ //---------- Process LhsProgressQuarter rows at once ----------
1899
+ if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
1900
+ {
1901
+ lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
1902
+ p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1520
1903
  }
1521
1904
  //---------- Process remaining rows, 1 at once ----------
1522
- if(peeled_mc1<rows)
1905
+ if(peeled_mc_quarter<rows)
1523
1906
  {
1524
1907
  // loop on each panel of the rhs
1525
1908
  for(Index j2=0; j2<packet_cols4; j2+=nr)
1526
1909
  {
1527
1910
  // loop on each row of the lhs (1*LhsProgress x depth)
1528
- for(Index i=peeled_mc1; i<rows; i+=1)
1911
+ for(Index i=peeled_mc_quarter; i<rows; i+=1)
1529
1912
  {
1530
1913
  const LhsScalar* blA = &blockA[i*strideA+offsetA];
1531
1914
  prefetch(&blA[0]);
1532
1915
  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1533
1916
 
1534
- // The following piece of code wont work for 512 bit registers
1535
- // Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
1536
- // as nr (which is currently 4) for the return type.
1917
+ // If LhsProgress is 8 or 16, it assumes that there is a
1918
+ // half or quarter packet, respectively, of the same size as
1919
+ // nr (which is currently 4) for the return type.
1537
1920
  const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
1921
+ const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
1538
1922
  if ((SwappedTraits::LhsProgress % 4) == 0 &&
1539
- (SwappedTraits::LhsProgress <= 8) &&
1540
- (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr))
1923
+ (SwappedTraits::LhsProgress<=16) &&
1924
+ (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) &&
1925
+ (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
1541
1926
  {
1542
1927
  SAccPacket C0, C1, C2, C3;
1543
1928
  straits.initAcc(C0);
@@ -1560,15 +1945,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1560
1945
 
1561
1946
  straits.loadRhsQuad(blA+0*spk, B_0);
1562
1947
  straits.loadRhsQuad(blA+1*spk, B_1);
1563
- straits.madd(A0,B_0,C0,B_0);
1564
- straits.madd(A1,B_1,C1,B_1);
1948
+ straits.madd(A0,B_0,C0,B_0, fix<0>);
1949
+ straits.madd(A1,B_1,C1,B_1, fix<0>);
1565
1950
 
1566
1951
  straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
1567
1952
  straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
1568
1953
  straits.loadRhsQuad(blA+2*spk, B_0);
1569
1954
  straits.loadRhsQuad(blA+3*spk, B_1);
1570
- straits.madd(A0,B_0,C2,B_0);
1571
- straits.madd(A1,B_1,C3,B_1);
1955
+ straits.madd(A0,B_0,C2,B_0, fix<0>);
1956
+ straits.madd(A1,B_1,C3,B_1, fix<0>);
1572
1957
 
1573
1958
  blB += 4*SwappedTraits::LhsProgress;
1574
1959
  blA += 4*spk;
@@ -1581,7 +1966,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1581
1966
 
1582
1967
  straits.loadLhsUnaligned(blB, A0);
1583
1968
  straits.loadRhsQuad(blA, B_0);
1584
- straits.madd(A0,B_0,C0,B_0);
1969
+ straits.madd(A0,B_0,C0,B_0, fix<0>);
1585
1970
 
1586
1971
  blB += SwappedTraits::LhsProgress;
1587
1972
  blA += spk;
@@ -1591,7 +1976,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1591
1976
  // Special case where we have to first reduce the accumulation register C0
1592
1977
  typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
1593
1978
  typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
1594
- typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
1979
+ typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
1595
1980
  typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
1596
1981
 
1597
1982
  SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
@@ -1604,16 +1989,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1604
1989
  SRhsPacketHalf b0;
1605
1990
  straits.loadLhsUnaligned(blB, a0);
1606
1991
  straits.loadRhs(blA, b0);
1607
- SAccPacketHalf c0 = predux_downto4(C0);
1608
- straits.madd(a0,b0,c0,b0);
1992
+ SAccPacketHalf c0 = predux_half_dowto4(C0);
1993
+ straits.madd(a0,b0,c0,b0, fix<0>);
1609
1994
  straits.acc(c0, alphav, R);
1610
1995
  }
1611
1996
  else
1612
1997
  {
1613
- straits.acc(predux_downto4(C0), alphav, R);
1998
+ straits.acc(predux_half_dowto4(C0), alphav, R);
1614
1999
  }
1615
2000
  res.scatterPacket(i, j2, R);
1616
2001
  }
2002
+ else if (SwappedTraits::LhsProgress==16)
2003
+ {
2004
+ // Special case where we have to first reduce the
2005
+ // accumulation register C0. We specialize the block in
2006
+ // template form, so that LhsProgress < 16 paths don't
2007
+ // fail to compile
2008
+ last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
2009
+ p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0);
2010
+ }
1617
2011
  else
1618
2012
  {
1619
2013
  SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
@@ -1636,14 +2030,14 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1636
2030
 
1637
2031
  B_0 = blB[0];
1638
2032
  B_1 = blB[1];
1639
- CJMADD(cj,A0,B_0,C0, B_0);
1640
- CJMADD(cj,A0,B_1,C1, B_1);
1641
-
2033
+ C0 = cj.pmadd(A0,B_0,C0);
2034
+ C1 = cj.pmadd(A0,B_1,C1);
2035
+
1642
2036
  B_0 = blB[2];
1643
2037
  B_1 = blB[3];
1644
- CJMADD(cj,A0,B_0,C2, B_0);
1645
- CJMADD(cj,A0,B_1,C3, B_1);
1646
-
2038
+ C2 = cj.pmadd(A0,B_0,C2);
2039
+ C3 = cj.pmadd(A0,B_1,C3);
2040
+
1647
2041
  blB += 4;
1648
2042
  }
1649
2043
  res(i, j2 + 0) += alpha * C0;
@@ -1657,7 +2051,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1657
2051
  for(Index j2=packet_cols4; j2<cols; j2++)
1658
2052
  {
1659
2053
  // loop on each row of the lhs (1*LhsProgress x depth)
1660
- for(Index i=peeled_mc1; i<rows; i+=1)
2054
+ for(Index i=peeled_mc_quarter; i<rows; i+=1)
1661
2055
  {
1662
2056
  const LhsScalar* blA = &blockA[i*strideA+offsetA];
1663
2057
  prefetch(&blA[0]);
@@ -1668,7 +2062,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1668
2062
  {
1669
2063
  LhsScalar A0 = blA[k];
1670
2064
  RhsScalar B_0 = blB[k];
1671
- CJMADD(cj, A0, B_0, C0, B_0);
2065
+ C0 = cj.pmadd(A0, B_0, C0);
1672
2066
  }
1673
2067
  res(i, j2) += alpha * C0;
1674
2068
  }
@@ -1677,8 +2071,6 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1677
2071
  }
1678
2072
 
1679
2073
 
1680
- #undef CJMADD
1681
-
1682
2074
  // pack a block of the lhs
1683
2075
  // The traversal is as follows (mr==4):
1684
2076
  // 0 4 8 12 ...
@@ -1693,19 +2085,24 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1693
2085
  //
1694
2086
  // 32 33 34 35 ...
1695
2087
  // 36 37 38 39 ...
1696
- template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1697
- struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
2088
+ template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2089
+ struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
1698
2090
  {
1699
2091
  typedef typename DataMapper::LinearMapper LinearMapper;
1700
2092
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
1701
2093
  };
1702
2094
 
1703
- template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1704
- EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
2095
+ template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2096
+ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
1705
2097
  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
1706
2098
  {
1707
- typedef typename packet_traits<Scalar>::type Packet;
1708
- enum { PacketSize = packet_traits<Scalar>::size };
2099
+ typedef typename unpacket_traits<Packet>::half HalfPacket;
2100
+ typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2101
+ enum { PacketSize = unpacket_traits<Packet>::size,
2102
+ HalfPacketSize = unpacket_traits<HalfPacket>::size,
2103
+ QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2104
+ HasHalf = (int)HalfPacketSize < (int)PacketSize,
2105
+ HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
1709
2106
 
1710
2107
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
1711
2108
  EIGEN_UNUSED_VARIABLE(stride);
@@ -1717,9 +2114,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
1717
2114
 
1718
2115
  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
1719
2116
  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
1720
- const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
1721
- const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
1722
- : Pack2>1 ? (rows/Pack2)*Pack2 : 0;
2117
+ const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
2118
+ const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
2119
+ const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
2120
+ const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
2121
+ const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
2122
+ : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;
1723
2123
 
1724
2124
  Index i=0;
1725
2125
 
@@ -1733,9 +2133,9 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
1733
2133
  for(Index k=0; k<depth; k++)
1734
2134
  {
1735
2135
  Packet A, B, C;
1736
- A = lhs.loadPacket(i+0*PacketSize, k);
1737
- B = lhs.loadPacket(i+1*PacketSize, k);
1738
- C = lhs.loadPacket(i+2*PacketSize, k);
2136
+ A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2137
+ B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
2138
+ C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
1739
2139
  pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
1740
2140
  pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
1741
2141
  pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
@@ -1753,8 +2153,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
1753
2153
  for(Index k=0; k<depth; k++)
1754
2154
  {
1755
2155
  Packet A, B;
1756
- A = lhs.loadPacket(i+0*PacketSize, k);
1757
- B = lhs.loadPacket(i+1*PacketSize, k);
2156
+ A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2157
+ B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
1758
2158
  pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
1759
2159
  pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
1760
2160
  }
@@ -1771,27 +2171,67 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
1771
2171
  for(Index k=0; k<depth; k++)
1772
2172
  {
1773
2173
  Packet A;
1774
- A = lhs.loadPacket(i+0*PacketSize, k);
2174
+ A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
1775
2175
  pstore(blockA+count, cj.pconj(A));
1776
2176
  count+=PacketSize;
1777
2177
  }
1778
2178
  if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
1779
2179
  }
1780
2180
  }
1781
- // Pack scalars
2181
+ // Pack half packets
2182
+ if(HasHalf && Pack1>=HalfPacketSize)
2183
+ {
2184
+ for(; i<peeled_mc_half; i+=HalfPacketSize)
2185
+ {
2186
+ if(PanelMode) count += (HalfPacketSize) * offset;
2187
+
2188
+ for(Index k=0; k<depth; k++)
2189
+ {
2190
+ HalfPacket A;
2191
+ A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
2192
+ pstoreu(blockA+count, cj.pconj(A));
2193
+ count+=HalfPacketSize;
2194
+ }
2195
+ if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
2196
+ }
2197
+ }
2198
+ // Pack quarter packets
2199
+ if(HasQuarter && Pack1>=QuarterPacketSize)
2200
+ {
2201
+ for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
2202
+ {
2203
+ if(PanelMode) count += (QuarterPacketSize) * offset;
2204
+
2205
+ for(Index k=0; k<depth; k++)
2206
+ {
2207
+ QuarterPacket A;
2208
+ A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
2209
+ pstoreu(blockA+count, cj.pconj(A));
2210
+ count+=QuarterPacketSize;
2211
+ }
2212
+ if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
2213
+ }
2214
+ }
2215
+ // Pack2 may be *smaller* than PacketSize—that happens for
2216
+ // products like real * complex, where we have to go half the
2217
+ // progress on the lhs in order to duplicate those operands to
2218
+ // address both real & imaginary parts on the rhs. This portion will
2219
+ // pack those half ones until they match the number expected on the
2220
+ // last peeling loop at this point (for the rhs).
1782
2221
  if(Pack2<PacketSize && Pack2>1)
1783
2222
  {
1784
- for(; i<peeled_mc0; i+=Pack2)
2223
+ for(; i<peeled_mc0; i+=last_lhs_progress)
1785
2224
  {
1786
- if(PanelMode) count += Pack2 * offset;
2225
+ if(PanelMode) count += last_lhs_progress * offset;
1787
2226
 
1788
2227
  for(Index k=0; k<depth; k++)
1789
- for(Index w=0; w<Pack2; w++)
2228
+ for(Index w=0; w<last_lhs_progress; w++)
1790
2229
  blockA[count++] = cj(lhs(i+w, k));
1791
2230
 
1792
- if(PanelMode) count += Pack2 * (stride-offset-depth);
2231
+ if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
1793
2232
  }
1794
2233
  }
2234
+ // Pack scalars
1795
2235
  for(; i<rows; i++)
1796
2236
  {
1797
2237
  if(PanelMode) count += offset;
@@ -1801,19 +2241,24 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
1801
2241
  }
1802
2242
  }
1803
2243
 
1804
- template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1805
- struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
2244
+ template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2245
+ struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
1806
2246
  {
1807
2247
  typedef typename DataMapper::LinearMapper LinearMapper;
1808
2248
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
1809
2249
  };
1810
2250
 
1811
- template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1812
- EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
2251
+ template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2252
+ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
1813
2253
  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
1814
2254
  {
1815
- typedef typename packet_traits<Scalar>::type Packet;
1816
- enum { PacketSize = packet_traits<Scalar>::size };
2255
+ typedef typename unpacket_traits<Packet>::half HalfPacket;
2256
+ typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2257
+ enum { PacketSize = unpacket_traits<Packet>::size,
2258
+ HalfPacketSize = unpacket_traits<HalfPacket>::size,
2259
+ QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2260
+ HasHalf = (int)HalfPacketSize < (int)PacketSize,
2261
+ HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
1817
2262
 
1818
2263
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
1819
2264
  EIGEN_UNUSED_VARIABLE(stride);
@@ -1821,37 +2266,51 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Ro
1821
2266
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
1822
2267
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
1823
2268
  Index count = 0;
2269
+ bool gone_half = false, gone_quarter = false, gone_last = false;
1824
2270
 
1825
- // const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
1826
- // const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
1827
- // const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
1828
-
1829
- int pack = Pack1;
1830
2271
  Index i = 0;
2272
+ int pack = Pack1;
2273
+ int psize = PacketSize;
1831
2274
  while(pack>0)
1832
2275
  {
1833
2276
  Index remaining_rows = rows-i;
1834
- Index peeled_mc = i+(remaining_rows/pack)*pack;
2277
+ Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
2278
+ Index starting_pos = i;
1835
2279
  for(; i<peeled_mc; i+=pack)
1836
2280
  {
1837
2281
  if(PanelMode) count += pack * offset;
1838
2282
 
1839
- const Index peeled_k = (depth/PacketSize)*PacketSize;
1840
2283
  Index k=0;
1841
- if(pack>=PacketSize)
2284
+ if(pack>=psize && psize >= QuarterPacketSize)
1842
2285
  {
1843
- for(; k<peeled_k; k+=PacketSize)
2286
+ const Index peeled_k = (depth/psize)*psize;
2287
+ for(; k<peeled_k; k+=psize)
1844
2288
  {
1845
- for (Index m = 0; m < pack; m += PacketSize)
2289
+ for (Index m = 0; m < pack; m += psize)
1846
2290
  {
1847
- PacketBlock<Packet> kernel;
1848
- for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
1849
- ptranspose(kernel);
1850
- for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
2291
+ if (psize == PacketSize) {
2292
+ PacketBlock<Packet> kernel;
2293
+ for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
2294
+ ptranspose(kernel);
2295
+ for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
2296
+ } else if (HasHalf && psize == HalfPacketSize) {
2297
+ gone_half = true;
2298
+ PacketBlock<HalfPacket> kernel_half;
2299
+ for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
2300
+ ptranspose(kernel_half);
2301
+ for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
2302
+ } else if (HasQuarter && psize == QuarterPacketSize) {
2303
+ gone_quarter = true;
2304
+ PacketBlock<QuarterPacket> kernel_quarter;
2305
+ for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
2306
+ ptranspose(kernel_quarter);
2307
+ for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
2308
+ }
1851
2309
  }
1852
- count += PacketSize*pack;
2310
+ count += psize*pack;
1853
2311
  }
1854
2312
  }
2313
+
1855
2314
  for(; k<depth; k++)
1856
2315
  {
1857
2316
  Index w=0;
@@ -1874,9 +2333,28 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Ro
1874
2333
  if(PanelMode) count += pack * (stride-offset-depth);
1875
2334
  }
1876
2335
 
1877
- pack -= PacketSize;
1878
- if(pack<Pack2 && (pack+PacketSize)!=Pack2)
1879
- pack = Pack2;
2336
+ pack -= psize;
2337
+ Index left = rows - i;
2338
+ if (pack <= 0) {
2339
+ if (!gone_last &&
2340
+ (starting_pos == i || left >= psize/2 || left >= psize/4) &&
2341
+ ((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
2342
+ (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
2343
+ psize /= 2;
2344
+ pack = psize;
2345
+ continue;
2346
+ }
2347
+ // Pack2 may be *smaller* than PacketSize—that happens for
2348
+ // products like real * complex, where we have to go half the
2349
+ // progress on the lhs in order to duplicate those operands to
2350
+ // address both real & imaginary parts on the rhs. This portion will
2351
+ // pack those half ones until they match the number expected on the
2352
+ // last peeling loop at this point (for the rhs).
2353
+ if (Pack2 < PacketSize && !gone_last) {
2354
+ gone_last = true;
2355
+ psize = pack = left & ~1;
2356
+ }
2357
+ }
1880
2358
  }
1881
2359
 
1882
2360
  for(; i<rows; i++)
@@ -1932,7 +2410,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
1932
2410
  // const Scalar* b6 = &rhs[(j2+6)*rhsStride];
1933
2411
  // const Scalar* b7 = &rhs[(j2+7)*rhsStride];
1934
2412
  // Index k=0;
1935
- // if(PacketSize==8) // TODO enbale vectorized transposition for PacketSize==4
2413
+ // if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
1936
2414
  // {
1937
2415
  // for(; k<peeled_k; k+=PacketSize) {
1938
2416
  // PacketBlock<Packet> kernel;
@@ -1979,10 +2457,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
1979
2457
  {
1980
2458
  for(; k<peeled_k; k+=PacketSize) {
1981
2459
  PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
1982
- kernel.packet[0] = dm0.loadPacket(k);
1983
- kernel.packet[1%PacketSize] = dm1.loadPacket(k);
1984
- kernel.packet[2%PacketSize] = dm2.loadPacket(k);
1985
- kernel.packet[3%PacketSize] = dm3.loadPacket(k);
2460
+ kernel.packet[0 ] = dm0.template loadPacket<Packet>(k);
2461
+ kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
2462
+ kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
2463
+ kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
1986
2464
  ptranspose(kernel);
1987
2465
  pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
1988
2466
  pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
@@ -2023,94 +2501,104 @@ template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conj
2023
2501
  struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2024
2502
  {
2025
2503
  typedef typename packet_traits<Scalar>::type Packet;
2504
+ typedef typename unpacket_traits<Packet>::half HalfPacket;
2505
+ typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2026
2506
  typedef typename DataMapper::LinearMapper LinearMapper;
2027
- enum { PacketSize = packet_traits<Scalar>::size };
2028
- EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2029
- };
2030
-
2031
- template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2032
- EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2033
- ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2034
- {
2035
- EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
2036
- EIGEN_UNUSED_VARIABLE(stride);
2037
- EIGEN_UNUSED_VARIABLE(offset);
2038
- eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2039
- conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2040
- Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2041
- Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2042
- Index count = 0;
2043
-
2044
- // if(nr>=8)
2045
- // {
2046
- // for(Index j2=0; j2<packet_cols8; j2+=8)
2047
- // {
2048
- // // skip what we have before
2049
- // if(PanelMode) count += 8 * offset;
2050
- // for(Index k=0; k<depth; k++)
2051
- // {
2052
- // if (PacketSize==8) {
2053
- // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2054
- // pstoreu(blockB+count, cj.pconj(A));
2055
- // } else if (PacketSize==4) {
2056
- // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2057
- // Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
2058
- // pstoreu(blockB+count, cj.pconj(A));
2059
- // pstoreu(blockB+count+PacketSize, cj.pconj(B));
2060
- // } else {
2061
- // const Scalar* b0 = &rhs[k*rhsStride + j2];
2062
- // blockB[count+0] = cj(b0[0]);
2063
- // blockB[count+1] = cj(b0[1]);
2064
- // blockB[count+2] = cj(b0[2]);
2065
- // blockB[count+3] = cj(b0[3]);
2066
- // blockB[count+4] = cj(b0[4]);
2067
- // blockB[count+5] = cj(b0[5]);
2068
- // blockB[count+6] = cj(b0[6]);
2069
- // blockB[count+7] = cj(b0[7]);
2070
- // }
2071
- // count += 8;
2072
- // }
2073
- // // skip what we have after
2074
- // if(PanelMode) count += 8 * (stride-offset-depth);
2075
- // }
2076
- // }
2077
- if(nr>=4)
2507
+ enum { PacketSize = packet_traits<Scalar>::size,
2508
+ HalfPacketSize = unpacket_traits<HalfPacket>::size,
2509
+ QuarterPacketSize = unpacket_traits<QuarterPacket>::size};
2510
+ EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0)
2078
2511
  {
2079
- for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2512
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
2513
+ EIGEN_UNUSED_VARIABLE(stride);
2514
+ EIGEN_UNUSED_VARIABLE(offset);
2515
+ eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2516
+ const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
2517
+ const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
2518
+ conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2519
+ Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2520
+ Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2521
+ Index count = 0;
2522
+
2523
+ // if(nr>=8)
2524
+ // {
2525
+ // for(Index j2=0; j2<packet_cols8; j2+=8)
2526
+ // {
2527
+ // // skip what we have before
2528
+ // if(PanelMode) count += 8 * offset;
2529
+ // for(Index k=0; k<depth; k++)
2530
+ // {
2531
+ // if (PacketSize==8) {
2532
+ // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2533
+ // pstoreu(blockB+count, cj.pconj(A));
2534
+ // } else if (PacketSize==4) {
2535
+ // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2536
+ // Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
2537
+ // pstoreu(blockB+count, cj.pconj(A));
2538
+ // pstoreu(blockB+count+PacketSize, cj.pconj(B));
2539
+ // } else {
2540
+ // const Scalar* b0 = &rhs[k*rhsStride + j2];
2541
+ // blockB[count+0] = cj(b0[0]);
2542
+ // blockB[count+1] = cj(b0[1]);
2543
+ // blockB[count+2] = cj(b0[2]);
2544
+ // blockB[count+3] = cj(b0[3]);
2545
+ // blockB[count+4] = cj(b0[4]);
2546
+ // blockB[count+5] = cj(b0[5]);
2547
+ // blockB[count+6] = cj(b0[6]);
2548
+ // blockB[count+7] = cj(b0[7]);
2549
+ // }
2550
+ // count += 8;
2551
+ // }
2552
+ // // skip what we have after
2553
+ // if(PanelMode) count += 8 * (stride-offset-depth);
2554
+ // }
2555
+ // }
2556
+ if(nr>=4)
2080
2557
  {
2081
- // skip what we have before
2082
- if(PanelMode) count += 4 * offset;
2083
- for(Index k=0; k<depth; k++)
2558
+ for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2084
2559
  {
2085
- if (PacketSize==4) {
2086
- Packet A = rhs.loadPacket(k, j2);
2087
- pstoreu(blockB+count, cj.pconj(A));
2088
- count += PacketSize;
2089
- } else {
2090
- const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
2091
- blockB[count+0] = cj(dm0(0));
2092
- blockB[count+1] = cj(dm0(1));
2093
- blockB[count+2] = cj(dm0(2));
2094
- blockB[count+3] = cj(dm0(3));
2095
- count += 4;
2560
+ // skip what we have before
2561
+ if(PanelMode) count += 4 * offset;
2562
+ for(Index k=0; k<depth; k++)
2563
+ {
2564
+ if (PacketSize==4) {
2565
+ Packet A = rhs.template loadPacket<Packet>(k, j2);
2566
+ pstoreu(blockB+count, cj.pconj(A));
2567
+ count += PacketSize;
2568
+ } else if (HasHalf && HalfPacketSize==4) {
2569
+ HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
2570
+ pstoreu(blockB+count, cj.pconj(A));
2571
+ count += HalfPacketSize;
2572
+ } else if (HasQuarter && QuarterPacketSize==4) {
2573
+ QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
2574
+ pstoreu(blockB+count, cj.pconj(A));
2575
+ count += QuarterPacketSize;
2576
+ } else {
2577
+ const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
2578
+ blockB[count+0] = cj(dm0(0));
2579
+ blockB[count+1] = cj(dm0(1));
2580
+ blockB[count+2] = cj(dm0(2));
2581
+ blockB[count+3] = cj(dm0(3));
2582
+ count += 4;
2583
+ }
2096
2584
  }
2585
+ // skip what we have after
2586
+ if(PanelMode) count += 4 * (stride-offset-depth);
2097
2587
  }
2098
- // skip what we have after
2099
- if(PanelMode) count += 4 * (stride-offset-depth);
2100
2588
  }
2101
- }
2102
- // copy the remaining columns one at a time (nr==1)
2103
- for(Index j2=packet_cols4; j2<cols; ++j2)
2104
- {
2105
- if(PanelMode) count += offset;
2106
- for(Index k=0; k<depth; k++)
2589
+ // copy the remaining columns one at a time (nr==1)
2590
+ for(Index j2=packet_cols4; j2<cols; ++j2)
2107
2591
  {
2108
- blockB[count] = cj(rhs(k, j2));
2109
- count += 1;
2592
+ if(PanelMode) count += offset;
2593
+ for(Index k=0; k<depth; k++)
2594
+ {
2595
+ blockB[count] = cj(rhs(k, j2));
2596
+ count += 1;
2597
+ }
2598
+ if(PanelMode) count += stride-offset-depth;
2110
2599
  }
2111
- if(PanelMode) count += stride-offset-depth;
2112
2600
  }
2113
- }
2601
+ };
2114
2602
 
2115
2603
  } // end namespace internal
2116
2604