@smake/eigen 1.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (435)
  1. package/README.md +1 -1
  2. package/eigen/Eigen/AccelerateSupport +52 -0
  3. package/eigen/Eigen/Cholesky +18 -21
  4. package/eigen/Eigen/CholmodSupport +28 -28
  5. package/eigen/Eigen/Core +235 -326
  6. package/eigen/Eigen/Eigenvalues +16 -14
  7. package/eigen/Eigen/Geometry +21 -24
  8. package/eigen/Eigen/Householder +9 -8
  9. package/eigen/Eigen/IterativeLinearSolvers +8 -4
  10. package/eigen/Eigen/Jacobi +14 -14
  11. package/eigen/Eigen/KLUSupport +43 -0
  12. package/eigen/Eigen/LU +16 -20
  13. package/eigen/Eigen/MetisSupport +12 -12
  14. package/eigen/Eigen/OrderingMethods +54 -54
  15. package/eigen/Eigen/PaStiXSupport +23 -20
  16. package/eigen/Eigen/PardisoSupport +17 -14
  17. package/eigen/Eigen/QR +18 -21
  18. package/eigen/Eigen/QtAlignedMalloc +5 -13
  19. package/eigen/Eigen/SPQRSupport +21 -14
  20. package/eigen/Eigen/SVD +23 -18
  21. package/eigen/Eigen/Sparse +1 -4
  22. package/eigen/Eigen/SparseCholesky +18 -23
  23. package/eigen/Eigen/SparseCore +18 -17
  24. package/eigen/Eigen/SparseLU +12 -8
  25. package/eigen/Eigen/SparseQR +16 -14
  26. package/eigen/Eigen/StdDeque +5 -2
  27. package/eigen/Eigen/StdList +5 -2
  28. package/eigen/Eigen/StdVector +5 -2
  29. package/eigen/Eigen/SuperLUSupport +30 -24
  30. package/eigen/Eigen/ThreadPool +80 -0
  31. package/eigen/Eigen/UmfPackSupport +19 -17
  32. package/eigen/Eigen/Version +14 -0
  33. package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
  34. package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
  35. package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
  36. package/eigen/Eigen/src/Cholesky/LDLT.h +377 -401
  37. package/eigen/Eigen/src/Cholesky/LLT.h +332 -360
  38. package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
  39. package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +620 -521
  40. package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
  41. package/eigen/Eigen/src/Core/ArithmeticSequence.h +239 -0
  42. package/eigen/Eigen/src/Core/Array.h +341 -294
  43. package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
  44. package/eigen/Eigen/src/Core/ArrayWrapper.h +127 -171
  45. package/eigen/Eigen/src/Core/Assign.h +30 -40
  46. package/eigen/Eigen/src/Core/AssignEvaluator.h +711 -589
  47. package/eigen/Eigen/src/Core/Assign_MKL.h +130 -125
  48. package/eigen/Eigen/src/Core/BandMatrix.h +268 -283
  49. package/eigen/Eigen/src/Core/Block.h +375 -398
  50. package/eigen/Eigen/src/Core/CommaInitializer.h +86 -97
  51. package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
  52. package/eigen/Eigen/src/Core/CoreEvaluators.h +1356 -1026
  53. package/eigen/Eigen/src/Core/CoreIterators.h +73 -59
  54. package/eigen/Eigen/src/Core/CwiseBinaryOp.h +114 -132
  55. package/eigen/Eigen/src/Core/CwiseNullaryOp.h +726 -617
  56. package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
  57. package/eigen/Eigen/src/Core/CwiseUnaryOp.h +56 -68
  58. package/eigen/Eigen/src/Core/CwiseUnaryView.h +132 -95
  59. package/eigen/Eigen/src/Core/DenseBase.h +632 -571
  60. package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -624
  61. package/eigen/Eigen/src/Core/DenseStorage.h +512 -509
  62. package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
  63. package/eigen/Eigen/src/Core/Diagonal.h +169 -210
  64. package/eigen/Eigen/src/Core/DiagonalMatrix.h +351 -274
  65. package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
  66. package/eigen/Eigen/src/Core/Dot.h +172 -222
  67. package/eigen/Eigen/src/Core/EigenBase.h +75 -85
  68. package/eigen/Eigen/src/Core/Fill.h +138 -0
  69. package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
  70. package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -109
  71. package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
  72. package/eigen/Eigen/src/Core/GeneralProduct.h +327 -263
  73. package/eigen/Eigen/src/Core/GenericPacketMath.h +1472 -360
  74. package/eigen/Eigen/src/Core/GlobalFunctions.h +194 -151
  75. package/eigen/Eigen/src/Core/IO.h +147 -139
  76. package/eigen/Eigen/src/Core/IndexedView.h +321 -0
  77. package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
  78. package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
  79. package/eigen/Eigen/src/Core/Inverse.h +56 -66
  80. package/eigen/Eigen/src/Core/Map.h +124 -142
  81. package/eigen/Eigen/src/Core/MapBase.h +256 -281
  82. package/eigen/Eigen/src/Core/MathFunctions.h +1620 -938
  83. package/eigen/Eigen/src/Core/MathFunctionsImpl.h +233 -71
  84. package/eigen/Eigen/src/Core/Matrix.h +491 -416
  85. package/eigen/Eigen/src/Core/MatrixBase.h +468 -453
  86. package/eigen/Eigen/src/Core/NestByValue.h +66 -85
  87. package/eigen/Eigen/src/Core/NoAlias.h +79 -85
  88. package/eigen/Eigen/src/Core/NumTraits.h +235 -148
  89. package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +253 -0
  90. package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
  91. package/eigen/Eigen/src/Core/PlainObjectBase.h +871 -894
  92. package/eigen/Eigen/src/Core/Product.h +260 -139
  93. package/eigen/Eigen/src/Core/ProductEvaluators.h +863 -714
  94. package/eigen/Eigen/src/Core/Random.h +161 -136
  95. package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
  96. package/eigen/Eigen/src/Core/RealView.h +250 -0
  97. package/eigen/Eigen/src/Core/Redux.h +366 -336
  98. package/eigen/Eigen/src/Core/Ref.h +308 -209
  99. package/eigen/Eigen/src/Core/Replicate.h +94 -106
  100. package/eigen/Eigen/src/Core/Reshaped.h +398 -0
  101. package/eigen/Eigen/src/Core/ReturnByValue.h +49 -55
  102. package/eigen/Eigen/src/Core/Reverse.h +136 -145
  103. package/eigen/Eigen/src/Core/Select.h +70 -140
  104. package/eigen/Eigen/src/Core/SelfAdjointView.h +262 -285
  105. package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
  106. package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
  107. package/eigen/Eigen/src/Core/Solve.h +97 -111
  108. package/eigen/Eigen/src/Core/SolveTriangular.h +131 -129
  109. package/eigen/Eigen/src/Core/SolverBase.h +138 -101
  110. package/eigen/Eigen/src/Core/StableNorm.h +156 -160
  111. package/eigen/Eigen/src/Core/StlIterators.h +619 -0
  112. package/eigen/Eigen/src/Core/Stride.h +91 -88
  113. package/eigen/Eigen/src/Core/Swap.h +70 -38
  114. package/eigen/Eigen/src/Core/Transpose.h +295 -273
  115. package/eigen/Eigen/src/Core/Transpositions.h +272 -317
  116. package/eigen/Eigen/src/Core/TriangularMatrix.h +670 -755
  117. package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
  118. package/eigen/Eigen/src/Core/VectorwiseOp.h +668 -630
  119. package/eigen/Eigen/src/Core/Visitor.h +480 -216
  120. package/eigen/Eigen/src/Core/arch/AVX/Complex.h +407 -293
  121. package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +79 -388
  122. package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2935 -491
  123. package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
  124. package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +279 -22
  125. package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +472 -0
  126. package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
  127. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +85 -333
  128. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
  129. package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +2490 -649
  130. package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
  131. package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
  132. package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
  133. package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
  134. package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +277 -0
  135. package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
  136. package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +521 -298
  137. package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +39 -280
  138. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +3686 -0
  139. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +205 -0
  140. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +901 -0
  141. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
  142. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
  143. package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +3391 -723
  144. package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
  145. package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +866 -0
  146. package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +113 -14
  147. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +2634 -0
  148. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +227 -0
  149. package/eigen/Eigen/src/Core/arch/Default/Half.h +1091 -0
  150. package/eigen/Eigen/src/Core/arch/Default/Settings.h +11 -13
  151. package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
  152. package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +104 -0
  153. package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1712 -0
  154. package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
  155. package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +77 -0
  156. package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  157. package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
  158. package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
  159. package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
  160. package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
  161. package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
  162. package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
  163. package/eigen/Eigen/src/Core/arch/MSA/Complex.h +620 -0
  164. package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +379 -0
  165. package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1237 -0
  166. package/eigen/Eigen/src/Core/arch/NEON/Complex.h +531 -289
  167. package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +243 -0
  168. package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +50 -73
  169. package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +5915 -579
  170. package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1642 -0
  171. package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
  172. package/eigen/Eigen/src/Core/arch/SSE/Complex.h +366 -334
  173. package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +40 -514
  174. package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +2164 -675
  175. package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
  176. package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +188 -35
  177. package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +48 -0
  178. package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +674 -0
  179. package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +52 -0
  180. package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +227 -0
  181. package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +303 -0
  182. package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +576 -0
  183. package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +83 -0
  184. package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +434 -261
  185. package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +160 -53
  186. package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +1073 -605
  187. package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +123 -117
  188. package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +594 -322
  189. package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +204 -118
  190. package/eigen/Eigen/src/Core/functors/StlFunctors.h +110 -97
  191. package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
  192. package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1158 -530
  193. package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2329 -1333
  194. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +328 -364
  195. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +191 -178
  196. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +85 -82
  197. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
  198. package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +396 -542
  199. package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
  200. package/eigen/Eigen/src/Core/products/Parallelizer.h +208 -92
  201. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +331 -375
  202. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
  203. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +139 -146
  204. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
  205. package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
  206. package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -46
  207. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
  208. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
  209. package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
  210. package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
  211. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -275
  212. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
  213. package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +70 -93
  214. package/eigen/Eigen/src/Core/util/Assert.h +158 -0
  215. package/eigen/Eigen/src/Core/util/BlasUtil.h +413 -290
  216. package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +543 -0
  217. package/eigen/Eigen/src/Core/util/Constants.h +314 -263
  218. package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -78
  219. package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
  220. package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +450 -224
  221. package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
  222. package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
  223. package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +487 -0
  224. package/eigen/Eigen/src/Core/util/IntegralConstant.h +279 -0
  225. package/eigen/Eigen/src/Core/util/MKL_support.h +39 -30
  226. package/eigen/Eigen/src/Core/util/Macros.h +939 -646
  227. package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
  228. package/eigen/Eigen/src/Core/util/Memory.h +1042 -650
  229. package/eigen/Eigen/src/Core/util/Meta.h +618 -426
  230. package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
  231. package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
  232. package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  233. package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
  234. package/eigen/Eigen/src/Core/util/StaticAssert.h +51 -164
  235. package/eigen/Eigen/src/Core/util/SymbolicIndex.h +445 -0
  236. package/eigen/Eigen/src/Core/util/XprHelper.h +793 -538
  237. package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
  238. package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
  239. package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
  240. package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
  241. package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
  242. package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
  243. package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
  244. package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
  245. package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +91 -107
  246. package/eigen/Eigen/src/Eigenvalues/RealQZ.h +539 -606
  247. package/eigen/Eigen/src/Eigenvalues/RealSchur.h +348 -382
  248. package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
  249. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +579 -600
  250. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
  251. package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +434 -461
  252. package/eigen/Eigen/src/Geometry/AlignedBox.h +307 -214
  253. package/eigen/Eigen/src/Geometry/AngleAxis.h +135 -137
  254. package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
  255. package/eigen/Eigen/src/Geometry/Homogeneous.h +289 -333
  256. package/eigen/Eigen/src/Geometry/Hyperplane.h +152 -161
  257. package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
  258. package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -145
  259. package/eigen/Eigen/src/Geometry/ParametrizedLine.h +141 -104
  260. package/eigen/Eigen/src/Geometry/Quaternion.h +595 -497
  261. package/eigen/Eigen/src/Geometry/Rotation2D.h +110 -108
  262. package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
  263. package/eigen/Eigen/src/Geometry/Scaling.h +115 -90
  264. package/eigen/Eigen/src/Geometry/Transform.h +896 -953
  265. package/eigen/Eigen/src/Geometry/Translation.h +100 -98
  266. package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
  267. package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +154 -0
  268. package/eigen/Eigen/src/Householder/BlockHouseholder.h +54 -42
  269. package/eigen/Eigen/src/Householder/Householder.h +104 -122
  270. package/eigen/Eigen/src/Householder/HouseholderSequence.h +416 -382
  271. package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
  272. package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +153 -166
  273. package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +127 -138
  274. package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +95 -124
  275. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +269 -267
  276. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +246 -259
  277. package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
  278. package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +218 -217
  279. package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +80 -103
  280. package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +59 -63
  281. package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
  282. package/eigen/Eigen/src/Jacobi/Jacobi.h +256 -291
  283. package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
  284. package/eigen/Eigen/src/KLUSupport/KLUSupport.h +339 -0
  285. package/eigen/Eigen/src/LU/Determinant.h +60 -63
  286. package/eigen/Eigen/src/LU/FullPivLU.h +561 -626
  287. package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
  288. package/eigen/Eigen/src/LU/InverseImpl.h +213 -275
  289. package/eigen/Eigen/src/LU/PartialPivLU.h +407 -435
  290. package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
  291. package/eigen/Eigen/src/LU/arch/InverseSize4.h +353 -0
  292. package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
  293. package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
  294. package/eigen/Eigen/src/OrderingMethods/Amd.h +250 -282
  295. package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +950 -1103
  296. package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
  297. package/eigen/Eigen/src/OrderingMethods/Ordering.h +111 -122
  298. package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
  299. package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
  300. package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
  301. package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -429
  302. package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +494 -473
  303. package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
  304. package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +223 -137
  305. package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +517 -460
  306. package/eigen/Eigen/src/QR/HouseholderQR.h +412 -278
  307. package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
  308. package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
  309. package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
  310. package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +263 -261
  311. package/eigen/Eigen/src/SVD/BDCSVD.h +872 -679
  312. package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
  313. package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
  314. package/eigen/Eigen/src/SVD/JacobiSVD.h +585 -543
  315. package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
  316. package/eigen/Eigen/src/SVD/SVDBase.h +281 -160
  317. package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +202 -237
  318. package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
  319. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +769 -590
  320. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +318 -129
  321. package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
  322. package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -236
  323. package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +140 -184
  324. package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
  325. package/eigen/Eigen/src/SparseCore/SparseAssign.h +174 -111
  326. package/eigen/Eigen/src/SparseCore/SparseBlock.h +408 -477
  327. package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
  328. package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +531 -280
  329. package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +559 -347
  330. package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
  331. package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +185 -191
  332. package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
  333. package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
  334. package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
  335. package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
  336. package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1614 -1142
  337. package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -357
  338. package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
  339. package/eigen/Eigen/src/SparseCore/SparseProduct.h +100 -91
  340. package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
  341. package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
  342. package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +371 -414
  343. package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
  344. package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
  345. package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
  346. package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
  347. package/eigen/Eigen/src/SparseCore/SparseUtil.h +146 -115
  348. package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
  349. package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
  350. package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
  351. package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
  352. package/eigen/Eigen/src/SparseLU/SparseLU.h +814 -618
  353. package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
  354. package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
  355. package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
  356. package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +273 -255
  357. package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
  358. package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
  359. package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +90 -101
  360. package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
  361. package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
  362. package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
  363. package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +125 -133
  364. package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
  365. package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
  366. package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
  367. package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
  368. package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
  369. package/eigen/Eigen/src/SparseQR/SparseQR.h +451 -490
  370. package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -105
  371. package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
  372. package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
  373. package/eigen/Eigen/src/StlSupport/details.h +48 -50
  374. package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
  375. package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -732
  376. package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
  377. package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
  378. package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
  379. package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
  380. package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
  381. package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
  382. package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
  383. package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
  384. package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
  385. package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
  386. package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
  387. package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
  388. package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
  389. package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +480 -380
  390. package/eigen/Eigen/src/misc/Image.h +41 -43
  391. package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
  392. package/eigen/Eigen/src/misc/Kernel.h +39 -41
  393. package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
  394. package/eigen/Eigen/src/misc/blas.h +83 -426
  395. package/eigen/Eigen/src/misc/lapacke.h +9976 -16182
  396. package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
  397. package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
  398. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
  399. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
  400. package/eigen/Eigen/src/plugins/BlockMethods.inc +1370 -0
  401. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
  402. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.inc +167 -0
  403. package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
  404. package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
  405. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
  406. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
  407. package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
  408. package/lib/LibEigen.d.ts +4 -0
  409. package/lib/LibEigen.js +14 -0
  410. package/lib/index.d.ts +1 -1
  411. package/lib/index.js +7 -3
  412. package/package.json +2 -10
  413. package/eigen/Eigen/CMakeLists.txt +0 -19
  414. package/eigen/Eigen/src/Core/BooleanRedux.h +0 -164
  415. package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -103
  416. package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
  417. package/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +0 -91
  418. package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
  419. package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
  420. package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
  421. package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
  422. package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
  423. package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
  424. package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
  425. package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
  426. package/eigen/Eigen/src/misc/lapack.h +0 -152
  427. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -332
  428. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -552
  429. package/eigen/Eigen/src/plugins/BlockMethods.h +0 -1058
  430. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
  431. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -163
  432. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
  433. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -85
  434. package/lib/eigen.d.ts +0 -2
  435. package/lib/eigen.js +0 -15
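One addition worth noting in the list above: 1.1.1 vendors Eigen's ThreadPool module (entries 30 and 376-387), which did not exist in 1.0.2. The sketch below shows roughly what those headers expose; it assumes the vendored sources keep upstream Eigen's Eigen::ThreadPool and Eigen::Barrier names and their Schedule/Notify/Wait interfaces, and the thread and task counts are arbitrary illustration values:

    #include <Eigen/ThreadPool>

    int main() {
      Eigen::ThreadPool pool(4);  // backed by NonBlockingThreadPool.h (entry 381)
      Eigen::Barrier done(8);     // from Barrier.h (entry 376)
      for (int i = 0; i < 8; ++i) {
        // Schedule() enqueues a closure on one of the pool's worker run queues.
        pool.Schedule([&done] { done.Notify(); });
      }
      done.Wait();  // block until all eight tasks have called Notify()
      return 0;
    }

The diff below covers entry 193, package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h.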
@@ -10,34 +10,66 @@
 #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
 #define EIGEN_GENERAL_BLOCK_PANEL_H
 
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
 
 namespace Eigen {
 
 namespace internal {
 
-template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
-class gebp_traits;
+enum GEBPPacketSizeType { GEBPPacketFull = 0, GEBPPacketHalf, GEBPPacketQuarter };
 
+template <typename LhsScalar_, typename RhsScalar_, bool ConjLhs_ = false, bool ConjRhs_ = false,
+          int Arch = Architecture::Target, int PacketSize_ = GEBPPacketFull>
+class gebp_traits;
 
 /** \internal \returns b if a<=0, and returns a otherwise. */
-inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
-{
-  return a<=0 ? b : a;
-}
+inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b) { return a <= 0 ? b : a; }
+
+#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
+#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
+#else
+#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
+#endif  // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
+
+#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
+#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
+#else
+#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
+#endif  // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
+
+#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
+#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
+#else
+#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
+#endif  // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
 
 #if EIGEN_ARCH_i386_OR_x86_64
-const std::ptrdiff_t defaultL1CacheSize = 32*1024;
-const std::ptrdiff_t defaultL2CacheSize = 256*1024;
-const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32 * 1024);
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256 * 1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2 * 1024 * 1024);
+#elif EIGEN_ARCH_PPC
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64 * 1024);
+#ifdef _ARCH_PWR10
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(2 * 1024 * 1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(8 * 1024 * 1024);
+#else
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512 * 1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4 * 1024 * 1024);
+#endif
 #else
-const std::ptrdiff_t defaultL1CacheSize = 16*1024;
-const std::ptrdiff_t defaultL2CacheSize = 512*1024;
-const std::ptrdiff_t defaultL3CacheSize = 512*1024;
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16 * 1024);
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512 * 1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512 * 1024);
 #endif
 
+#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
+#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
+#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
+
 /** \internal */
 struct CacheSizes {
-  CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
+  CacheSizes() : m_l1(-1), m_l2(-1), m_l3(-1) {
     int l1CacheSize, l2CacheSize, l3CacheSize;
     queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
     m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
@@ -50,29 +82,22 @@ struct CacheSizes {
   std::ptrdiff_t m_l3;
 };
 
-
 /** \internal */
-inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
-{
+inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) {
   static CacheSizes m_cacheSizes;
 
-  if(action==SetAction)
-  {
+  if (action == SetAction) {
     // set the cpu cache size and cache all block sizes from a global cache size in byte
-    eigen_internal_assert(l1!=0 && l2!=0);
+    eigen_internal_assert(l1 != 0 && l2 != 0);
     m_cacheSizes.m_l1 = *l1;
     m_cacheSizes.m_l2 = *l2;
     m_cacheSizes.m_l3 = *l3;
-  }
-  else if(action==GetAction)
-  {
-    eigen_internal_assert(l1!=0 && l2!=0);
+  } else if (action == GetAction) {
+    eigen_internal_assert(l1 != 0 && l2 != 0);
    *l1 = m_cacheSizes.m_l1;
    *l2 = m_cacheSizes.m_l2;
    *l3 = m_cacheSizes.m_l3;
-  }
-  else
-  {
+  } else {
     eigen_internal_assert(false);
   }
 }
@@ -89,10 +114,9 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
  *
  * \sa setCpuCacheSizes */
 
-template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
-void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
-{
-  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+template <typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
+void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1) {
+  typedef gebp_traits<LhsScalar, RhsScalar> Traits;
 
   // Explanations:
   // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
@@ -101,12 +125,22 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
   // at the register level. This small horizontal panel has to stay within L1 cache.
   std::ptrdiff_t l1, l2, l3;
   manage_caching_sizes(GetAction, &l1, &l2, &l3);
+#ifdef EIGEN_VECTORIZE_AVX512
+  // We need to find a rationale for that, but without this adjustment,
+  // performance with AVX512 is pretty bad, like -20% slower.
+  // One reason is that with increasing packet-size, the blocking size k
+  // has to become pretty small if we want that 1 lhs panel fit within L1.
+  // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are:
+  // k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144.
+  // This is quite small for a good reuse of the accumulation registers.
+  l1 *= 4;
+#endif
 
   if (num_threads > 1) {
     typedef typename Traits::ResScalar ResScalar;
     enum {
       kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
-      ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
+      ksub = Traits::mr * (Traits::nr * sizeof(ResScalar)),
       kr = 8,
       mr = Traits::mr,
       nr = Traits::nr
@@ -116,13 +150,13 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
     // increasing the value of k, so we'll cap it at 320 (value determined
     // experimentally).
     // To avoid that k vanishes, we make k_cache at least as big as kr
-    const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
+    const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1 - ksub) / kdiv, 320));
     if (k_cache < k) {
       k = k_cache - (k_cache % kr);
       eigen_internal_assert(k > 0);
     }
 
-    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+    const Index n_cache = (l2 - l1) / (nr * sizeof(RhsScalar) * k);
     const Index n_per_thread = numext::div_ceil(n, num_threads);
     if (n_cache <= n_per_thread) {
       // Don't exceed the capacity of the l2 cache.
@@ -135,37 +169,35 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
 
     if (l3 > l2) {
       // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
-      const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+      const Index m_cache = (l3 - l2) / (sizeof(LhsScalar) * k * num_threads);
       const Index m_per_thread = numext::div_ceil(m, num_threads);
-      if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
+      if (m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
         m = m_cache - (m_cache % mr);
         eigen_internal_assert(m > 0);
       } else {
         m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
       }
     }
-  }
-  else {
+  } else {
     // In unit tests we do not want to use extra large matrices,
     // so we reduce the cache size to check the blocking strategy is not flawed
 #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
-    l1 = 9*1024;
-    l2 = 32*1024;
-    l3 = 512*1024;
+    l1 = 9 * 1024;
+    l2 = 32 * 1024;
+    l3 = 512 * 1024;
 #endif
 
     // Early return for small problems because the computation below are time consuming for small problems.
     // Perhaps it would make more sense to consider k*n*m??
     // Note that for very tiny problem, this function should be bypassed anyway
     // because we use the coefficient-based implementation for them.
-    if((numext::maxi)(k,(numext::maxi)(m,n))<48)
-      return;
+    if ((numext::maxi)(k, (numext::maxi)(m, n)) < 48) return;
 
     typedef typename Traits::ResScalar ResScalar;
     enum {
       k_peeling = 8,
       k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
-      k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
+      k_sub = Traits::mr * (Traits::nr * sizeof(ResScalar))
     };
 
     // ---- 1st level of blocking on L1, yields kc ----
@@ -175,30 +207,29 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
     // We also include a register-level block of the result (mx x nr).
     // (In an ideal world only the lhs panel would stay in L1)
     // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
-    const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
+    const Index max_kc = numext::maxi<Index>(((l1 - k_sub) / k_div) & (~(k_peeling - 1)), 1);
     const Index old_k = k;
-    if(k>max_kc)
-    {
+    if (k > max_kc) {
       // We are really blocking on the third dimension:
       // -> reduce blocking size to make sure the last block is as large as possible
       // while keeping the same number of sweeps over the result.
-      k = (k%max_kc)==0 ? max_kc
-                        : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
+      k = (k % max_kc) == 0 ? max_kc
+                            : max_kc - k_peeling * ((max_kc - 1 - (k % max_kc)) / (k_peeling * (k / max_kc + 1)));
 
-      eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
+      eigen_internal_assert(((old_k / k) == (old_k / max_kc)) && "the number of sweeps has to remain the same");
     }
 
-    // ---- 2nd level of blocking on max(L2,L3), yields nc ----
+    // ---- 2nd level of blocking on max(L2,L3), yields nc ----
 
-    // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
-    // actual_l2 = max(l2, l3/nb_core_sharing_l3)
-    // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it)
-    // For instance, it corresponds to 6MB of L3 shared among 4 cores.
-    #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+    // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
+    // actual_l2 = max(l2, l3/nb_core_sharing_l3)
+    // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it)
+    // For instance, it corresponds to 6MB of L3 shared among 4 cores.
+#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
     const Index actual_l2 = l3;
-    #else
-    const Index actual_l2 = 1572864; // == 1.5 MB
-    #endif
+#else
+    const Index actual_l2 = 1572864;  // == 1.5 MB
+#endif
 
     // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2.
     // The second half is implicitly reserved to access the result and lhs coefficients.
@@ -208,61 +239,52 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
     // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
     Index max_nc;
     const Index lhs_bytes = m * k * sizeof(LhsScalar);
-    const Index remaining_l1 = l1- k_sub - lhs_bytes;
-    if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
-    {
+    const Index remaining_l1 = l1 - k_sub - lhs_bytes;
+    if (remaining_l1 >= Index(Traits::nr * sizeof(RhsScalar)) * k) {
       // L1 blocking
-      max_nc = remaining_l1 / (k*sizeof(RhsScalar));
-    }
-    else
-    {
+      max_nc = remaining_l1 / (k * sizeof(RhsScalar));
+    } else {
       // L2 blocking
-      max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
+      max_nc = (3 * actual_l2) / (2 * 2 * max_kc * sizeof(RhsScalar));
     }
     // WARNING Below, we assume that Traits::nr is a power of two.
-    Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
-    if(n>nc)
-    {
+    Index nc = numext::mini<Index>(actual_l2 / (2 * k * sizeof(RhsScalar)), max_nc) & (~(Traits::nr - 1));
+    if (n > nc) {
       // We are really blocking over the columns:
      // -> reduce blocking size to make sure the last block is as large as possible
      // while keeping the same number of sweeps over the packed lhs.
      // Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
-      n = (n%nc)==0 ? nc
-                    : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
-    }
-    else if(old_k==k)
-    {
+      n = (n % nc) == 0 ? nc : (nc - Traits::nr * ((nc /*-1*/ - (n % nc)) / (Traits::nr * (n / nc + 1))));
+    } else if (old_k == k) {
       // So far, no blocking at all, i.e., kc==k, and nc==n.
       // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
-      // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
-      Index problem_size = k*n*sizeof(LhsScalar);
+      // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic
+      // here should be obsolete.
+      Index problem_size = k * n * sizeof(LhsScalar);
       Index actual_lm = actual_l2;
       Index max_mc = m;
-      if(problem_size<=1024)
-      {
+      if (problem_size <= 1024) {
        // problem is small enough to keep in L1
        // Let's choose m such that lhs's block fit in 1/3 of L1
        actual_lm = l1;
-      }
-      else if(l3!=0 && problem_size<=32768)
-      {
+      } else if (l3 != 0 && problem_size <= 32768) {
        // we have both L2 and L3, and problem is small enough to be kept in L2
        // Let's choose m such that lhs's block fit in 1/3 of L2
        actual_lm = l2;
-        max_mc = (numext::mini<Index>)(576,max_mc);
+        max_mc = (numext::mini<Index>)(576, max_mc);
      }
-      Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
-      if (mc > Traits::mr) mc -= mc % Traits::mr;
-      else if (mc==0) return;
-      m = (m%mc)==0 ? mc
-                    : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
+      Index mc = (numext::mini<Index>)(actual_lm / (3 * k * sizeof(LhsScalar)), max_mc);
+      if (mc > Traits::mr)
+        mc -= mc % Traits::mr;
+      else if (mc == 0)
+        return;
+      m = (m % mc) == 0 ? mc : (mc - Traits::mr * ((mc /*-1*/ - (m % mc)) / (Traits::mr * (m / mc + 1))));
     }
   }
 }
 
 template <typename Index>
-inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
-{
+inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) {
 #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
   if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
     k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
@@ -279,314 +301,393 @@ inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
 }
 
 /** \brief Computes the blocking parameters for a m x k times k x n matrix product
- *
- * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
- * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
- * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
- *
- * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
- * this function computes the blocking size parameters along the respective dimensions
- * for matrix products and related algorithms.
- *
- * The blocking size parameters may be evaluated:
- *   - either by a heuristic based on cache sizes;
- *   - or using fixed prescribed values (for testing purposes).
- *
- * \sa setCpuCacheSizes */
-
-template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
-void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
-{
+ *
+ * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
+ * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
+ * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same
+ * dimension.
+ * \param[in] num_threads Input: the number of threads used for the computation.
+ *
+ * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
+ * this function computes the blocking size parameters along the respective dimensions
+ * for matrix products and related algorithms.
+ *
+ * The blocking size parameters may be evaluated:
+ *   - either by a heuristic based on cache sizes;
+ *   - or using fixed prescribed values (for testing purposes).
+ *
+ * \sa setCpuCacheSizes */
+
+template <typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
+void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) {
   if (!useSpecificBlockingSizes(k, m, n)) {
     evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
   }
 }
 
-template<typename LhsScalar, typename RhsScalar, typename Index>
-inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
-{
-  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
+template <typename LhsScalar, typename RhsScalar, typename Index>
+inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) {
+  computeProductBlockingSizes<LhsScalar, RhsScalar, 1, Index>(k, m, n, num_threads);
 }
 
-#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-  #define CJMADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
-#else
+template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
+struct RhsPanelHelper {
+ private:
+  static constexpr int remaining_registers =
+      (std::max)(int(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS) - registers_taken, 0);
 
-  // FIXME (a bit overkill maybe ?)
+ public:
+  typedef std::conditional_t<(remaining_registers >= 4), RhsPacketx4, RhsPacket> type;
+};
 
-  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
-    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
-    {
-      c = cj.pmadd(a,b,c);
-    }
-  };
+template <typename Packet>
+struct QuadPacket {
+  Packet B_0, B1, B2, B3;
+  const Packet& get(const FixedInt<0>&) const { return B_0; }
+  const Packet& get(const FixedInt<1>&) const { return B1; }
+  const Packet& get(const FixedInt<2>&) const { return B2; }
+  const Packet& get(const FixedInt<3>&) const { return B3; }
+};
 
-  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
-    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
-    {
-      t = b; t = cj.pmul(a,t); c = padd(c,t);
-    }
-  };
+template <int N, typename T1, typename T2, typename T3>
+struct packet_conditional {
+  typedef T3 type;
+};
 
-  template<typename CJ, typename A, typename B, typename C, typename T>
-  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
-  {
-    gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
-  }
+template <typename T1, typename T2, typename T3>
+struct packet_conditional<GEBPPacketFull, T1, T2, T3> {
+  typedef T1 type;
+};
 
-  #define CJMADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
-  // #define CJMADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
-#endif
+template <typename T1, typename T2, typename T3>
+struct packet_conditional<GEBPPacketHalf, T1, T2, T3> {
+  typedef T2 type;
+};
+
+#define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size)                                               \
+  typedef typename packet_conditional<                                                                     \
+      packet_size, typename packet_traits<name##Scalar>::type, typename packet_traits<name##Scalar>::half, \
+      typename unpacket_traits<typename packet_traits<name##Scalar>::half>::half>::type name##Packet##postfix
+
+#define PACKET_DECL_COND(name, packet_size)                                                                \
+  typedef typename packet_conditional<                                                                     \
+      packet_size, typename packet_traits<name##Scalar>::type, typename packet_traits<name##Scalar>::half, \
+      typename unpacket_traits<typename packet_traits<name##Scalar>::half>::half>::type name##Packet
+
+#define PACKET_DECL_COND_SCALAR_POSTFIX(postfix, packet_size)                                  \
+  typedef typename packet_conditional<                                                         \
+      packet_size, typename packet_traits<Scalar>::type, typename packet_traits<Scalar>::half, \
+      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type ScalarPacket##postfix
+
+#define PACKET_DECL_COND_SCALAR(packet_size)                                                   \
+  typedef typename packet_conditional<                                                         \
+      packet_size, typename packet_traits<Scalar>::type, typename packet_traits<Scalar>::half, \
+      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type ScalarPacket
 
 /* Vectorization logic
  * real*real: unpack rhs to constant packets, ...
- *
+ *
 * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
 * storing each res packet into two packets (2x2),
- * at the end combine them: swap the second and addsub them
+ * at the end combine them: swap the second and addsub them
 * cf*cf : same but with 2x4 blocks
 * cplx*real : unpack rhs to constant packets, ...
 * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
 */
-template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
-class gebp_traits
-{
-public:
-  typedef _LhsScalar LhsScalar;
-  typedef _RhsScalar RhsScalar;
+template <typename LhsScalar_, typename RhsScalar_, bool ConjLhs_, bool ConjRhs_, int Arch, int PacketSize_>
+class gebp_traits {
+ public:
+  typedef LhsScalar_ LhsScalar;
+  typedef RhsScalar_ RhsScalar;
   typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
+  PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
+  PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
+  PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
+
   enum {
-    ConjLhs = _ConjLhs,
-    ConjRhs = _ConjRhs,
-    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
-    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
-
+    ConjLhs = ConjLhs_,
+    ConjRhs = ConjRhs_,
+    Vectorizable = unpacket_traits<LhsPacket_>::vectorizable && unpacket_traits<RhsPacket_>::vectorizable,
+    LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
+    ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,
+
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
 
     // register block size along the N direction must be 1 or 4
     nr = 4,
 
     // register block size along the M direction (currently, this one cannot be modified)
-    default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
-#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
-    // we assume 16 registers
+    default_mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * LhsPacketSize,
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && \
+    !defined(EIGEN_VECTORIZE_VSX) && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC >= 1914))
+    // we assume 16 registers or more
     // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
     // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
-    mr = Vectorizable ? 3*LhsPacketSize : default_mr,
+    // Bug 1515: MSVC prior to v19.14 yields to register spilling.
+    mr = Vectorizable ? 3 * LhsPacketSize : default_mr,
 #else
     mr = default_mr,
 #endif
-
+
     LhsProgress = LhsPacketSize,
     RhsProgress = 1
   };
 
-  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
-  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
-  typedef typename packet_traits<ResScalar>::type _ResPacket;
-
-  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
-  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
-  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
+  typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
+  typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
+  typedef LhsPacket LhsPacket4Packing;
 
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
   typedef ResPacket AccPacket;
-
-  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
-  {
-    p = pset1<ResPacket>(ResScalar(0));
-  }
-
-  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
-  {
-    pbroadcast4(b, b0, b1, b2, b3);
-  }
-
-  // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
-  // {
-  //   pbroadcast2(b, b0, b1);
-  // }
-
-  template<typename RhsPacketType>
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
-  {
+
+  EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }
+
+  template <typename RhsPacketType>
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const {
    dest = pset1<RhsPacketType>(*b);
   }
-
-  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
-  {
-    dest = ploadquad<RhsPacket>(b);
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
+    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+  }
+
+  template <typename RhsPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const {
+    loadRhs(b, dest);
   }
 
-  template<typename LhsPacketType>
-  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
-  {
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
+
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad<RhsPacket>(b); }
+
+  template <typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const {
    dest = pload<LhsPacketType>(a);
   }
 
-  template<typename LhsPacketType>
-  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
-  {
+  template <typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const {
    dest = ploadu<LhsPacketType>(a);
   }
 
-  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
-  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
-  {
-    conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
+                                const LaneIdType&) const {
+    conj_helper<LhsPacketType, RhsPacketType, ConjLhs, ConjRhs> cj;
    // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
    // let gcc allocate the register in which to store the result of the pmul
    // (in the case where there is no FMA) gcc fails to figure out how to avoid
    // spilling register.
 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
-    c = cj.pmadd(a,b,c);
+    c = cj.pmadd(a, b, c);
 #else
-    tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
+    tmp = b;
+    tmp = cj.pmul(a, tmp);
+    c = padd(c, tmp);
 #endif
   }
 
-  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
-  {
-    r = pmadd(c,alpha,r);
+  template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
+                                const LaneIdType& lane) const {
+    madd(a, b.get(lane), c, tmp, lane);
   }
-
-  template<typename ResPacketHalf>
-  EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
-  {
-    r = pmadd(c,alpha,r);
+
+  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const {
+    r = pmadd(c, alpha, r);
   }
461
503
 
504
+ template <typename ResPacketHalf>
505
+ EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const {
506
+ r = pmadd(c, alpha, r);
507
+ }
462
508
  };
463
509
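To make the register-blocking arithmetic above concrete: a minimal standalone sketch, assuming an AVX-like configuration (8-lane float packets, 16 SIMD registers, FMA available). The numbers are illustrative only and are not taken from the package.

    #include <algorithm>
    #include <cstdio>

    int main() {
      // Assumptions for illustration: AVX float => 8 lanes per packet,
      // 16 architectural SIMD registers, single-instruction madd available.
      const int NumberOfRegisters = 16;
      const int nr = 4;             // register block along the N direction
      const int LhsPacketSize = 8;  // packet lanes
      // default path: (min(16, regs) / 2 / nr) * LhsPacketSize = (16/2/4)*8 = 16
      const int default_mr = (std::min(16, NumberOfRegisters) / 2 / nr) * LhsPacketSize;
      // FMA path: three lhs packets per row step = 24 rows
      const int mr_fma = 3 * LhsPacketSize;
      std::printf("default_mr = %d, mr with FMA = %d\n", default_mr, mr_fma);
    }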
 
- template<typename RealScalar, bool _ConjLhs>
- class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
- {
- public:
+ template <typename RealScalar, bool ConjLhs_, int Arch, int PacketSize_>
+ class gebp_traits<std::complex<RealScalar>, RealScalar, ConjLhs_, false, Arch, PacketSize_> {
+ public:
  typedef std::complex<RealScalar> LhsScalar;
  typedef RealScalar RhsScalar;
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

+ PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
+ PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
+ PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
+
  enum {
- ConjLhs = _ConjLhs,
+ ConjLhs = ConjLhs_,
  ConjRhs = false,
- Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
- LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
- RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
- ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
-
+ Vectorizable = unpacket_traits<LhsPacket_>::vectorizable && unpacket_traits<RhsPacket_>::vectorizable,
+ LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
+ RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
+ ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,
+
  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
  nr = 4,
  #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
  // we assume 16 registers
- mr = 3*LhsPacketSize,
+ mr = 3 * LhsPacketSize,
  #else
- mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
+ mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * LhsPacketSize,
  #endif

  LhsProgress = LhsPacketSize,
  RhsProgress = 1
  };

- typedef typename packet_traits<LhsScalar>::type _LhsPacket;
- typedef typename packet_traits<RhsScalar>::type _RhsPacket;
- typedef typename packet_traits<ResScalar>::type _ResPacket;
+ typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
+ typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
+ typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
+ typedef LhsPacket LhsPacket4Packing;

- typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
- typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
- typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+ typedef QuadPacket<RhsPacket> RhsPacketx4;

  typedef ResPacket AccPacket;

- EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
- {
- p = pset1<ResPacket>(ResScalar(0));
+ EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }
+
+ template <typename RhsPacketType>
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const {
+ dest = pset1<RhsPacketType>(*b);
  }

- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
- {
- dest = pset1<RhsPacket>(*b);
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
+ pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
  }
-
- EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
- {
- dest = pset1<RhsPacket>(*b);
+
+ template <typename RhsPacketType>
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const {
+ loadRhs(b, dest);
+ }
+
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
+
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const {
+ loadRhsQuad_impl(b, dest, std::conditional_t<RhsPacketSize == 16, true_type, false_type>());
  }

- EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
- {
- dest = pload<LhsPacket>(a);
+ EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const {
+ // FIXME we can do better!
+ // what we want here is a ploadheight
+ RhsScalar tmp[4] = {b[0], b[0], b[1], b[1]};
+ dest = ploadquad<RhsPacket>(tmp);
  }

- EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
- {
- dest = ploadu<LhsPacket>(a);
+ EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const {
+ eigen_internal_assert(RhsPacketSize <= 8);
+ dest = pset1<RhsPacket>(*b);
  }

- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
- {
- pbroadcast4(b, b0, b1, b2, b3);
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = pload<LhsPacket>(a); }
+
+ template <typename LhsPacketType>
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const {
+ dest = ploadu<LhsPacketType>(a);
  }
-
- // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
- // {
- // pbroadcast2(b, b0, b1);
- // }

- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
- {
- madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
+ const LaneIdType&) const {
+ madd_impl(a, b, c, tmp, std::conditional_t<Vectorizable, true_type, false_type>());
  }

- EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
- {
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
+ EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c,
+ RhsPacketType& tmp, const true_type&) const {
  #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
  EIGEN_UNUSED_VARIABLE(tmp);
- c.v = pmadd(a.v,b,c.v);
+ c.v = pmadd(a.v, b, c.v);
  #else
- tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
+ tmp = b;
+ tmp = pmul(a.v, tmp);
+ c.v = padd(c.v, tmp);
  #endif
  }

- EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
- {
+ EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/,
+ const false_type&) const {
  c += a * b;
  }

- EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
- {
- r = cj.pmadd(c,alpha,r);
+ template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
+ const LaneIdType& lane) const {
+ madd(a, b.get(lane), c, tmp, lane);
+ }
+
+ template <typename ResPacketType, typename AccPacketType>
+ EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const {
+ conj_helper<ResPacketType, ResPacketType, ConjLhs, false> cj;
+ r = cj.pmadd(c, alpha, r);
  }

- protected:
- conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
+ protected:
  };
- template<typename Packet>
568
- struct DoublePacket
569
- {
631
+ template <typename Packet>
632
+ struct DoublePacket {
570
633
  Packet first;
571
634
  Packet second;
572
635
  };
573
636
 
574
- template<typename Packet>
575
- DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
576
- {
637
+ template <typename Packet>
638
+ DoublePacket<Packet> padd(const DoublePacket<Packet>& a, const DoublePacket<Packet>& b) {
577
639
  DoublePacket<Packet> res;
578
- res.first = padd(a.first, b.first);
579
- res.second = padd(a.second,b.second);
640
+ res.first = padd(a.first, b.first);
641
+ res.second = padd(a.second, b.second);
580
642
  return res;
581
643
  }
582
644
 
583
- template<typename Packet>
584
- const DoublePacket<Packet>& predux_downto4(const DoublePacket<Packet> &a)
585
- {
645
+ // note that for DoublePacket<RealPacket> the "4" in "downto4"
646
+ // corresponds to the number of complexes, so it means "8"
647
+ // it terms of real coefficients.
648
+
649
+ template <typename Packet>
650
+ const DoublePacket<Packet>& predux_half_dowto4(const DoublePacket<Packet>& a,
651
+ std::enable_if_t<unpacket_traits<Packet>::size <= 8>* = 0) {
586
652
  return a;
587
653
  }
588
654
 
589
- template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; };
655
+ template <typename Packet>
656
+ DoublePacket<typename unpacket_traits<Packet>::half> predux_half_dowto4(
657
+ const DoublePacket<Packet>& a, std::enable_if_t<unpacket_traits<Packet>::size == 16>* = 0) {
658
+ // yes, that's pretty hackish :(
659
+ DoublePacket<typename unpacket_traits<Packet>::half> res;
660
+ typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
661
+ typedef typename packet_traits<Cplx>::type CplxPacket;
662
+ res.first = predux_half_dowto4(CplxPacket(a.first)).v;
663
+ res.second = predux_half_dowto4(CplxPacket(a.second)).v;
664
+ return res;
665
+ }
666
+
667
+ // same here, "quad" actually means "8" in terms of real coefficients
668
+ template <typename Scalar, typename RealPacket>
669
+ void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
670
+ std::enable_if_t<unpacket_traits<RealPacket>::size <= 8>* = 0) {
671
+ dest.first = pset1<RealPacket>(numext::real(*b));
672
+ dest.second = pset1<RealPacket>(numext::imag(*b));
673
+ }
674
+
675
+ template <typename Scalar, typename RealPacket>
676
+ void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
677
+ std::enable_if_t<unpacket_traits<RealPacket>::size == 16>* = 0) {
678
+ // yes, that's pretty hackish too :(
679
+ typedef typename NumTraits<Scalar>::Real RealScalar;
680
+ RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
681
+ RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
682
+ dest.first = ploadquad<RealPacket>(r);
683
+ dest.second = ploadquad<RealPacket>(i);
684
+ }
685
+
686
+ template <typename Packet>
687
+ struct unpacket_traits<DoublePacket<Packet> > {
688
+ typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
689
+ enum { size = 2 * unpacket_traits<Packet>::size };
690
+ };
590
691
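A scalar model of what `DoublePacket` buys: the products against `real(b)` and `imag(b)` accumulate in separate halves (`first`/`second`) with no shuffles in the inner loop, and are only combined once at the end, as the complex `acc()` below does with `pcplxflip`/`pconj`. A hedged sketch using plain `std::complex` instead of Eigen packet types:

    #include <complex>
    #include <cstdio>

    // Model of c += a*b for complex a, b with split accumulators:
    // first accumulates a*real(b), second accumulates a*imag(b).
    int main() {
      std::complex<float> a(1, 2), b(3, 4);
      std::complex<float> first = a * b.real();   // (3, 6)
      std::complex<float> second = a * b.imag();  // (4, 8)
      // assemble: a*b = first + i*second, i.e. flip second to (-im, re) and add
      std::complex<float> res(first.real() - second.imag(), first.imag() + second.real());
      std::printf("(%g,%g) vs (%g,%g)\n", res.real(), res.imag(),
                  (a * b).real(), (a * b).imag());  // both print (-5,10)
    }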
  // template<typename Packet>
  // DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
  // {
@@ -596,356 +697,1023 @@ template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typede
  // return res;
  // }

- template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
- class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
- {
- public:
- typedef std::complex<RealScalar> Scalar;
- typedef std::complex<RealScalar> LhsScalar;
- typedef std::complex<RealScalar> RhsScalar;
- typedef std::complex<RealScalar> ResScalar;
-
+ template <typename RealScalar, bool ConjLhs_, bool ConjRhs_, int Arch, int PacketSize_>
+ class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, ConjLhs_, ConjRhs_, Arch, PacketSize_> {
+ public:
+ typedef std::complex<RealScalar> Scalar;
+ typedef std::complex<RealScalar> LhsScalar;
+ typedef std::complex<RealScalar> RhsScalar;
+ typedef std::complex<RealScalar> ResScalar;
+
+ PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
+ PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
+ PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
+ PACKET_DECL_COND(Real, PacketSize_);
+ PACKET_DECL_COND_SCALAR(PacketSize_);
+
  enum {
- ConjLhs = _ConjLhs,
- ConjRhs = _ConjRhs,
- Vectorizable = packet_traits<RealScalar>::Vectorizable
- && packet_traits<Scalar>::Vectorizable,
- RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1,
- ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
- LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
- RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+ ConjLhs = ConjLhs_,
+ ConjRhs = ConjRhs_,
+ Vectorizable = unpacket_traits<RealPacket>::vectorizable && unpacket_traits<ScalarPacket>::vectorizable,
+ ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,
+ LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
+ RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
+ RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
+ NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,

- // FIXME: should depend on NumberOfRegisters
  nr = 4,
- mr = ResPacketSize,
+ mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * ResPacketSize,

  LhsProgress = ResPacketSize,
  RhsProgress = 1
  };
-
- typedef typename packet_traits<RealScalar>::type RealPacket;
- typedef typename packet_traits<Scalar>::type ScalarPacket;
+
  typedef DoublePacket<RealPacket> DoublePacketType;

- typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket;
- typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
- typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
- typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
-
+ typedef std::conditional_t<Vectorizable, ScalarPacket, Scalar> LhsPacket4Packing;
+ typedef std::conditional_t<Vectorizable, RealPacket, Scalar> LhsPacket;
+ typedef std::conditional_t<Vectorizable, DoublePacketType, Scalar> RhsPacket;
+ typedef std::conditional_t<Vectorizable, ScalarPacket, Scalar> ResPacket;
+ typedef std::conditional_t<Vectorizable, DoublePacketType, Scalar> AccPacket;
+
+ // this actually holds 8 packets!
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
+
  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }

- EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
- {
- p.first = pset1<RealPacket>(RealScalar(0));
- p.second = pset1<RealPacket>(RealScalar(0));
+ EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p) {
+ p.first = pset1<RealPacket>(RealScalar(0));
+ p.second = pset1<RealPacket>(RealScalar(0));
  }

  // Scalar path
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
- {
- dest = pset1<ResPacket>(*b);
- }
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const { dest = pset1<ScalarPacket>(*b); }

  // Vectorized path
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
- {
- dest.first = pset1<RealPacket>(numext::real(*b));
- dest.second = pset1<RealPacket>(numext::imag(*b));
- }
-
- EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
- {
- loadRhs(b,dest);
- }
- EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
- {
- eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4);
- loadRhs(b,dest);
- }
-
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
- {
- // FIXME not sure that's the best way to implement it!
- loadRhs(b+0, b0);
- loadRhs(b+1, b1);
- loadRhs(b+2, b2);
- loadRhs(b+3, b3);
- }
-
- // Vectorized path
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
- {
- // FIXME not sure that's the best way to implement it!
- loadRhs(b+0, b0);
- loadRhs(b+1, b1);
+ template <typename RealPacketType>
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const {
+ dest.first = pset1<RealPacketType>(numext::real(*b));
+ dest.second = pset1<RealPacketType>(numext::imag(*b));
  }
-
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
+ loadRhs(b, dest.B_0);
+ loadRhs(b + 1, dest.B1);
+ loadRhs(b + 2, dest.B2);
+ loadRhs(b + 3, dest.B3);
+ }
+
  // Scalar path
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
- {
- // FIXME not sure that's the best way to implement it!
- loadRhs(b+0, b0);
- loadRhs(b+1, b1);
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const { loadRhs(b, dest); }
+
+ // Vectorized path
+ template <typename RealPacketType>
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const {
+ loadRhs(b, dest);
+ }
+
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
+
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const { loadRhs(b, dest); }
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const {
+ loadQuadToDoublePacket(b, dest);
  }

  // nothing special here
- EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
- {
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const {
  dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
  }

- EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
- {
- dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
+ template <typename LhsPacketType>
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const {
+ dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
  }

- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const
- {
- c.first = padd(pmul(a,b.first), c.first);
- c.second = padd(pmul(a,b.second),c.second);
+ template <typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType,
+ typename LaneIdType>
+ EIGEN_STRONG_INLINE std::enable_if_t<!is_same<RhsPacketType, RhsPacketx4>::value> madd(const LhsPacketType& a,
+ const RhsPacketType& b,
+ DoublePacket<ResPacketType>& c,
+ TmpType& /*tmp*/,
+ const LaneIdType&) const {
+ c.first = pmadd(a, b.first, c.first);
+ c.second = pmadd(a, b.second, c.second);
  }

- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
- {
- c = cj.pmadd(a,b,c);
+ template <typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/,
+ const LaneIdType&) const {
+ c = cj.pmadd(a, b, c);
  }
-
+
+ template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
+ const LaneIdType& lane) const {
+ madd(a, b.get(lane), c, tmp, lane);
+ }
+
  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
-
- EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
- {
+
+ template <typename RealPacketType, typename ResPacketType>
+ EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha,
+ ResPacketType& r) const {
  // assemble c
- ResPacket tmp;
- if((!ConjLhs)&&(!ConjRhs))
- {
- tmp = pcplxflip(pconj(ResPacket(c.second)));
- tmp = padd(ResPacket(c.first),tmp);
- }
- else if((!ConjLhs)&&(ConjRhs))
- {
- tmp = pconj(pcplxflip(ResPacket(c.second)));
- tmp = padd(ResPacket(c.first),tmp);
- }
- else if((ConjLhs)&&(!ConjRhs))
- {
- tmp = pcplxflip(ResPacket(c.second));
- tmp = padd(pconj(ResPacket(c.first)),tmp);
- }
- else if((ConjLhs)&&(ConjRhs))
- {
- tmp = pcplxflip(ResPacket(c.second));
- tmp = psub(pconj(ResPacket(c.first)),tmp);
+ ResPacketType tmp;
+ if ((!ConjLhs) && (!ConjRhs)) {
+ tmp = pcplxflip(pconj(ResPacketType(c.second)));
+ tmp = padd(ResPacketType(c.first), tmp);
+ } else if ((!ConjLhs) && (ConjRhs)) {
+ tmp = pconj(pcplxflip(ResPacketType(c.second)));
+ tmp = padd(ResPacketType(c.first), tmp);
+ } else if ((ConjLhs) && (!ConjRhs)) {
+ tmp = pcplxflip(ResPacketType(c.second));
+ tmp = padd(pconj(ResPacketType(c.first)), tmp);
+ } else if ((ConjLhs) && (ConjRhs)) {
+ tmp = pcplxflip(ResPacketType(c.second));
+ tmp = psub(pconj(ResPacketType(c.first)), tmp);
  }
-
- r = pmadd(tmp,alpha,r);
+
+ r = pmadd(tmp, alpha, r);
  }

- protected:
- conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
+ protected:
+ conj_helper<LhsScalar, RhsScalar, ConjLhs, ConjRhs> cj;
  };

- template<typename RealScalar, bool _ConjRhs>
- class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
- {
- public:
- typedef std::complex<RealScalar> Scalar;
- typedef RealScalar LhsScalar;
- typedef Scalar RhsScalar;
- typedef Scalar ResScalar;
+ template <typename RealScalar, bool ConjRhs_, int Arch, int PacketSize_>
+ class gebp_traits<RealScalar, std::complex<RealScalar>, false, ConjRhs_, Arch, PacketSize_> {
+ public:
+ typedef std::complex<RealScalar> Scalar;
+ typedef RealScalar LhsScalar;
+ typedef Scalar RhsScalar;
+ typedef Scalar ResScalar;
+
+ PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
+ PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
+ PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
+ PACKET_DECL_COND_POSTFIX(_, Real, PacketSize_);
+ PACKET_DECL_COND_SCALAR_POSTFIX(_, PacketSize_);
+
+ #undef PACKET_DECL_COND_SCALAR_POSTFIX
+ #undef PACKET_DECL_COND_POSTFIX
+ #undef PACKET_DECL_COND_SCALAR
+ #undef PACKET_DECL_COND

  enum {
  ConjLhs = false,
- ConjRhs = _ConjRhs,
- Vectorizable = packet_traits<RealScalar>::Vectorizable
- && packet_traits<Scalar>::Vectorizable,
- LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
- RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
- ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
-
+ ConjRhs = ConjRhs_,
+ Vectorizable = unpacket_traits<RealPacket_>::vectorizable && unpacket_traits<ScalarPacket_>::vectorizable,
+ LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
+ RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
+ ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,
+
  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
  // FIXME: should depend on NumberOfRegisters
  nr = 4,
- mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,
+ mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * ResPacketSize,

  LhsProgress = ResPacketSize,
  RhsProgress = 1
  };

- typedef typename packet_traits<LhsScalar>::type _LhsPacket;
- typedef typename packet_traits<RhsScalar>::type _RhsPacket;
- typedef typename packet_traits<ResScalar>::type _ResPacket;
-
- typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
- typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
- typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
-
+ typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
+ typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
+ typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
+ typedef LhsPacket LhsPacket4Packing;
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
  typedef ResPacket AccPacket;

- EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
- {
- p = pset1<ResPacket>(ResScalar(0));
- }
+ EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }

- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
- {
- dest = pset1<RhsPacket>(*b);
- }
-
- void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
- {
- pbroadcast4(b, b0, b1, b2, b3);
+ template <typename RhsPacketType>
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const {
+ dest = pset1<RhsPacketType>(*b);
  }
-
- // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
- // {
- // // FIXME not sure that's the best way to implement it!
- // b0 = pload1<RhsPacket>(b+0);
- // b1 = pload1<RhsPacket>(b+1);
- // }

- EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
- {
- dest = ploaddup<LhsPacket>(a);
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
+ pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
  }
-
- EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
- {
- eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4);
- loadRhs(b,dest);
+
+ template <typename RhsPacketType>
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const {
+ loadRhs(b, dest);
  }

- EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
- {
- dest = ploaddup<LhsPacket>(a);
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
+
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = ploaddup<LhsPacket>(a); }
+
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad<RhsPacket>(b); }
+
+ template <typename LhsPacketType>
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const {
+ dest = ploaddup<LhsPacketType>(a);
  }

- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
- {
- madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
+ const LaneIdType&) const {
+ madd_impl(a, b, c, tmp, std::conditional_t<Vectorizable, true_type, false_type>());
  }

- EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
- {
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
+ EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c,
+ RhsPacketType& tmp, const true_type&) const {
  #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
  EIGEN_UNUSED_VARIABLE(tmp);
- c.v = pmadd(a,b.v,c.v);
+ c.v = pmadd(a, b.v, c.v);
  #else
- tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
+ tmp = b;
+ tmp.v = pmul(a, tmp.v);
+ c = padd(c, tmp);
  #endif
-
  }

- EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
- {
+ EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/,
+ const false_type&) const {
  c += a * b;
  }

- EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
- {
- r = cj.pmadd(alpha,c,r);
+ template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
+ const LaneIdType& lane) const {
+ madd(a, b.get(lane), c, tmp, lane);
+ }
+
+ template <typename ResPacketType, typename AccPacketType>
+ EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const {
+ conj_helper<ResPacketType, ResPacketType, false, ConjRhs> cj;
+ r = cj.pmadd(alpha, c, r);
  }

- protected:
- conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
+ protected:
  };

- /* optimized GEneral packed Block * packed Panel product kernel
+ /* optimized General packed Block * packed Panel product kernel
  *
  * Mixing type logic: C += A * B
  * | A | B | comments
  * |real |cplx | no vectorization yet, would require to pack A with duplication
  * |cplx |real | easy vectorization
  */
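Regarding the mixing-type table above: the `|real |cplx |` row needs the lhs packed with duplication because each real coefficient must face both components of a complex rhs. This is the `ploaddup`-style idea used by the real-times-complex traits, sketched here in plain arrays (values are illustrative only):

    #include <complex>
    #include <cstdio>

    int main() {
      float lhs[2] = {2.0f, 3.0f};
      std::complex<float> rhs(5.0f, 7.0f);
      // duplicate each real lhs value, as ploaddup does, so one real lane
      // lines up with both the real and the imaginary lane of the rhs
      float dup[4] = {lhs[0], lhs[0], lhs[1], lhs[1]};
      float rhs_v[4] = {rhs.real(), rhs.imag(), rhs.real(), rhs.imag()};
      float acc[4] = {};
      for (int i = 0; i < 4; ++i) acc[i] += dup[i] * rhs_v[i];  // one packet madd
      std::printf("(%g,%g) (%g,%g)\n", acc[0], acc[1], acc[2], acc[3]);  // 2*rhs, 3*rhs
    }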
- template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
- struct gebp_kernel
- {
- typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
+ template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
+ bool ConjugateLhs, bool ConjugateRhs>
+ struct gebp_kernel {
+ typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target> Traits;
+ typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target, GEBPPacketHalf>
+ HalfTraits;
+ typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target, GEBPPacketQuarter>
+ QuarterTraits;
+
  typedef typename Traits::ResScalar ResScalar;
  typedef typename Traits::LhsPacket LhsPacket;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::ResPacket ResPacket;
  typedef typename Traits::AccPacket AccPacket;
+ typedef typename Traits::RhsPacketx4 RhsPacketx4;
+
+ typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
+ typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 27>::type RhsPanel27;
+
+ typedef gebp_traits<RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target> SwappedTraits;

- typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
  typedef typename SwappedTraits::ResScalar SResScalar;
  typedef typename SwappedTraits::LhsPacket SLhsPacket;
  typedef typename SwappedTraits::RhsPacket SRhsPacket;
  typedef typename SwappedTraits::ResPacket SResPacket;
  typedef typename SwappedTraits::AccPacket SAccPacket;

+ typedef typename HalfTraits::LhsPacket LhsPacketHalf;
+ typedef typename HalfTraits::RhsPacket RhsPacketHalf;
+ typedef typename HalfTraits::ResPacket ResPacketHalf;
+ typedef typename HalfTraits::AccPacket AccPacketHalf;
+
+ typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
+ typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
+ typedef typename QuarterTraits::ResPacket ResPacketQuarter;
+ typedef typename QuarterTraits::AccPacket AccPacketQuarter;
+
  typedef typename DataMapper::LinearMapper LinearMapper;

  enum {
- Vectorizable = Traits::Vectorizable,
- LhsProgress = Traits::LhsProgress,
- RhsProgress = Traits::RhsProgress,
+ Vectorizable = Traits::Vectorizable,
+ LhsProgress = Traits::LhsProgress,
+ LhsProgressHalf = HalfTraits::LhsProgress,
+ LhsProgressQuarter = QuarterTraits::LhsProgress,
+ RhsProgress = Traits::RhsProgress,
+ RhsProgressHalf = HalfTraits::RhsProgress,
+ RhsProgressQuarter = QuarterTraits::RhsProgress,
  ResPacketSize = Traits::ResPacketSize
  };

- EIGEN_DONT_INLINE
- void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
- Index rows, Index depth, Index cols, ResScalar alpha,
- Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+ EIGEN_DONT_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, Index rows,
+ Index depth, Index cols, ResScalar alpha, Index strideA = -1, Index strideB = -1,
+ Index offsetA = 0, Index offsetB = 0);
  };
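The Half/Quarter traits above exist so the row loop can step down through progressively narrower packets instead of falling straight back to scalars. A small standalone sketch of that cascade, with assumed widths 16/8/4 (e.g. AVX-512 floats); in the real kernel the widths come from `unpacket_traits` of the selected packet types:

    #include <cstdio>

    int main() {
      const int full = 16, half = 8, quarter = 4;  // assumed packet widths
      int rows = 29, i = 0;
      while (rows - i >= full)    { std::printf("full    step at row %d\n", i); i += full; }
      while (rows - i >= half)    { std::printf("half    step at row %d\n", i); i += half; }
      while (rows - i >= quarter) { std::printf("quarter step at row %d\n", i); i += quarter; }
      std::printf("%d scalar rows left\n", rows - i);  // 29 = 16 + 8 + 4 + 1
    }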
 
- template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
- EIGEN_DONT_INLINE
- void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
- ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
- Index rows, Index depth, Index cols, ResScalar alpha,
- Index strideA, Index strideB, Index offsetA, Index offsetB)
- {
- Traits traits;
- SwappedTraits straits;
-
- if(strideA==-1) strideA = depth;
- if(strideB==-1) strideB = depth;
- conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
- Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
- const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
- const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
- const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
- enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
- const Index peeled_kc = depth & ~(pk-1);
- const Index prefetch_res_offset = 32/sizeof(ResScalar);
- // const Index depth2 = depth & ~1;
-
- //---------- Process 3 * LhsProgress rows at once ----------
- // This corresponds to 3*LhsProgress x nr register blocks.
- // Usually, make sense only with FMA
- if(mr>=3*Traits::LhsProgress)
- {
- // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
- // and on each largest micro vertical panel of the rhs (depth * nr).
- // Blocking sizes, i.e., 'depth' has been computed so that the micro horizontal panel of the lhs fit in L1.
- // However, if depth is too small, we can extend the number of rows of these horizontal panels.
- // This actual number of rows is computed as follow:
- const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
- // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
- // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
- // or because we are testing specific blocking sizes.
- const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
- for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
- {
- const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
- for(Index j2=0; j2<packet_cols4; j2+=nr)
- {
- for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
- {
-
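The removed comment block above carries the L1-cache blocking rationale; evaluating its `actual_panel_rows` formula once makes it concrete. A hedged sketch with assumed inputs (32 KiB L1, float scalars, mr = 24, nr = 4, depth = 128) — the values are illustrative, not taken from the package:

    #include <algorithm>
    #include <cstdio>

    int main() {
      const long l1 = 32 * 1024;            // assumed L1 size in bytes
      const long mr = 24, nr = 4, depth = 128;
      const long sR = 4, sL = 4, sRhs = 4;  // sizeof(float) for res/lhs/rhs scalars
      const long LhsProgress3 = 24;         // 3*LhsProgress with 8-lane packets
      // rows such that the lhs micro panel plus a res block and a rhs panel fit in L1
      long rows = LhsProgress3 *
          std::max(1L, (l1 - sR * mr * nr - depth * nr * sRhs) / (depth * sL * LhsProgress3));
      std::printf("actual_panel_rows = %ld\n", rows);  // (32768-384-2048)/12288 = 2 -> 48
    }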
+ template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
+ bool ConjugateLhs, bool ConjugateRhs,
+ int SwappedLhsProgress =
+ gebp_traits<RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target>::LhsProgress>
+ struct last_row_process_16_packets {
+ typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target> Traits;
+ typedef gebp_traits<RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target> SwappedTraits;
+
+ typedef typename Traits::ResScalar ResScalar;
+ typedef typename SwappedTraits::LhsPacket SLhsPacket;
+ typedef typename SwappedTraits::RhsPacket SRhsPacket;
+ typedef typename SwappedTraits::ResPacket SResPacket;
+ typedef typename SwappedTraits::AccPacket SAccPacket;
+
+ EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits& straits, const LhsScalar* blA,
+ const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
+ ResScalar alpha, SAccPacket& C0) {
+ EIGEN_UNUSED_VARIABLE(res);
+ EIGEN_UNUSED_VARIABLE(straits);
+ EIGEN_UNUSED_VARIABLE(blA);
+ EIGEN_UNUSED_VARIABLE(blB);
+ EIGEN_UNUSED_VARIABLE(depth);
+ EIGEN_UNUSED_VARIABLE(endk);
+ EIGEN_UNUSED_VARIABLE(i);
+ EIGEN_UNUSED_VARIABLE(j2);
+ EIGEN_UNUSED_VARIABLE(alpha);
+ EIGEN_UNUSED_VARIABLE(C0);
+ }
+ };
+
+ template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
+ bool ConjugateLhs, bool ConjugateRhs>
+ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> {
+ typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target> Traits;
+ typedef gebp_traits<RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target> SwappedTraits;
+
+ typedef typename Traits::ResScalar ResScalar;
+ typedef typename SwappedTraits::LhsPacket SLhsPacket;
+ typedef typename SwappedTraits::RhsPacket SRhsPacket;
+ typedef typename SwappedTraits::ResPacket SResPacket;
+ typedef typename SwappedTraits::AccPacket SAccPacket;
+
+ EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits& straits, const LhsScalar* blA,
+ const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
+ ResScalar alpha, SAccPacket& C0) {
+ typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
+ typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
+ typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
+ typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter;
+
+ SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
+ SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);
+
+ if (depth - endk > 0) {
+ // We have to handle the last row(s) of the rhs, which
+ // correspond to a half-packet
+ SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));
+
+ for (Index kk = endk; kk < depth; kk++) {
+ SLhsPacketQuarter a0;
+ SRhsPacketQuarter b0;
+ straits.loadLhsUnaligned(blB, a0);
+ straits.loadRhs(blA, b0);
+ straits.madd(a0, b0, c0, b0, fix<0>);
+ blB += SwappedTraits::LhsProgress / 4;
+ blA += 1;
+ }
+ straits.acc(c0, alphav, R);
+ } else {
+ straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
+ }
+ res.scatterPacket(i, j2, R);
+ }
+ };
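`predux_half_dowto4` is applied twice above to fold a 16-lane swapped accumulator down to 4 lanes before the scalar tail runs. A scalar model of that double fold, assuming the usual upper-half-onto-lower-half reduction layout:

    #include <cstdio>

    int main() {
      float c[16];
      for (int i = 0; i < 16; ++i) c[i] = 1.0f + i;  // 16-lane accumulator
      float h[8], q[4];
      for (int i = 0; i < 8; ++i) h[i] = c[i] + c[i + 8];  // 16 -> 8
      for (int i = 0; i < 4; ++i) q[i] = h[i] + h[i + 4];  // 8 -> 4
      std::printf("%g %g %g %g\n", q[0], q[1], q[2], q[3]);  // folded lane sums
    }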
+
1089
+ template <int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
1090
+ typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits,
1091
+ typename LinearMapper, typename DataMapper>
1092
+ struct lhs_process_one_packet {
1093
+ typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
1094
+
1095
+ EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits,
1096
+ LhsPacket* A0, RhsPacketx4* rhs_panel, RhsPacket* T0, AccPacket* C0,
1097
+ AccPacket* C1, AccPacket* C2, AccPacket* C3) {
1098
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
1099
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
1100
+ traits.loadLhs(&blA[(0 + 1 * K) * LhsProgress], *A0);
1101
+ traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], *rhs_panel);
1102
+ traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
1103
+ traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
1104
+ traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
1105
+ traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
1106
+ #if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
1107
+ __asm__("" : "+x,m"(*A0));
1108
+ #endif
1109
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
1110
+ }
1111
+
1112
+ EIGEN_STRONG_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
1113
+ ResScalar alpha, Index peelStart, Index peelEnd, Index strideA, Index strideB,
1114
+ Index offsetA, Index offsetB, int prefetch_res_offset, Index peeled_kc, Index pk,
1115
+ Index cols, Index depth, Index packet_cols4) {
1116
+ GEBPTraits traits;
1117
+ Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
1118
+ // loops on each largest micro horizontal panel of lhs
1119
+ // (LhsProgress x depth)
1120
+ for (Index i = peelStart; i < peelEnd; i += LhsProgress) {
1121
+ #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
1122
+ EIGEN_IF_CONSTEXPR(nr >= 8) {
1123
+ for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
1124
+ const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
1125
+ prefetch(&blA[0]);
1126
+
1127
+ // gets res block as register
1128
+ AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
1129
+ traits.initAcc(C0);
1130
+ traits.initAcc(C1);
1131
+ traits.initAcc(C2);
1132
+ traits.initAcc(C3);
1133
+ traits.initAcc(C4);
1134
+ traits.initAcc(C5);
1135
+ traits.initAcc(C6);
1136
+ traits.initAcc(C7);
1137
+
1138
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1139
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1140
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1141
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1142
+ LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
1143
+ LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
1144
+ LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
1145
+ LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
1146
+ r0.prefetch(prefetch_res_offset);
1147
+ r1.prefetch(prefetch_res_offset);
1148
+ r2.prefetch(prefetch_res_offset);
1149
+ r3.prefetch(prefetch_res_offset);
1150
+ r4.prefetch(prefetch_res_offset);
1151
+ r5.prefetch(prefetch_res_offset);
1152
+ r6.prefetch(prefetch_res_offset);
1153
+ r7.prefetch(prefetch_res_offset);
1154
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
1155
+ prefetch(&blB[0]);
1156
+
1157
+ LhsPacket A0;
1158
+ for (Index k = 0; k < peeled_kc; k += pk) {
1159
+ RhsPacketx4 rhs_panel;
1160
+ RhsPacket T0;
1161
+ #define EIGEN_GEBGP_ONESTEP(K) \
1162
+ do { \
1163
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX8"); \
1164
+ traits.loadLhs(&blA[(0 + 1 * K) * LhsProgress], A0); \
1165
+ traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel); \
1166
+ traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1167
+ traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel); \
1168
+ traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1169
+ traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel); \
1170
+ traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1171
+ traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel); \
1172
+ traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1173
+ traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel); \
1174
+ traits.madd(A0, rhs_panel, C4, T0, fix<0>); \
1175
+ traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel); \
1176
+ traits.madd(A0, rhs_panel, C5, T0, fix<1>); \
1177
+ traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel); \
1178
+ traits.madd(A0, rhs_panel, C6, T0, fix<2>); \
1179
+ traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel); \
1180
+ traits.madd(A0, rhs_panel, C7, T0, fix<3>); \
1181
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX8"); \
1182
+ } while (false)
1183
+
1184
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX8");
1185
+
1186
+ EIGEN_GEBGP_ONESTEP(0);
1187
+ EIGEN_GEBGP_ONESTEP(1);
1188
+ EIGEN_GEBGP_ONESTEP(2);
1189
+ EIGEN_GEBGP_ONESTEP(3);
1190
+ EIGEN_GEBGP_ONESTEP(4);
1191
+ EIGEN_GEBGP_ONESTEP(5);
1192
+ EIGEN_GEBGP_ONESTEP(6);
1193
+ EIGEN_GEBGP_ONESTEP(7);
1194
+
1195
+ blB += pk * 8 * RhsProgress;
1196
+ blA += pk * (1 * LhsProgress);
1197
+
1198
+ EIGEN_ASM_COMMENT("end gebp micro kernel 1pX8");
1199
+ }
1200
+ // process remaining peeled loop
1201
+ for (Index k = peeled_kc; k < depth; k++) {
1202
+ RhsPacketx4 rhs_panel;
1203
+ RhsPacket T0;
1204
+ EIGEN_GEBGP_ONESTEP(0);
1205
+ blB += 8 * RhsProgress;
1206
+ blA += 1 * LhsProgress;
1207
+ }
1208
+
1209
+ #undef EIGEN_GEBGP_ONESTEP
1210
+
1211
+ ResPacket R0, R1;
1212
+ ResPacket alphav = pset1<ResPacket>(alpha);
1213
+
1214
+ R0 = r0.template loadPacket<ResPacket>(0);
1215
+ R1 = r1.template loadPacket<ResPacket>(0);
1216
+ traits.acc(C0, alphav, R0);
1217
+ traits.acc(C1, alphav, R1);
1218
+ r0.storePacket(0, R0);
1219
+ r1.storePacket(0, R1);
1220
+
1221
+ R0 = r2.template loadPacket<ResPacket>(0);
1222
+ R1 = r3.template loadPacket<ResPacket>(0);
1223
+ traits.acc(C2, alphav, R0);
1224
+ traits.acc(C3, alphav, R1);
1225
+ r2.storePacket(0, R0);
1226
+ r3.storePacket(0, R1);
1227
+
1228
+ R0 = r4.template loadPacket<ResPacket>(0);
1229
+ R1 = r5.template loadPacket<ResPacket>(0);
1230
+ traits.acc(C4, alphav, R0);
1231
+ traits.acc(C5, alphav, R1);
1232
+ r4.storePacket(0, R0);
1233
+ r5.storePacket(0, R1);
1234
+
1235
+ R0 = r6.template loadPacket<ResPacket>(0);
1236
+ R1 = r7.template loadPacket<ResPacket>(0);
1237
+ traits.acc(C6, alphav, R0);
1238
+ traits.acc(C7, alphav, R1);
1239
+ r6.storePacket(0, R0);
1240
+ r7.storePacket(0, R1);
1241
+ }
1242
+ }
1243
+ #endif
1244
+
1245
+ // loops on each largest micro vertical panel of rhs (depth * nr)
1246
+ for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
1247
+ // We select a LhsProgress x nr micro block of res
1248
+ // which is entirely stored into 1 x nr registers.
1249
+
1250
+ const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
1251
+ prefetch(&blA[0]);
1252
+
1253
+ // gets res block as register
1254
+ AccPacket C0, C1, C2, C3;
1255
+ traits.initAcc(C0);
1256
+ traits.initAcc(C1);
1257
+ traits.initAcc(C2);
1258
+ traits.initAcc(C3);
1259
+ // To improve instruction pipelining, let's double the accumulation registers:
1260
+ // even k will accumulate in C*, while odd k will accumulate in D*.
1261
+ // This trick is crucial to get good performance with FMA, otherwise it is
1262
+ // actually faster to perform separated MUL+ADD because of a naturally
1263
+ // better instruction-level parallelism.
1264
+ AccPacket D0, D1, D2, D3;
1265
+ traits.initAcc(D0);
1266
+ traits.initAcc(D1);
1267
+ traits.initAcc(D2);
1268
+ traits.initAcc(D3);
1269
+
1270
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1271
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1272
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1273
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1274
+
1275
+ r0.prefetch(prefetch_res_offset);
1276
+ r1.prefetch(prefetch_res_offset);
1277
+ r2.prefetch(prefetch_res_offset);
1278
+ r3.prefetch(prefetch_res_offset);
1279
+
1280
+ // performs "inner" products
1281
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
1282
+ prefetch(&blB[0]);
1283
+ LhsPacket A0, A1;
1284
+
1285
+ for (Index k = 0; k < peeled_kc; k += pk) {
1286
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
1287
+ RhsPacketx4 rhs_panel;
1288
+ RhsPacket T0;
1289
+
1290
+ internal::prefetch(blB + (48 + 0));
1291
+ peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+ peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+ peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+ peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+ internal::prefetch(blB + (48 + 16));
+ peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+ peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+ peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+ peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+
+ blB += pk * 4 * RhsProgress;
+ blA += pk * LhsProgress;
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
+ }
+ C0 = padd(C0, D0);
+ C1 = padd(C1, D1);
+ C2 = padd(C2, D2);
+ C3 = padd(C3, D3);
+
+ // process remaining peeled loop
+ for (Index k = peeled_kc; k < depth; k++) {
+ RhsPacketx4 rhs_panel;
+ RhsPacket T0;
+ peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+ blB += 4 * RhsProgress;
+ blA += LhsProgress;
+ }
+
+ ResPacket R0, R1;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+
+ R0 = r0.template loadPacket<ResPacket>(0);
+ R1 = r1.template loadPacket<ResPacket>(0);
+ traits.acc(C0, alphav, R0);
+ traits.acc(C1, alphav, R1);
+ r0.storePacket(0, R0);
+ r1.storePacket(0, R1);
+
+ R0 = r2.template loadPacket<ResPacket>(0);
+ R1 = r3.template loadPacket<ResPacket>(0);
+ traits.acc(C2, alphav, R0);
+ traits.acc(C3, alphav, R1);
+ r2.storePacket(0, R0);
+ r3.storePacket(0, R1);
+ }
+
+ // Deal with remaining columns of the rhs
+ for (Index j2 = packet_cols4; j2 < cols; j2++) {
+ // One column at a time
+ const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
+ prefetch(&blA[0]);
+
+ // gets res block as register
+ AccPacket C0;
+ traits.initAcc(C0);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2);
+
+ // performs "inner" products
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
+ LhsPacket A0;
+
+ for (Index k = 0; k < peeled_kc; k += pk) {
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
+ RhsPacket B_0;
+
+ #define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ /* FIXME: why unaligned???? */ \
+ traits.loadLhsUnaligned(&blA[(0 + 1 * K) * LhsProgress], A0); \
+ traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
+ traits.madd(A0, B_0, C0, B_0, fix<0>); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
+ } while (false);
+
+ EIGEN_GEBGP_ONESTEP(0);
+ EIGEN_GEBGP_ONESTEP(1);
+ EIGEN_GEBGP_ONESTEP(2);
+ EIGEN_GEBGP_ONESTEP(3);
+ EIGEN_GEBGP_ONESTEP(4);
+ EIGEN_GEBGP_ONESTEP(5);
+ EIGEN_GEBGP_ONESTEP(6);
+ EIGEN_GEBGP_ONESTEP(7);
+
+ blB += pk * RhsProgress;
+ blA += pk * LhsProgress;
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
+ }
+
+ // process remaining peeled loop
+ for (Index k = peeled_kc; k < depth; k++) {
+ RhsPacket B_0;
+ EIGEN_GEBGP_ONESTEP(0);
+ blB += RhsProgress;
+ blA += LhsProgress;
+ }
+ #undef EIGEN_GEBGP_ONESTEP
+ ResPacket R0;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+ R0 = r0.template loadPacket<ResPacket>(0);
+ traits.acc(C0, alphav, R0);
+ r0.storePacket(0, R0);
+ }
+ }
+ }
+ };
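
For reference, the k-loop above is peeled by pk = 8 with two independent accumulator sets (C0..C3 and D0..D3, combined by padd at the end) so that consecutive fused multiply-adds do not stall on each other, and a short remainder loop handles depth % pk. A minimal, self-contained C++ sketch of that pattern, with illustrative names that are not Eigen API:

#include <cstddef>

double dot_peeled(const double* a, const double* b, std::size_t depth) {
  constexpr std::size_t pk = 8;                  // peeling factor, as in the kernel
  const std::size_t peeled = depth & ~(pk - 1);  // depth rounded down to a multiple of pk
  double acc0 = 0.0, acc1 = 0.0;                 // two chains, like the C* / D* accumulators
  std::size_t k = 0;
  for (; k < peeled; k += pk) {
    acc0 += a[k + 0] * b[k + 0];
    acc1 += a[k + 1] * b[k + 1];
    acc0 += a[k + 2] * b[k + 2];
    acc1 += a[k + 3] * b[k + 3];
    acc0 += a[k + 4] * b[k + 4];
    acc1 += a[k + 5] * b[k + 5];
    acc0 += a[k + 6] * b[k + 6];
    acc1 += a[k + 7] * b[k + 7];
  }
  for (; k < depth; ++k) acc0 += a[k] * b[k];  // "process remaining peeled loop"
  return acc0 + acc1;                          // combine chains, like C = padd(C, D)
}
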
+
+ template <int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
+ typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits,
+ typename LinearMapper, typename DataMapper>
+ struct lhs_process_fraction_of_packet
+ : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket,
+ RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper> {
+ EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits,
+ LhsPacket* A0, RhsPacket* B_0, RhsPacket* B1, RhsPacket* B2, RhsPacket* B3,
+ AccPacket* C0, AccPacket* C1, AccPacket* C2, AccPacket* C3) {
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
+ traits.loadLhsUnaligned(&blA[(0 + 1 * K) * (LhsProgress)], *A0);
+ traits.broadcastRhs(&blB[(0 + 4 * K) * RhsProgress], *B_0, *B1, *B2, *B3);
+ traits.madd(*A0, *B_0, *C0, *B_0);
+ traits.madd(*A0, *B1, *C1, *B1);
+ traits.madd(*A0, *B2, *C2, *B2);
+ traits.madd(*A0, *B3, *C3, *B3);
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
+ }
+ };
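
The 1X4 step above multiplies one lhs packet by four rhs scalars, each broadcast across a packet, updating four independent accumulators (one per output column). A short illustrative sketch of that micro-step, with a plain array standing in for the SIMD packet; all names below are made up, not Eigen code:

#include <array>

using Packet = std::array<float, 4>;  // stands in for LhsPacket / AccPacket

inline void madd(const Packet& a, float b, Packet& c) {
  for (int i = 0; i < 4; ++i) c[i] += a[i] * b;  // lane-wise multiply-add
}

// One K-step of a 1x4 kernel: A0 is a column slice of the packed lhs,
// blB points at 4 consecutive rhs values (one per output column).
inline void onestep_1x4(const Packet& A0, const float* blB,
                        Packet& C0, Packet& C1, Packet& C2, Packet& C3) {
  madd(A0, blB[0], C0);  // broadcastRhs + madd, column 0
  madd(A0, blB[1], C1);  // column 1
  madd(A0, blB[2], C2);  // column 2
  madd(A0, blB[3], C3);  // column 3
}
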
+
+ template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
+ bool ConjugateLhs, bool ConjugateRhs>
+ EIGEN_DONT_INLINE void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs,
+ ConjugateRhs>::operator()(const DataMapper& res, const LhsScalar* blockA,
+ const RhsScalar* blockB, Index rows, Index depth,
+ Index cols, ResScalar alpha, Index strideA, Index strideB,
+ Index offsetA, Index offsetB) {
+ Traits traits;
+ SwappedTraits straits;
+
+ if (strideA == -1) strideA = depth;
+ if (strideB == -1) strideB = depth;
+ conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
+ Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
+ Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
+ const Index peeled_mc3 = mr >= 3 * Traits::LhsProgress ? (rows / (3 * LhsProgress)) * (3 * LhsProgress) : 0;
+ const Index peeled_mc2 =
+ mr >= 2 * Traits::LhsProgress ? peeled_mc3 + ((rows - peeled_mc3) / (2 * LhsProgress)) * (2 * LhsProgress) : 0;
+ const Index peeled_mc1 =
+ mr >= 1 * Traits::LhsProgress ? peeled_mc2 + ((rows - peeled_mc2) / (1 * LhsProgress)) * (1 * LhsProgress) : 0;
+ const Index peeled_mc_half =
+ mr >= LhsProgressHalf ? peeled_mc1 + ((rows - peeled_mc1) / (LhsProgressHalf)) * (LhsProgressHalf) : 0;
+ const Index peeled_mc_quarter =
+ mr >= LhsProgressQuarter
+ ? peeled_mc_half + ((rows - peeled_mc_half) / (LhsProgressQuarter)) * (LhsProgressQuarter)
+ : 0;
+ enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
+ const Index peeled_kc = depth & ~(pk - 1);
+ const int prefetch_res_offset = 32 / sizeof(ResScalar);
+ // const Index depth2 = depth & ~1;
+
+ //---------- Process 3 * LhsProgress rows at once ----------
+ // This corresponds to 3*LhsProgress x nr register blocks.
+ // Usually, makes sense only with FMA
+ if (mr >= 3 * Traits::LhsProgress) {
+ // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x
+ // depth) and on each largest micro vertical panel of the rhs (depth * nr). Blocking sizes, i.e., 'depth' has been
+ // computed so that the micro horizontal panel of the lhs fits in L1. However, if depth is too small, we can extend
+ // the number of rows of these horizontal panels. This actual number of rows is computed as follows:
+ const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
+ // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
+ // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only
+ // guess), or because we are testing specific blocking sizes.
+ const Index actual_panel_rows =
+ (3 * LhsProgress) * std::max<Index>(1, ((l1 - sizeof(ResScalar) * mr * nr - depth * nr * sizeof(RhsScalar)) /
+ (depth * sizeof(LhsScalar) * 3 * LhsProgress)));
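
To make the formula above concrete: whatever L1 budget remains after the result block and the rhs panel is divided by the footprint of one 3*LhsProgress-row slice of the packed lhs. A small worked sketch with made-up but plausible numbers (illustrative only, not part of the diff):

#include <algorithm>
#include <cstdio>

int main() {
  const long l1 = 32 * 1024;    // assumed 32 KiB L1 data cache
  const long mr = 24, nr = 4;   // register block, e.g. 3*8 x 4 floats with AVX
  const long depth = 64;        // k-blocking chosen elsewhere
  const long sizeofScalar = 4;  // float
  const long lhs_progress = 8;  // one AVX float packet

  const long budget = l1 - sizeofScalar * mr * nr - depth * nr * sizeofScalar;
  const long panel_rows =
      (3 * lhs_progress) *
      std::max<long>(1, budget / (depth * sizeofScalar * 3 * lhs_progress));
  std::printf("actual_panel_rows = %ld\n", panel_rows);  // 120 with these numbers
  return 0;
}
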
+ for (Index i1 = 0; i1 < peeled_mc3; i1 += actual_panel_rows) {
+ const Index actual_panel_end = (std::min)(i1 + actual_panel_rows, peeled_mc3);
+ #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
+ EIGEN_IF_CONSTEXPR(nr >= 8) {
+ for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
+ for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
+ const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * LhsProgress)];
+ prefetch(&blA[0]);
+ // gets res block as register
+ AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11, C12, C13, C14, C15, C16, C17, C18, C19, C20,
+ C21, C22, C23;
+ traits.initAcc(C0);
+ traits.initAcc(C1);
+ traits.initAcc(C2);
+ traits.initAcc(C3);
+ traits.initAcc(C4);
+ traits.initAcc(C5);
+ traits.initAcc(C6);
+ traits.initAcc(C7);
+ traits.initAcc(C8);
+ traits.initAcc(C9);
+ traits.initAcc(C10);
+ traits.initAcc(C11);
+ traits.initAcc(C12);
+ traits.initAcc(C13);
+ traits.initAcc(C14);
+ traits.initAcc(C15);
+ traits.initAcc(C16);
+ traits.initAcc(C17);
+ traits.initAcc(C18);
+ traits.initAcc(C19);
+ traits.initAcc(C20);
+ traits.initAcc(C21);
+ traits.initAcc(C22);
+ traits.initAcc(C23);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+ LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
+ LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
+ LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
+ LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
+
+ r0.prefetch(0);
+ r1.prefetch(0);
+ r2.prefetch(0);
+ r3.prefetch(0);
+ r4.prefetch(0);
+ r5.prefetch(0);
+ r6.prefetch(0);
+ r7.prefetch(0);
+
+ // performs "inner" products
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
+ prefetch(&blB[0]);
+ LhsPacket A0, A1;
+ for (Index k = 0; k < peeled_kc; k += pk) {
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX8");
+ // 27 registers are taken (24 for acc, 3 for lhs).
+ RhsPanel27 rhs_panel;
+ RhsPacket T0;
+ LhsPacket A2;
+ #if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
+ // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
+ // without this workaround A0, A1, and A2 are loaded in the same register,
+ // which is not good for pipelining
+ #define EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND __asm__("" : "+w,m"(A0), "+w,m"(A1), "+w,m"(A2));
+ #else
+ #define EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND
+ #endif
+
+ #define EIGEN_GEBP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX8"); \
+ traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
+ traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
+ traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
+ EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND traits.loadRhs(blB + (0 + 8 * K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
+ traits.madd(A1, rhs_panel, C8, T0, fix<0>); \
+ traits.madd(A2, rhs_panel, C16, T0, fix<0>); \
+ traits.updateRhs(blB + (1 + 8 * K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
+ traits.madd(A1, rhs_panel, C9, T0, fix<1>); \
+ traits.madd(A2, rhs_panel, C17, T0, fix<1>); \
+ traits.updateRhs(blB + (2 + 8 * K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
+ traits.madd(A1, rhs_panel, C10, T0, fix<2>); \
+ traits.madd(A2, rhs_panel, C18, T0, fix<2>); \
+ traits.updateRhs(blB + (3 + 8 * K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
+ traits.madd(A1, rhs_panel, C11, T0, fix<3>); \
+ traits.madd(A2, rhs_panel, C19, T0, fix<3>); \
+ traits.loadRhs(blB + (4 + 8 * K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C4, T0, fix<0>); \
+ traits.madd(A1, rhs_panel, C12, T0, fix<0>); \
+ traits.madd(A2, rhs_panel, C20, T0, fix<0>); \
+ traits.updateRhs(blB + (5 + 8 * K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C5, T0, fix<1>); \
+ traits.madd(A1, rhs_panel, C13, T0, fix<1>); \
+ traits.madd(A2, rhs_panel, C21, T0, fix<1>); \
+ traits.updateRhs(blB + (6 + 8 * K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C6, T0, fix<2>); \
+ traits.madd(A1, rhs_panel, C14, T0, fix<2>); \
+ traits.madd(A2, rhs_panel, C22, T0, fix<2>); \
+ traits.updateRhs(blB + (7 + 8 * K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C7, T0, fix<3>); \
+ traits.madd(A1, rhs_panel, C15, T0, fix<3>); \
+ traits.madd(A2, rhs_panel, C23, T0, fix<3>); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX8"); \
+ } while (false)
+
+ EIGEN_GEBP_ONESTEP(0);
+ EIGEN_GEBP_ONESTEP(1);
+ EIGEN_GEBP_ONESTEP(2);
+ EIGEN_GEBP_ONESTEP(3);
+ EIGEN_GEBP_ONESTEP(4);
+ EIGEN_GEBP_ONESTEP(5);
+ EIGEN_GEBP_ONESTEP(6);
+ EIGEN_GEBP_ONESTEP(7);
+
+ blB += pk * 8 * RhsProgress;
+ blA += pk * 3 * Traits::LhsProgress;
+ EIGEN_ASM_COMMENT("end gebp micro kernel 3pX8");
+ }
+
+ // process remaining peeled loop
+ for (Index k = peeled_kc; k < depth; k++) {
+ RhsPanel27 rhs_panel;
+ RhsPacket T0;
+ LhsPacket A2;
+ EIGEN_GEBP_ONESTEP(0);
+ blB += 8 * RhsProgress;
+ blA += 3 * Traits::LhsProgress;
+ }
+
+ #undef EIGEN_GEBP_ONESTEP
+
+ ResPacket R0, R1, R2;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+ traits.acc(C0, alphav, R0);
+ traits.acc(C8, alphav, R1);
+ traits.acc(C16, alphav, R2);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
+ r0.storePacket(2 * Traits::ResPacketSize, R2);
+
+ R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+ traits.acc(C1, alphav, R0);
+ traits.acc(C9, alphav, R1);
+ traits.acc(C17, alphav, R2);
+ r1.storePacket(0 * Traits::ResPacketSize, R0);
+ r1.storePacket(1 * Traits::ResPacketSize, R1);
+ r1.storePacket(2 * Traits::ResPacketSize, R2);
+
+ R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+ traits.acc(C2, alphav, R0);
+ traits.acc(C10, alphav, R1);
+ traits.acc(C18, alphav, R2);
+ r2.storePacket(0 * Traits::ResPacketSize, R0);
+ r2.storePacket(1 * Traits::ResPacketSize, R1);
+ r2.storePacket(2 * Traits::ResPacketSize, R2);
+
+ R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+ traits.acc(C3, alphav, R0);
+ traits.acc(C11, alphav, R1);
+ traits.acc(C19, alphav, R2);
+ r3.storePacket(0 * Traits::ResPacketSize, R0);
+ r3.storePacket(1 * Traits::ResPacketSize, R1);
+ r3.storePacket(2 * Traits::ResPacketSize, R2);
+
+ R0 = r4.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r4.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r4.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+ traits.acc(C4, alphav, R0);
+ traits.acc(C12, alphav, R1);
+ traits.acc(C20, alphav, R2);
+ r4.storePacket(0 * Traits::ResPacketSize, R0);
+ r4.storePacket(1 * Traits::ResPacketSize, R1);
+ r4.storePacket(2 * Traits::ResPacketSize, R2);
+
+ R0 = r5.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r5.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r5.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+ traits.acc(C5, alphav, R0);
+ traits.acc(C13, alphav, R1);
+ traits.acc(C21, alphav, R2);
+ r5.storePacket(0 * Traits::ResPacketSize, R0);
+ r5.storePacket(1 * Traits::ResPacketSize, R1);
+ r5.storePacket(2 * Traits::ResPacketSize, R2);
+
+ R0 = r6.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r6.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r6.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+ traits.acc(C6, alphav, R0);
+ traits.acc(C14, alphav, R1);
+ traits.acc(C22, alphav, R2);
+ r6.storePacket(0 * Traits::ResPacketSize, R0);
+ r6.storePacket(1 * Traits::ResPacketSize, R1);
+ r6.storePacket(2 * Traits::ResPacketSize, R2);
+
+ R0 = r7.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r7.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r7.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+ traits.acc(C7, alphav, R0);
+ traits.acc(C15, alphav, R1);
+ traits.acc(C23, alphav, R2);
+ r7.storePacket(0 * Traits::ResPacketSize, R0);
+ r7.storePacket(1 * Traits::ResPacketSize, R1);
+ r7.storePacket(2 * Traits::ResPacketSize, R2);
+ }
+ }
+ }
+ #endif
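
The 3pX8 path above is a register-blocking scheme: all 24 accumulator packets plus the 3 lhs packets (27 of the 32 NEON registers, per the comment in the diff) live in locals for the whole k-loop, and each result entry is read and written exactly once at the end. A scalar 3x4 sketch of the same idea, assuming column-major C with leading dimension ldc and GEBP-style packed inputs; illustrative only:

void gebp_3x4_sketch(const float* A, const float* B, float* C, int ldc,
                     int depth, float alpha) {
  // 12 accumulators, meant to stay in registers across the whole loop.
  float c00 = 0, c01 = 0, c02 = 0, c03 = 0;
  float c10 = 0, c11 = 0, c12 = 0, c13 = 0;
  float c20 = 0, c21 = 0, c22 = 0, c23 = 0;
  for (int k = 0; k < depth; ++k) {
    const float a0 = A[3 * k + 0], a1 = A[3 * k + 1], a2 = A[3 * k + 2];  // packed lhs: 3 rows per k
    const float b0 = B[4 * k + 0], b1 = B[4 * k + 1];                     // packed rhs: 4 cols per k
    const float b2 = B[4 * k + 2], b3 = B[4 * k + 3];
    c00 += a0 * b0; c01 += a0 * b1; c02 += a0 * b2; c03 += a0 * b3;
    c10 += a1 * b0; c11 += a1 * b1; c12 += a1 * b2; c13 += a1 * b3;
    c20 += a2 * b0; c21 += a2 * b1; c22 += a2 * b2; c23 += a2 * b3;
  }
  // Single read-modify-write per result entry, like the acc/storePacket tail.
  C[0 * ldc + 0] += alpha * c00; C[0 * ldc + 1] += alpha * c10; C[0 * ldc + 2] += alpha * c20;
  C[1 * ldc + 0] += alpha * c01; C[1 * ldc + 1] += alpha * c11; C[1 * ldc + 2] += alpha * c21;
  C[2 * ldc + 0] += alpha * c02; C[2 * ldc + 1] += alpha * c12; C[2 * ldc + 2] += alpha * c22;
  C[3 * ldc + 0] += alpha * c03; C[3 * ldc + 1] += alpha * c13; C[3 * ldc + 2] += alpha * c23;
}
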
+ for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
+ for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
  // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
  // stored into 3 x nr registers.
-
- const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
+
+ const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * LhsProgress)];
  prefetch(&blA[0]);

  // gets res block as register
- AccPacket C0, C1, C2, C3,
- C4, C5, C6, C7,
- C8, C9, C10, C11;
- traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
- traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
- traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
+ AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11;
+ traits.initAcc(C0);
+ traits.initAcc(C1);
+ traits.initAcc(C2);
+ traits.initAcc(C3);
+ traits.initAcc(C4);
+ traits.initAcc(C5);
+ traits.initAcc(C6);
+ traits.initAcc(C7);
+ traits.initAcc(C8);
+ traits.initAcc(C9);
+ traits.initAcc(C10);
+ traits.initAcc(C11);

  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
@@ -958,43 +1726,54 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
  r3.prefetch(0);

  // performs "inner" products
- const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
  prefetch(&blB[0]);
  LhsPacket A0, A1;

- for(Index k=0; k<peeled_kc; k+=pk)
- {
+ for (Index k = 0; k < peeled_kc; k += pk) {
  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
- RhsPacket B_0, T0;
+ // 15 registers are taken (12 for acc, 3 for lhs).
+ RhsPanel15 rhs_panel;
+ RhsPacket T0;
  LhsPacket A2;
-
- #define EIGEN_GEBP_ONESTEP(K) \
- do { \
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
- EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
- internal::prefetch(blA+(3*K+16)*LhsProgress); \
- if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
- traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
- traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
- traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
- traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
- traits.madd(A0, B_0, C0, T0); \
- traits.madd(A1, B_0, C4, T0); \
- traits.madd(A2, B_0, C8, B_0); \
- traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
- traits.madd(A0, B_0, C1, T0); \
- traits.madd(A1, B_0, C5, T0); \
- traits.madd(A2, B_0, C9, B_0); \
- traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
- traits.madd(A0, B_0, C2, T0); \
- traits.madd(A1, B_0, C6, T0); \
- traits.madd(A2, B_0, C10, B_0); \
- traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
- traits.madd(A0, B_0, C3 , T0); \
- traits.madd(A1, B_0, C7, T0); \
- traits.madd(A2, B_0, C11, B_0); \
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
- } while(false)
+ #if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
+ // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
+ // without this workaround A0, A1, and A2 are loaded in the same register,
+ // which is not good for pipelining
+ #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__("" : "+w,m"(A0), "+w,m"(A1), "+w,m"(A2));
+ #else
+ #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
+ #endif
+ #define EIGEN_GEBP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
+ if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
+ internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
+ } /* Bug 953 */ \
+ traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
+ traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
+ traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
+ EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
+ traits.loadRhs(blB + (0 + 4 * K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
+ traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
+ traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
+ traits.updateRhs(blB + (1 + 4 * K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
+ traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
+ traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
+ traits.updateRhs(blB + (2 + 4 * K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
+ traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
+ traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
+ traits.updateRhs(blB + (3 + 4 * K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
+ traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
+ traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
+ } while (false)

  internal::prefetch(blB);
  EIGEN_GEBP_ONESTEP(0);
@@ -1006,19 +1785,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
  EIGEN_GEBP_ONESTEP(6);
  EIGEN_GEBP_ONESTEP(7);

- blB += pk*4*RhsProgress;
- blA += pk*3*Traits::LhsProgress;
+ blB += pk * 4 * RhsProgress;
+ blA += pk * 3 * Traits::LhsProgress;

  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
  }
  // process remaining peeled loop
- for(Index k=peeled_kc; k<depth; k++)
- {
- RhsPacket B_0, T0;
+ for (Index k = peeled_kc; k < depth; k++) {
+ RhsPanel15 rhs_panel;
+ RhsPacket T0;
  LhsPacket A2;
  EIGEN_GEBP_ONESTEP(0);
- blB += 4*RhsProgress;
- blA += 3*Traits::LhsProgress;
+ blB += 4 * RhsProgress;
+ blA += 3 * Traits::LhsProgress;
  }

  #undef EIGEN_GEBP_ONESTEP
@@ -1026,9 +1805,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
  ResPacket R0, R1, R2;
  ResPacket alphav = pset1<ResPacket>(alpha);

- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
- R1 = r0.loadPacket(1 * Traits::ResPacketSize);
- R2 = r0.loadPacket(2 * Traits::ResPacketSize);
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
  traits.acc(C0, alphav, R0);
  traits.acc(C4, alphav, R1);
  traits.acc(C8, alphav, R2);
@@ -1036,9 +1815,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
  r0.storePacket(1 * Traits::ResPacketSize, R1);
  r0.storePacket(2 * Traits::ResPacketSize, R2);

- R0 = r1.loadPacket(0 * Traits::ResPacketSize);
- R1 = r1.loadPacket(1 * Traits::ResPacketSize);
- R2 = r1.loadPacket(2 * Traits::ResPacketSize);
+ R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
  traits.acc(C1, alphav, R0);
  traits.acc(C5, alphav, R1);
  traits.acc(C9, alphav, R2);
@@ -1046,9 +1825,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
  r1.storePacket(1 * Traits::ResPacketSize, R1);
  r1.storePacket(2 * Traits::ResPacketSize, R2);

- R0 = r2.loadPacket(0 * Traits::ResPacketSize);
- R1 = r2.loadPacket(1 * Traits::ResPacketSize);
- R2 = r2.loadPacket(2 * Traits::ResPacketSize);
+ R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
  traits.acc(C2, alphav, R0);
  traits.acc(C6, alphav, R1);
  traits.acc(C10, alphav, R2);
@@ -1056,25 +1835,23 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
  r2.storePacket(1 * Traits::ResPacketSize, R1);
  r2.storePacket(2 * Traits::ResPacketSize, R2);

- R0 = r3.loadPacket(0 * Traits::ResPacketSize);
- R1 = r3.loadPacket(1 * Traits::ResPacketSize);
- R2 = r3.loadPacket(2 * Traits::ResPacketSize);
+ R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
  traits.acc(C3, alphav, R0);
  traits.acc(C7, alphav, R1);
  traits.acc(C11, alphav, R2);
  r3.storePacket(0 * Traits::ResPacketSize, R0);
  r3.storePacket(1 * Traits::ResPacketSize, R1);
- r3.storePacket(2 * Traits::ResPacketSize, R2);
- }
+ r3.storePacket(2 * Traits::ResPacketSize, R2);
  }
+ }

- // Deal with remaining columns of the rhs
- for(Index j2=packet_cols4; j2<cols; j2++)
- {
- for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
- {
+ // Deal with remaining columns of the rhs
+ for (Index j2 = packet_cols4; j2 < cols; j2++) {
+ for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
  // One column at a time
- const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
+ const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * Traits::LhsProgress)];
  prefetch(&blA[0]);

  // gets res block as register
@@ -1087,27 +1864,26 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
  r0.prefetch(0);

  // performs "inner" products
- const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
  LhsPacket A0, A1, A2;
-
- for(Index k=0; k<peeled_kc; k+=pk)
- {
+
+ for (Index k = 0; k < peeled_kc; k += pk) {
  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
  RhsPacket B_0;
- #define EIGEN_GEBGP_ONESTEP(K) \
- do { \
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
- EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
- traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
- traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
- traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
- traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
- traits.madd(A0, B_0, C0, B_0); \
- traits.madd(A1, B_0, C4, B_0); \
- traits.madd(A2, B_0, C8, B_0); \
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
- } while(false)
-
+ #define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
+ traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
+ traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
+ traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
+ traits.madd(A0, B_0, C0, B_0, fix<0>); \
+ traits.madd(A1, B_0, C4, B_0, fix<0>); \
+ traits.madd(A2, B_0, C8, B_0, fix<0>); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
+ } while (false)
+
  EIGEN_GEBGP_ONESTEP(0);
  EIGEN_GEBGP_ONESTEP(1);
  EIGEN_GEBGP_ONESTEP(2);
@@ -1117,66 +1893,239 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
  EIGEN_GEBGP_ONESTEP(6);
  EIGEN_GEBGP_ONESTEP(7);

- blB += pk*RhsProgress;
- blA += pk*3*Traits::LhsProgress;
+ blB += int(pk) * int(RhsProgress);
+ blA += int(pk) * 3 * int(Traits::LhsProgress);

  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
  }

  // process remaining peeled loop
- for(Index k=peeled_kc; k<depth; k++)
- {
+ for (Index k = peeled_kc; k < depth; k++) {
  RhsPacket B_0;
  EIGEN_GEBGP_ONESTEP(0);
  blB += RhsProgress;
- blA += 3*Traits::LhsProgress;
+ blA += 3 * Traits::LhsProgress;
  }
  #undef EIGEN_GEBGP_ONESTEP
  ResPacket R0, R1, R2;
  ResPacket alphav = pset1<ResPacket>(alpha);

- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
- R1 = r0.loadPacket(1 * Traits::ResPacketSize);
- R2 = r0.loadPacket(2 * Traits::ResPacketSize);
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
  traits.acc(C0, alphav, R0);
  traits.acc(C4, alphav, R1);
  traits.acc(C8, alphav, R2);
  r0.storePacket(0 * Traits::ResPacketSize, R0);
  r0.storePacket(1 * Traits::ResPacketSize, R1);
- r0.storePacket(2 * Traits::ResPacketSize, R2);
- }
+ r0.storePacket(2 * Traits::ResPacketSize, R2);
  }
  }
  }
+ }

- //---------- Process 2 * LhsProgress rows at once ----------
- if(mr>=2*Traits::LhsProgress)
- {
- const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
- // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
- // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
- // or because we are testing specific blocking sizes.
- Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
+ //---------- Process 2 * LhsProgress rows at once ----------
+ if (mr >= 2 * Traits::LhsProgress) {
+ const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
+ // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
+ // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only
+ // guess), or because we are testing specific blocking sizes.
+ Index actual_panel_rows =
+ (2 * LhsProgress) * std::max<Index>(1, ((l1 - sizeof(ResScalar) * mr * nr - depth * nr * sizeof(RhsScalar)) /
+ (depth * sizeof(LhsScalar) * 2 * LhsProgress)));
+
+ for (Index i1 = peeled_mc3; i1 < peeled_mc2; i1 += actual_panel_rows) {
+ Index actual_panel_end = (std::min)(i1 + actual_panel_rows, peeled_mc2);
+ #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
+ EIGEN_IF_CONSTEXPR(nr >= 8) {
+ for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
+ for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
+ const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];
+ prefetch(&blA[0]);
+
+ AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11, C12, C13, C14, C15;
+ traits.initAcc(C0);
+ traits.initAcc(C1);
+ traits.initAcc(C2);
+ traits.initAcc(C3);
+ traits.initAcc(C4);
+ traits.initAcc(C5);
+ traits.initAcc(C6);
+ traits.initAcc(C7);
+ traits.initAcc(C8);
+ traits.initAcc(C9);
+ traits.initAcc(C10);
+ traits.initAcc(C11);
+ traits.initAcc(C12);
+ traits.initAcc(C13);
+ traits.initAcc(C14);
+ traits.initAcc(C15);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+ LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
+ LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
+ LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
+ LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
+ r0.prefetch(prefetch_res_offset);
+ r1.prefetch(prefetch_res_offset);
+ r2.prefetch(prefetch_res_offset);
+ r3.prefetch(prefetch_res_offset);
+ r4.prefetch(prefetch_res_offset);
+ r5.prefetch(prefetch_res_offset);
+ r6.prefetch(prefetch_res_offset);
+ r7.prefetch(prefetch_res_offset);
+
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
+ prefetch(&blB[0]);
+ LhsPacket A0, A1;
+ for (Index k = 0; k < peeled_kc; k += pk) {
+ RhsPacketx4 rhs_panel;
+ RhsPacket T0;
+ // NOTE: the begin/end asm comments below work around bug 935!
+ // but they are not enough for gcc>=6 without FMA (bug 1637)
+ #if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE)
+ #define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND __asm__("" : [a0] "+x,m"(A0), [a1] "+x,m"(A1));
+ #else
+ #define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND
+ #endif
+ #define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX8"); \
+ traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
+ traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
+ traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel); \
+ traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
+ traits.madd(A1, rhs_panel, C8, T0, fix<0>); \
+ traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel); \
+ traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
+ traits.madd(A1, rhs_panel, C9, T0, fix<1>); \
+ traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel); \
+ traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
+ traits.madd(A1, rhs_panel, C10, T0, fix<2>); \
+ traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel); \
+ traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
+ traits.madd(A1, rhs_panel, C11, T0, fix<3>); \
+ traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel); \
+ traits.madd(A0, rhs_panel, C4, T0, fix<0>); \
+ traits.madd(A1, rhs_panel, C12, T0, fix<0>); \
+ traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel); \
+ traits.madd(A0, rhs_panel, C5, T0, fix<1>); \
+ traits.madd(A1, rhs_panel, C13, T0, fix<1>); \
+ traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel); \
+ traits.madd(A0, rhs_panel, C6, T0, fix<2>); \
+ traits.madd(A1, rhs_panel, C14, T0, fix<2>); \
+ traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel); \
+ traits.madd(A0, rhs_panel, C7, T0, fix<3>); \
+ traits.madd(A1, rhs_panel, C15, T0, fix<3>); \
+ EIGEN_GEBP_2Px8_SPILLING_WORKAROUND EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX8"); \
+ } while (false)
+
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX8");
+
+ EIGEN_GEBGP_ONESTEP(0);
+ EIGEN_GEBGP_ONESTEP(1);
+ EIGEN_GEBGP_ONESTEP(2);
+ EIGEN_GEBGP_ONESTEP(3);
+ EIGEN_GEBGP_ONESTEP(4);
+ EIGEN_GEBGP_ONESTEP(5);
+ EIGEN_GEBGP_ONESTEP(6);
+ EIGEN_GEBGP_ONESTEP(7);
+
+ blB += pk * 8 * RhsProgress;
+ blA += pk * (2 * Traits::LhsProgress);
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 2pX8");
+ }
+ // process remaining peeled loop
+ for (Index k = peeled_kc; k < depth; k++) {
+ RhsPacketx4 rhs_panel;
+ RhsPacket T0;
+ EIGEN_GEBGP_ONESTEP(0);
+ blB += 8 * RhsProgress;
+ blA += 2 * Traits::LhsProgress;
+ }

- for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
- {
- Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
- for(Index j2=0; j2<packet_cols4; j2+=nr)
- {
- for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
- {
-
+ #undef EIGEN_GEBGP_ONESTEP
+
+ ResPacket R0, R1, R2, R3;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ traits.acc(C0, alphav, R0);
+ traits.acc(C8, alphav, R1);
+ traits.acc(C1, alphav, R2);
+ traits.acc(C9, alphav, R3);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
+ r1.storePacket(0 * Traits::ResPacketSize, R2);
+ r1.storePacket(1 * Traits::ResPacketSize, R3);
+
+ R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ traits.acc(C2, alphav, R0);
+ traits.acc(C10, alphav, R1);
+ traits.acc(C3, alphav, R2);
+ traits.acc(C11, alphav, R3);
+ r2.storePacket(0 * Traits::ResPacketSize, R0);
+ r2.storePacket(1 * Traits::ResPacketSize, R1);
+ r3.storePacket(0 * Traits::ResPacketSize, R2);
+ r3.storePacket(1 * Traits::ResPacketSize, R3);
+
+ R0 = r4.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r4.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r5.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R3 = r5.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ traits.acc(C4, alphav, R0);
+ traits.acc(C12, alphav, R1);
+ traits.acc(C5, alphav, R2);
+ traits.acc(C13, alphav, R3);
+ r4.storePacket(0 * Traits::ResPacketSize, R0);
+ r4.storePacket(1 * Traits::ResPacketSize, R1);
+ r5.storePacket(0 * Traits::ResPacketSize, R2);
+ r5.storePacket(1 * Traits::ResPacketSize, R3);
+
+ R0 = r6.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r6.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r7.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R3 = r7.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ traits.acc(C6, alphav, R0);
+ traits.acc(C14, alphav, R1);
+ traits.acc(C7, alphav, R2);
+ traits.acc(C15, alphav, R3);
+ r6.storePacket(0 * Traits::ResPacketSize, R0);
+ r6.storePacket(1 * Traits::ResPacketSize, R1);
+ r7.storePacket(0 * Traits::ResPacketSize, R2);
+ r7.storePacket(1 * Traits::ResPacketSize, R3);
+ }
+ }
+ }
+ #endif
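
The EIGEN_GEBP_*_REGISTER_ALLOC_WORKAROUND and EIGEN_GEBP_2Px8_SPILLING_WORKAROUND macros above rely on an empty GCC-style inline-asm statement: no instructions, but "+w,m" / "+x,m" read-write constraints make the listed variables opaque to the optimizer at that point, so the register allocator keeps each one in its own register (or a memory slot) instead of merging or spilling them badly. A minimal sketch of the idiom, guarded for GCC-compatible compilers with SSE ("+x" names an SSE register; the NEON variant uses "+w"); a no-op on the data itself:

inline float pinned_sum(float a0, float a1, float a2) {
#if defined(__GNUC__) && defined(__SSE__)
  // Empty asm with read-write constraints: forces a0/a1/a2 into distinct xmm registers here.
  __asm__("" : "+x"(a0), "+x"(a1), "+x"(a2));
#endif
  return a0 + a1 + a2;
}
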
+ for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
+ for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
  // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
  // stored into 2 x nr registers.
-
- const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
+
+ const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];
  prefetch(&blA[0]);

  // gets res block as register
- AccPacket C0, C1, C2, C3,
- C4, C5, C6, C7;
- traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
- traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
+ AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
+ traits.initAcc(C0);
+ traits.initAcc(C1);
+ traits.initAcc(C2);
+ traits.initAcc(C3);
+ traits.initAcc(C4);
+ traits.initAcc(C5);
+ traits.initAcc(C6);
+ traits.initAcc(C7);

  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
@@ -1189,73 +2138,73 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
  r3.prefetch(prefetch_res_offset);

  // performs "inner" products
- const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
  prefetch(&blB[0]);
  LhsPacket A0, A1;

- for(Index k=0; k<peeled_kc; k+=pk)
- {
+ for (Index k = 0; k < peeled_kc; k += pk) {
  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
- RhsPacket B_0, B1, B2, B3, T0;
-
- // NOTE: the begin/end asm comments below work around bug 935!
- // but they are not enough for gcc>=6 without FMA (bug 1637)
- #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
- #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
- #else
- #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
- #endif
- #define EIGEN_GEBGP_ONESTEP(K) \
- do { \
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
- traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
- traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
- traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
- traits.madd(A0, B_0, C0, T0); \
- traits.madd(A1, B_0, C4, B_0); \
- traits.madd(A0, B1, C1, T0); \
- traits.madd(A1, B1, C5, B1); \
- traits.madd(A0, B2, C2, T0); \
- traits.madd(A1, B2, C6, B2); \
- traits.madd(A0, B3, C3, T0); \
- traits.madd(A1, B3, C7, B3); \
- EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
- } while(false)
-
- internal::prefetch(blB+(48+0));
+ RhsPacketx4 rhs_panel;
+ RhsPacket T0;
+
+ // NOTE: the begin/end asm comments below work around bug 935!
+ // but they are not enough for gcc>=6 without FMA (bug 1637)
+ #if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
+ #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__("" : [a0] "+x,m"(A0), [a1] "+x,m"(A1));
+ #else
+ #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
+ #endif
+ #define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
+ traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
+ traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
+ traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
+ traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
+ traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
+ traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
+ traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
+ traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
+ traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
+ traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
+ traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
+ EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
+ } while (false)
+
+ internal::prefetch(blB + (48 + 0));
  EIGEN_GEBGP_ONESTEP(0);
  EIGEN_GEBGP_ONESTEP(1);
  EIGEN_GEBGP_ONESTEP(2);
  EIGEN_GEBGP_ONESTEP(3);
- internal::prefetch(blB+(48+16));
+ internal::prefetch(blB + (48 + 16));
  EIGEN_GEBGP_ONESTEP(4);
  EIGEN_GEBGP_ONESTEP(5);
  EIGEN_GEBGP_ONESTEP(6);
  EIGEN_GEBGP_ONESTEP(7);

- blB += pk*4*RhsProgress;
- blA += pk*(2*Traits::LhsProgress);
+ blB += pk * 4 * RhsProgress;
+ blA += pk * (2 * Traits::LhsProgress);

  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
  }
  // process remaining peeled loop
- for(Index k=peeled_kc; k<depth; k++)
- {
- RhsPacket B_0, B1, B2, B3, T0;
+ for (Index k = peeled_kc; k < depth; k++) {
+ RhsPacketx4 rhs_panel;
+ RhsPacket T0;
  EIGEN_GEBGP_ONESTEP(0);
- blB += 4*RhsProgress;
- blA += 2*Traits::LhsProgress;
+ blB += 4 * RhsProgress;
+ blA += 2 * Traits::LhsProgress;
  }
  #undef EIGEN_GEBGP_ONESTEP

  ResPacket R0, R1, R2, R3;
  ResPacket alphav = pset1<ResPacket>(alpha);

- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
- R1 = r0.loadPacket(1 * Traits::ResPacketSize);
- R2 = r1.loadPacket(0 * Traits::ResPacketSize);
- R3 = r1.loadPacket(1 * Traits::ResPacketSize);
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
  traits.acc(C0, alphav, R0);
  traits.acc(C4, alphav, R1);
  traits.acc(C1, alphav, R2);
@@ -1265,28 +2214,26 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
  r1.storePacket(0 * Traits::ResPacketSize, R2);
  r1.storePacket(1 * Traits::ResPacketSize, R3);

- R0 = r2.loadPacket(0 * Traits::ResPacketSize);
- R1 = r2.loadPacket(1 * Traits::ResPacketSize);
- R2 = r3.loadPacket(0 * Traits::ResPacketSize);
- R3 = r3.loadPacket(1 * Traits::ResPacketSize);
- traits.acc(C2, alphav, R0);
- traits.acc(C6, alphav, R1);
- traits.acc(C3, alphav, R2);
- traits.acc(C7, alphav, R3);
+ R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ traits.acc(C2, alphav, R0);
+ traits.acc(C6, alphav, R1);
+ traits.acc(C3, alphav, R2);
+ traits.acc(C7, alphav, R3);
  r2.storePacket(0 * Traits::ResPacketSize, R0);
  r2.storePacket(1 * Traits::ResPacketSize, R1);
  r3.storePacket(0 * Traits::ResPacketSize, R2);
  r3.storePacket(1 * Traits::ResPacketSize, R3);
- }
  }
-
- // Deal with remaining columns of the rhs
- for(Index j2=packet_cols4; j2<cols; j2++)
- {
- for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
- {
+ }
+
+ // Deal with remaining columns of the rhs
+ for (Index j2 = packet_cols4; j2 < cols; j2++) {
+ for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
  // One column at a time
- const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
+ const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];
  prefetch(&blA[0]);

  // gets res block as register
@@ -1298,26 +2245,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
  r0.prefetch(prefetch_res_offset);

  // performs "inner" products
- const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
  LhsPacket A0, A1;

- for(Index k=0; k<peeled_kc; k+=pk)
- {
+ for (Index k = 0; k < peeled_kc; k += pk) {
  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
  RhsPacket B_0, B1;
-
- #define EIGEN_GEBGP_ONESTEP(K) \
- do { \
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
- EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
- traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
- traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
- traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
- traits.madd(A0, B_0, C0, B1); \
- traits.madd(A1, B_0, C4, B_0); \
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
- } while(false)
-
+
+ #define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
+ traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
+ traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
+ traits.madd(A0, B_0, C0, B1, fix<0>); \
+ traits.madd(A1, B_0, C4, B_0, fix<0>); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
+ } while (false)
+
  EIGEN_GEBGP_ONESTEP(0);
  EIGEN_GEBGP_ONESTEP(1);
  EIGEN_GEBGP_ONESTEP(2);
@@ -1327,357 +2273,275 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
  EIGEN_GEBGP_ONESTEP(6);
  EIGEN_GEBGP_ONESTEP(7);

- blB += pk*RhsProgress;
- blA += pk*2*Traits::LhsProgress;
+ blB += int(pk) * int(RhsProgress);
+ blA += int(pk) * 2 * int(Traits::LhsProgress);

  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
  }

  // process remaining peeled loop
- for(Index k=peeled_kc; k<depth; k++)
- {
+ for (Index k = peeled_kc; k < depth; k++) {
  RhsPacket B_0, B1;
  EIGEN_GEBGP_ONESTEP(0);
  blB += RhsProgress;
- blA += 2*Traits::LhsProgress;
+ blA += 2 * Traits::LhsProgress;
  }
  #undef EIGEN_GEBGP_ONESTEP
  ResPacket R0, R1;
  ResPacket alphav = pset1<ResPacket>(alpha);

- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
- R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
  traits.acc(C0, alphav, R0);
  traits.acc(C4, alphav, R1);
  r0.storePacket(0 * Traits::ResPacketSize, R0);
  r0.storePacket(1 * Traits::ResPacketSize, R1);
- }
  }
  }
  }
- //---------- Process 1 * LhsProgress rows at once ----------
- if(mr>=1*Traits::LhsProgress)
- {
- // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
- for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
- {
- // loops on each largest micro vertical panel of rhs (depth * nr)
- for(Index j2=0; j2<packet_cols4; j2+=nr)
- {
- // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely
- // stored into 1 x nr registers.
-
- const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
+ }
+ //---------- Process 1 * LhsProgress rows at once ----------
+ if (mr >= 1 * Traits::LhsProgress) {
+ lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket,
+ RhsPacket, ResPacket, Traits, LinearMapper, DataMapper>
+ p;
+ p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset,
+ peeled_kc, pk, cols, depth, packet_cols4);
+ }
+ //---------- Process LhsProgressHalf rows at once ----------
+ if ((LhsProgressHalf < LhsProgress) && mr >= LhsProgressHalf) {
+ lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf,
+ LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper>
+ p;
+ p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset,
+ peeled_kc, pk, cols, depth, packet_cols4);
+ }
+ //---------- Process LhsProgressQuarter rows at once ----------
+ if ((LhsProgressQuarter < LhsProgressHalf) && mr >= LhsProgressQuarter) {
+ lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar,
+ AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter,
+ QuarterTraits, LinearMapper, DataMapper>
+ p;
+ p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB,
+ prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
+ }
+ //---------- Process remaining rows, 1 at once ----------
2329
+ if (peeled_mc_quarter < rows) {
2330
+ #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
2331
+ EIGEN_IF_CONSTEXPR(nr >= 8) {
2332
+ // loop on each panel of the rhs
2333
+ for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
2334
+ // loop on each row of the lhs (1*LhsProgress x depth)
2335
+ for (Index i = peeled_mc_quarter; i < rows; i += 1) {
2336
+ const LhsScalar* blA = &blockA[i * strideA + offsetA];
1371
2337
  prefetch(&blA[0]);
2338
+ // gets a 1 x 1 res block as registers
2339
+ ResScalar C0(0), C1(0), C2(0), C3(0), C4(0), C5(0), C6(0), C7(0);
2340
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
2341
+ for (Index k = 0; k < depth; k++) {
2342
+ LhsScalar A0 = blA[k];
2343
+ RhsScalar B_0;
1372
2344
 
1373
- // gets res block as register
1374
- AccPacket C0, C1, C2, C3;
1375
- traits.initAcc(C0);
1376
- traits.initAcc(C1);
1377
- traits.initAcc(C2);
1378
- traits.initAcc(C3);
2345
+ B_0 = blB[0];
2346
+ C0 = cj.pmadd(A0, B_0, C0);
1379
2347
 
1380
- LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1381
- LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1382
- LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1383
- LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
2348
+ B_0 = blB[1];
2349
+ C1 = cj.pmadd(A0, B_0, C1);
1384
2350
 
1385
- r0.prefetch(prefetch_res_offset);
1386
- r1.prefetch(prefetch_res_offset);
1387
- r2.prefetch(prefetch_res_offset);
1388
- r3.prefetch(prefetch_res_offset);
2351
+ B_0 = blB[2];
2352
+ C2 = cj.pmadd(A0, B_0, C2);
1389
2353
 
1390
- // performs "inner" products
1391
- const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1392
- prefetch(&blB[0]);
1393
- LhsPacket A0;
2354
+ B_0 = blB[3];
2355
+ C3 = cj.pmadd(A0, B_0, C3);
1394
2356
 
1395
- for(Index k=0; k<peeled_kc; k+=pk)
1396
- {
1397
- EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
1398
- RhsPacket B_0, B1, B2, B3;
1399
-
1400
- #define EIGEN_GEBGP_ONESTEP(K) \
1401
- do { \
1402
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
1403
- EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1404
- traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1405
- traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1406
- traits.madd(A0, B_0, C0, B_0); \
1407
- traits.madd(A0, B1, C1, B1); \
1408
- traits.madd(A0, B2, C2, B2); \
1409
- traits.madd(A0, B3, C3, B3); \
1410
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
1411
- } while(false)
1412
-
1413
- internal::prefetch(blB+(48+0));
1414
- EIGEN_GEBGP_ONESTEP(0);
1415
- EIGEN_GEBGP_ONESTEP(1);
1416
- EIGEN_GEBGP_ONESTEP(2);
1417
- EIGEN_GEBGP_ONESTEP(3);
1418
- internal::prefetch(blB+(48+16));
1419
- EIGEN_GEBGP_ONESTEP(4);
1420
- EIGEN_GEBGP_ONESTEP(5);
1421
- EIGEN_GEBGP_ONESTEP(6);
1422
- EIGEN_GEBGP_ONESTEP(7);
2357
+ B_0 = blB[4];
2358
+ C4 = cj.pmadd(A0, B_0, C4);
1423
2359
 
1424
- blB += pk*4*RhsProgress;
1425
- blA += pk*1*LhsProgress;
2360
+ B_0 = blB[5];
2361
+ C5 = cj.pmadd(A0, B_0, C5);
1426
2362
 
1427
- EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
1428
- }
1429
- // process remaining peeled loop
1430
- for(Index k=peeled_kc; k<depth; k++)
1431
- {
1432
- RhsPacket B_0, B1, B2, B3;
1433
- EIGEN_GEBGP_ONESTEP(0);
1434
- blB += 4*RhsProgress;
1435
- blA += 1*LhsProgress;
1436
- }
1437
- #undef EIGEN_GEBGP_ONESTEP
2363
+ B_0 = blB[6];
2364
+ C6 = cj.pmadd(A0, B_0, C6);
1438
2365
 
1439
- ResPacket R0, R1;
1440
- ResPacket alphav = pset1<ResPacket>(alpha);
2366
+ B_0 = blB[7];
2367
+ C7 = cj.pmadd(A0, B_0, C7);
1441
2368
 
1442
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1443
- R1 = r1.loadPacket(0 * Traits::ResPacketSize);
1444
- traits.acc(C0, alphav, R0);
1445
- traits.acc(C1, alphav, R1);
1446
- r0.storePacket(0 * Traits::ResPacketSize, R0);
1447
- r1.storePacket(0 * Traits::ResPacketSize, R1);
1448
-
1449
- R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1450
- R1 = r3.loadPacket(0 * Traits::ResPacketSize);
1451
- traits.acc(C2, alphav, R0);
1452
- traits.acc(C3, alphav, R1);
1453
- r2.storePacket(0 * Traits::ResPacketSize, R0);
1454
- r3.storePacket(0 * Traits::ResPacketSize, R1);
2369
+ blB += 8;
2370
+ }
2371
+ res(i, j2 + 0) += alpha * C0;
2372
+ res(i, j2 + 1) += alpha * C1;
2373
+ res(i, j2 + 2) += alpha * C2;
2374
+ res(i, j2 + 3) += alpha * C3;
2375
+ res(i, j2 + 4) += alpha * C4;
2376
+ res(i, j2 + 5) += alpha * C5;
2377
+ res(i, j2 + 6) += alpha * C6;
2378
+ res(i, j2 + 7) += alpha * C7;
1455
2379
  }
2380
+ }
2381
+ }
2382
+ #endif
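
The scalar tail just above handles each leftover row with one running sum per rhs column and a single scaled update of res at the end; cj.pmadd additionally conjugates complex operands when ConjugateLhs/ConjugateRhs are set. A minimal real-valued sketch of the same pattern for a 4-column panel (illustrative names, plain multiply-add standing in for cj.pmadd):

void scalar_row_tail(const float* blA, const float* blB, float* res_row,
                     long depth, float alpha) {
  float c0 = 0, c1 = 0, c2 = 0, c3 = 0;  // one accumulator per column, like C0..C7
  for (long k = 0; k < depth; ++k) {
    const float a0 = blA[k];    // one lhs value per k
    c0 += a0 * blB[4 * k + 0];  // rhs packed 4 columns per k (8 in the nr >= 8 path)
    c1 += a0 * blB[4 * k + 1];
    c2 += a0 * blB[4 * k + 2];
    c3 += a0 * blB[4 * k + 3];
  }
  res_row[0] += alpha * c0;  // like res(i, j2 + 0) += alpha * C0;
  res_row[1] += alpha * c1;
  res_row[2] += alpha * c2;
  res_row[3] += alpha * c3;
}
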

- // Deal with remaining columns of the rhs
- for(Index j2=packet_cols4; j2<cols; j2++)
- {
- // One column at a time
- const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
- prefetch(&blA[0]);
-
- // gets res block as register
- AccPacket C0;
- traits.initAcc(C0);
+ for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
+ // loop on each row of the lhs (1*LhsProgress x depth)
+ for (Index i = peeled_mc_quarter; i < rows; i += 1) {
+ const LhsScalar* blA = &blockA[i * strideA + offsetA];
+ prefetch(&blA[0]);
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
+
+ // If LhsProgress is 8 or 16, it assumes that there is a
+ // half or quarter packet, respectively, of the same size as
+ // nr (which is currently 4) for the return type.
+ const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
+ const int SResPacketQuarterSize =
+ unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
+ // The following code assumes we can load SRhsPacket in such a way that
+ // it multiplies blocks of 4 elements in SLhsPacket. This is not the
+ // case for some customized kernels (i.e. NEON fp16). If the assumption
+ // fails, drop down to the scalar path.
+ constexpr bool kCanLoadSRhsQuad =
+ (unpacket_traits<SLhsPacket>::size < 4) ||
+ (unpacket_traits<SRhsPacket>::size % ((std::max<int>)(unpacket_traits<SLhsPacket>::size, 4) / 4)) == 0;
+ if (kCanLoadSRhsQuad && (SwappedTraits::LhsProgress % 4) == 0 && (SwappedTraits::LhsProgress <= 16) &&
+ (SwappedTraits::LhsProgress != 8 || SResPacketHalfSize == nr) &&
+ (SwappedTraits::LhsProgress != 16 || SResPacketQuarterSize == nr)) {
+ SAccPacket C0, C1, C2, C3;
+ straits.initAcc(C0);
+ straits.initAcc(C1);
+ straits.initAcc(C2);
+ straits.initAcc(C3);
+
+ const Index spk = (std::max)(1, SwappedTraits::LhsProgress / 4);
+ const Index endk = (depth / spk) * spk;
+ const Index endk4 = (depth / (spk * 4)) * (spk * 4);
+
+ Index k = 0;
+ for (; k < endk4; k += 4 * spk) {
+ SLhsPacket A0, A1;
+ SRhsPacket B_0, B_1;
+
+ straits.loadLhsUnaligned(blB + 0 * SwappedTraits::LhsProgress, A0);
+ straits.loadLhsUnaligned(blB + 1 * SwappedTraits::LhsProgress, A1);
+
+ straits.loadRhsQuad(blA + 0 * spk, B_0);
+ straits.loadRhsQuad(blA + 1 * spk, B_1);
+ straits.madd(A0, B_0, C0, B_0, fix<0>);
+ straits.madd(A1, B_1, C1, B_1, fix<0>);
+
+ straits.loadLhsUnaligned(blB + 2 * SwappedTraits::LhsProgress, A0);
+ straits.loadLhsUnaligned(blB + 3 * SwappedTraits::LhsProgress, A1);
+ straits.loadRhsQuad(blA + 2 * spk, B_0);
+ straits.loadRhsQuad(blA + 3 * spk, B_1);
+ straits.madd(A0, B_0, C2, B_0, fix<0>);
+ straits.madd(A1, B_1, C3, B_1, fix<0>);
+
+ blB += 4 * SwappedTraits::LhsProgress;
+ blA += 4 * spk;
+ }
+ C0 = padd(padd(C0, C1), padd(C2, C3));
+ for (; k < endk; k += spk) {
+ SLhsPacket A0;
+ SRhsPacket B_0;

- LinearMapper r0 = res.getLinearMapper(i, j2);
+ straits.loadLhsUnaligned(blB, A0);
+ straits.loadRhsQuad(blA, B_0);
+ straits.madd(A0, B_0, C0, B_0, fix<0>);

- // performs "inner" products
- const RhsScalar* blB = &blockB[j2*strideB+offsetB];
- LhsPacket A0;
+ blB += SwappedTraits::LhsProgress;
+ blA += spk;
+ }
+ if (SwappedTraits::LhsProgress == 8) {
2453
+ // Special case where we have to first reduce the accumulation register C0
2454
+ typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SResPacket>::half,
2455
+ SResPacket>
2456
+ SResPacketHalf;
2457
+ typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SLhsPacket>::half,
2458
+ SLhsPacket>
2459
+ SLhsPacketHalf;
2460
+ typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SRhsPacket>::half,
2461
+ SRhsPacket>
2462
+ SRhsPacketHalf;
2463
+ typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SAccPacket>::half,
2464
+ SAccPacket>
2465
+ SAccPacketHalf;
2466
+
2467
+ SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
2468
+ SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
2469
+
2470
+ if (depth - endk > 0) {
2471
+ // We have to handle the last row of the rhs which corresponds to a half-packet
2472
+ SLhsPacketHalf a0;
2473
+ SRhsPacketHalf b0;
2474
+ straits.loadLhsUnaligned(blB, a0);
2475
+ straits.loadRhs(blA, b0);
2476
+ SAccPacketHalf c0 = predux_half_dowto4(C0);
2477
+ straits.madd(a0, b0, c0, b0, fix<0>);
2478
+ straits.acc(c0, alphav, R);
2479
+ } else {
2480
+ straits.acc(predux_half_dowto4(C0), alphav, R);
2481
+ }
2482
+ res.scatterPacket(i, j2, R);
2483
+ } else if (SwappedTraits::LhsProgress == 16) {
2484
+ // Special case where we have to first reduce the
2485
+ // accumulation register C0. We specialize the block in
2486
+ // template form, so that LhsProgress < 16 paths don't
2487
+ // fail to compile
2488
+ last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
2489
+ p(res, straits, blA, blB, depth, endk, i, j2, alpha, C0);
2490
+ } else {
2491
+ SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
2492
+ SResPacket alphav = pset1<SResPacket>(alpha);
2493
+ straits.acc(C0, alphav, R);
2494
+ res.scatterPacket(i, j2, R);
2495
+ }
2496
+ } else // scalar path
2497
+ {
2498
+ // get a 1 x 4 res block as registers
2499
+ ResScalar C0(0), C1(0), C2(0), C3(0);
1473
2500
 
1474
- for(Index k=0; k<peeled_kc; k+=pk)
1475
- {
1476
- EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
1477
- RhsPacket B_0;
1478
-
1479
- #define EIGEN_GEBGP_ONESTEP(K) \
1480
- do { \
1481
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
1482
- EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1483
- traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1484
- traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1485
- traits.madd(A0, B_0, C0, B_0); \
1486
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
1487
- } while(false);
2501
+ for (Index k = 0; k < depth; k++) {
2502
+ LhsScalar A0;
2503
+ RhsScalar B_0, B_1;
1488
2504
 
1489
- EIGEN_GEBGP_ONESTEP(0);
1490
- EIGEN_GEBGP_ONESTEP(1);
1491
- EIGEN_GEBGP_ONESTEP(2);
1492
- EIGEN_GEBGP_ONESTEP(3);
1493
- EIGEN_GEBGP_ONESTEP(4);
1494
- EIGEN_GEBGP_ONESTEP(5);
1495
- EIGEN_GEBGP_ONESTEP(6);
1496
- EIGEN_GEBGP_ONESTEP(7);
2505
+ A0 = blA[k];
1497
2506
 
1498
- blB += pk*RhsProgress;
1499
- blA += pk*1*Traits::LhsProgress;
2507
+ B_0 = blB[0];
2508
+ B_1 = blB[1];
2509
+ C0 = cj.pmadd(A0, B_0, C0);
2510
+ C1 = cj.pmadd(A0, B_1, C1);
1500
2511
 
1501
- EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
1502
- }
2512
+ B_0 = blB[2];
2513
+ B_1 = blB[3];
2514
+ C2 = cj.pmadd(A0, B_0, C2);
2515
+ C3 = cj.pmadd(A0, B_1, C3);
1503
2516
 
1504
- // process remaining peeled loop
1505
- for(Index k=peeled_kc; k<depth; k++)
1506
- {
1507
- RhsPacket B_0;
1508
- EIGEN_GEBGP_ONESTEP(0);
1509
- blB += RhsProgress;
1510
- blA += 1*Traits::LhsProgress;
2517
+ blB += 4;
1511
2518
  }
1512
- #undef EIGEN_GEBGP_ONESTEP
1513
- ResPacket R0;
1514
- ResPacket alphav = pset1<ResPacket>(alpha);
1515
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1516
- traits.acc(C0, alphav, R0);
1517
- r0.storePacket(0 * Traits::ResPacketSize, R0);
2519
+ res(i, j2 + 0) += alpha * C0;
2520
+ res(i, j2 + 1) += alpha * C1;
2521
+ res(i, j2 + 2) += alpha * C2;
2522
+ res(i, j2 + 3) += alpha * C3;
1518
2523
  }
1519
2524
  }
1520
2525
  }
1521
- //---------- Process remaining rows, 1 at once ----------
1522
- if(peeled_mc1<rows)
1523
- {
1524
- // loop on each panel of the rhs
1525
- for(Index j2=0; j2<packet_cols4; j2+=nr)
1526
- {
1527
- // loop on each row of the lhs (1*LhsProgress x depth)
1528
- for(Index i=peeled_mc1; i<rows; i+=1)
1529
- {
1530
- const LhsScalar* blA = &blockA[i*strideA+offsetA];
1531
- prefetch(&blA[0]);
1532
- const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1533
-
1534
- // The following piece of code wont work for 512 bit registers
1535
- // Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
1536
- // as nr (which is currently 4) for the return type.
1537
- const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
1538
- if ((SwappedTraits::LhsProgress % 4) == 0 &&
1539
- (SwappedTraits::LhsProgress <= 8) &&
1540
- (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr))
1541
- {
1542
- SAccPacket C0, C1, C2, C3;
1543
- straits.initAcc(C0);
1544
- straits.initAcc(C1);
1545
- straits.initAcc(C2);
1546
- straits.initAcc(C3);
1547
-
1548
- const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
1549
- const Index endk = (depth/spk)*spk;
1550
- const Index endk4 = (depth/(spk*4))*(spk*4);
1551
-
1552
- Index k=0;
1553
- for(; k<endk4; k+=4*spk)
1554
- {
1555
- SLhsPacket A0,A1;
1556
- SRhsPacket B_0,B_1;
1557
-
1558
- straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
1559
- straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
1560
-
1561
- straits.loadRhsQuad(blA+0*spk, B_0);
1562
- straits.loadRhsQuad(blA+1*spk, B_1);
1563
- straits.madd(A0,B_0,C0,B_0);
1564
- straits.madd(A1,B_1,C1,B_1);
1565
-
1566
- straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
1567
- straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
1568
- straits.loadRhsQuad(blA+2*spk, B_0);
1569
- straits.loadRhsQuad(blA+3*spk, B_1);
1570
- straits.madd(A0,B_0,C2,B_0);
1571
- straits.madd(A1,B_1,C3,B_1);
1572
-
1573
- blB += 4*SwappedTraits::LhsProgress;
1574
- blA += 4*spk;
1575
- }
1576
- C0 = padd(padd(C0,C1),padd(C2,C3));
1577
- for(; k<endk; k+=spk)
1578
- {
1579
- SLhsPacket A0;
1580
- SRhsPacket B_0;
1581
-
1582
- straits.loadLhsUnaligned(blB, A0);
1583
- straits.loadRhsQuad(blA, B_0);
1584
- straits.madd(A0,B_0,C0,B_0);
1585
-
1586
- blB += SwappedTraits::LhsProgress;
1587
- blA += spk;
1588
- }
1589
- if(SwappedTraits::LhsProgress==8)
1590
- {
1591
- // Special case where we have to first reduce the accumulation register C0
1592
- typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
1593
- typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
1594
- typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
1595
- typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
1596
-
1597
- SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
1598
- SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
1599
-
1600
- if(depth-endk>0)
1601
- {
1602
- // We have to handle the last row of the rhs which corresponds to a half-packet
1603
- SLhsPacketHalf a0;
1604
- SRhsPacketHalf b0;
1605
- straits.loadLhsUnaligned(blB, a0);
1606
- straits.loadRhs(blA, b0);
1607
- SAccPacketHalf c0 = predux_downto4(C0);
1608
- straits.madd(a0,b0,c0,b0);
1609
- straits.acc(c0, alphav, R);
1610
- }
1611
- else
1612
- {
1613
- straits.acc(predux_downto4(C0), alphav, R);
1614
- }
1615
- res.scatterPacket(i, j2, R);
1616
- }
1617
- else
1618
- {
1619
- SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
1620
- SResPacket alphav = pset1<SResPacket>(alpha);
1621
- straits.acc(C0, alphav, R);
1622
- res.scatterPacket(i, j2, R);
1623
- }
1624
- }
1625
- else // scalar path
1626
- {
1627
- // get a 1 x 4 res block as registers
1628
- ResScalar C0(0), C1(0), C2(0), C3(0);
1629
-
1630
- for(Index k=0; k<depth; k++)
1631
- {
1632
- LhsScalar A0;
1633
- RhsScalar B_0, B_1;
1634
-
1635
- A0 = blA[k];
1636
-
1637
- B_0 = blB[0];
1638
- B_1 = blB[1];
1639
- CJMADD(cj,A0,B_0,C0, B_0);
1640
- CJMADD(cj,A0,B_1,C1, B_1);
1641
-
1642
- B_0 = blB[2];
1643
- B_1 = blB[3];
1644
- CJMADD(cj,A0,B_0,C2, B_0);
1645
- CJMADD(cj,A0,B_1,C3, B_1);
1646
-
1647
- blB += 4;
1648
- }
1649
- res(i, j2 + 0) += alpha * C0;
1650
- res(i, j2 + 1) += alpha * C1;
1651
- res(i, j2 + 2) += alpha * C2;
1652
- res(i, j2 + 3) += alpha * C3;
1653
- }
1654
- }
1655
- }
1656
- // remaining columns
1657
- for(Index j2=packet_cols4; j2<cols; j2++)
1658
- {
1659
- // loop on each row of the lhs (1*LhsProgress x depth)
1660
- for(Index i=peeled_mc1; i<rows; i+=1)
1661
- {
1662
- const LhsScalar* blA = &blockA[i*strideA+offsetA];
1663
- prefetch(&blA[0]);
1664
- // gets a 1 x 1 res block as registers
1665
- ResScalar C0(0);
1666
- const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1667
- for(Index k=0; k<depth; k++)
1668
- {
1669
- LhsScalar A0 = blA[k];
1670
- RhsScalar B_0 = blB[k];
1671
- CJMADD(cj, A0, B_0, C0, B_0);
1672
- }
1673
- res(i, j2) += alpha * C0;
2526
+ // remaining columns
2527
+ for (Index j2 = packet_cols4; j2 < cols; j2++) {
2528
+ // loop on each row of the lhs (1*LhsProgress x depth)
2529
+ for (Index i = peeled_mc_quarter; i < rows; i += 1) {
2530
+ const LhsScalar* blA = &blockA[i * strideA + offsetA];
2531
+ prefetch(&blA[0]);
2532
+ // gets a 1 x 1 res block as registers
2533
+ ResScalar C0(0);
2534
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
2535
+ for (Index k = 0; k < depth; k++) {
2536
+ LhsScalar A0 = blA[k];
2537
+ RhsScalar B_0 = blB[k];
2538
+ C0 = cj.pmadd(A0, B_0, C0);
1674
2539
  }
2540
+ res(i, j2) += alpha * C0;
1675
2541
  }
1676
2542
  }
1677
2543
  }
1678
-
1679
-
1680
- #undef CJMADD
2544
+ }
1681
2545
 
1682
2546
  // pack a block of the lhs
1683
2547
  // The traversal is as follow (mr==4):
@@ -1693,198 +2557,270 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1693
2557
  //
1694
2558
  // 32 33 34 35 ...
1695
2559
  // 36 36 38 39 ...
1696
- template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1697
- struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
1698
- {
2560
+ template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
2561
+ bool PanelMode>
2562
+ struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> {
1699
2563
  typedef typename DataMapper::LinearMapper LinearMapper;
1700
- EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2564
+ EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
2565
+ Index offset = 0);
1701
2566
  };
1702
2567
 
1703
- template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1704
- EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
1705
- ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
1706
- {
1707
- typedef typename packet_traits<Scalar>::type Packet;
1708
- enum { PacketSize = packet_traits<Scalar>::size };
2568
+ template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
2569
+ bool PanelMode>
2570
+ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate,
2571
+ PanelMode>::operator()(Scalar* blockA, const DataMapper& lhs, Index depth,
2572
+ Index rows, Index stride, Index offset) {
2573
+ typedef typename unpacket_traits<Packet>::half HalfPacket;
2574
+ typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2575
+ enum {
2576
+ PacketSize = unpacket_traits<Packet>::size,
2577
+ HalfPacketSize = unpacket_traits<HalfPacket>::size,
2578
+ QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2579
+ HasHalf = (int)HalfPacketSize < (int)PacketSize,
2580
+ HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize
2581
+ };
1709
2582
 
1710
2583
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
1711
2584
  EIGEN_UNUSED_VARIABLE(stride);
1712
2585
  EIGEN_UNUSED_VARIABLE(offset);
1713
- eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
1714
- eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
2586
+ eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
2587
+ eigen_assert(((Pack1 % PacketSize) == 0 && Pack1 <= 4 * PacketSize) || (Pack1 <= 4));
1715
2588
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
1716
2589
  Index count = 0;
1717
2590
 
1718
- const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
1719
- const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
1720
- const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
1721
- const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
1722
- : Pack2>1 ? (rows/Pack2)*Pack2 : 0;
2591
+ const Index peeled_mc3 = Pack1 >= 3 * PacketSize ? (rows / (3 * PacketSize)) * (3 * PacketSize) : 0;
2592
+ const Index peeled_mc2 =
2593
+ Pack1 >= 2 * PacketSize ? peeled_mc3 + ((rows - peeled_mc3) / (2 * PacketSize)) * (2 * PacketSize) : 0;
2594
+ const Index peeled_mc1 =
2595
+ Pack1 >= 1 * PacketSize ? peeled_mc2 + ((rows - peeled_mc2) / (1 * PacketSize)) * (1 * PacketSize) : 0;
2596
+ const Index peeled_mc_half =
2597
+ Pack1 >= HalfPacketSize ? peeled_mc1 + ((rows - peeled_mc1) / (HalfPacketSize)) * (HalfPacketSize) : 0;
2598
+ const Index peeled_mc_quarter = Pack1 >= QuarterPacketSize ? (rows / (QuarterPacketSize)) * (QuarterPacketSize) : 0;
2599
+ const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
2600
+ const Index peeled_mc0 = Pack2 >= PacketSize ? peeled_mc_quarter
2601
+ : Pack2 > 1 && last_lhs_progress ? (rows / last_lhs_progress) * last_lhs_progress
2602
+ : 0;
1723
2603
 
1724
- Index i=0;
2604
+ Index i = 0;
1725
2605
 
1726
2606
  // Pack 3 packets
1727
- if(Pack1>=3*PacketSize)
1728
- {
1729
- for(; i<peeled_mc3; i+=3*PacketSize)
1730
- {
1731
- if(PanelMode) count += (3*PacketSize) * offset;
2607
+ if (Pack1 >= 3 * PacketSize) {
2608
+ for (; i < peeled_mc3; i += 3 * PacketSize) {
2609
+ if (PanelMode) count += (3 * PacketSize) * offset;
1732
2610
 
1733
- for(Index k=0; k<depth; k++)
1734
- {
2611
+ for (Index k = 0; k < depth; k++) {
1735
2612
  Packet A, B, C;
1736
- A = lhs.loadPacket(i+0*PacketSize, k);
1737
- B = lhs.loadPacket(i+1*PacketSize, k);
1738
- C = lhs.loadPacket(i+2*PacketSize, k);
1739
- pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
1740
- pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
1741
- pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
2613
+ A = lhs.template loadPacket<Packet>(i + 0 * PacketSize, k);
2614
+ B = lhs.template loadPacket<Packet>(i + 1 * PacketSize, k);
2615
+ C = lhs.template loadPacket<Packet>(i + 2 * PacketSize, k);
2616
+ pstore(blockA + count, cj.pconj(A));
2617
+ count += PacketSize;
2618
+ pstore(blockA + count, cj.pconj(B));
2619
+ count += PacketSize;
2620
+ pstore(blockA + count, cj.pconj(C));
2621
+ count += PacketSize;
1742
2622
  }
1743
- if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
2623
+ if (PanelMode) count += (3 * PacketSize) * (stride - offset - depth);
1744
2624
  }
1745
2625
  }
1746
2626
  // Pack 2 packets
1747
- if(Pack1>=2*PacketSize)
1748
- {
1749
- for(; i<peeled_mc2; i+=2*PacketSize)
1750
- {
1751
- if(PanelMode) count += (2*PacketSize) * offset;
2627
+ if (Pack1 >= 2 * PacketSize) {
2628
+ for (; i < peeled_mc2; i += 2 * PacketSize) {
2629
+ if (PanelMode) count += (2 * PacketSize) * offset;
1752
2630
 
1753
- for(Index k=0; k<depth; k++)
1754
- {
2631
+ for (Index k = 0; k < depth; k++) {
1755
2632
  Packet A, B;
1756
- A = lhs.loadPacket(i+0*PacketSize, k);
1757
- B = lhs.loadPacket(i+1*PacketSize, k);
1758
- pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
1759
- pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
2633
+ A = lhs.template loadPacket<Packet>(i + 0 * PacketSize, k);
2634
+ B = lhs.template loadPacket<Packet>(i + 1 * PacketSize, k);
2635
+ pstore(blockA + count, cj.pconj(A));
2636
+ count += PacketSize;
2637
+ pstore(blockA + count, cj.pconj(B));
2638
+ count += PacketSize;
1760
2639
  }
1761
- if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
2640
+ if (PanelMode) count += (2 * PacketSize) * (stride - offset - depth);
1762
2641
  }
1763
2642
  }
1764
2643
  // Pack 1 packets
1765
- if(Pack1>=1*PacketSize)
1766
- {
1767
- for(; i<peeled_mc1; i+=1*PacketSize)
1768
- {
1769
- if(PanelMode) count += (1*PacketSize) * offset;
2644
+ if (Pack1 >= 1 * PacketSize) {
2645
+ for (; i < peeled_mc1; i += 1 * PacketSize) {
2646
+ if (PanelMode) count += (1 * PacketSize) * offset;
1770
2647
 
1771
- for(Index k=0; k<depth; k++)
1772
- {
2648
+ for (Index k = 0; k < depth; k++) {
1773
2649
  Packet A;
1774
- A = lhs.loadPacket(i+0*PacketSize, k);
1775
- pstore(blockA+count, cj.pconj(A));
1776
- count+=PacketSize;
2650
+ A = lhs.template loadPacket<Packet>(i + 0 * PacketSize, k);
2651
+ pstore(blockA + count, cj.pconj(A));
2652
+ count += PacketSize;
1777
2653
  }
1778
- if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
2654
+ if (PanelMode) count += (1 * PacketSize) * (stride - offset - depth);
1779
2655
  }
1780
2656
  }
1781
- // Pack scalars
1782
- if(Pack2<PacketSize && Pack2>1)
1783
- {
1784
- for(; i<peeled_mc0; i+=Pack2)
1785
- {
1786
- if(PanelMode) count += Pack2 * offset;
1787
-
1788
- for(Index k=0; k<depth; k++)
1789
- for(Index w=0; w<Pack2; w++)
1790
- blockA[count++] = cj(lhs(i+w, k));
1791
-
1792
- if(PanelMode) count += Pack2 * (stride-offset-depth);
2657
+ // Pack half packets
2658
+ if (HasHalf && Pack1 >= HalfPacketSize) {
2659
+ for (; i < peeled_mc_half; i += HalfPacketSize) {
2660
+ if (PanelMode) count += (HalfPacketSize)*offset;
2661
+
2662
+ for (Index k = 0; k < depth; k++) {
2663
+ HalfPacket A;
2664
+ A = lhs.template loadPacket<HalfPacket>(i + 0 * (HalfPacketSize), k);
2665
+ pstoreu(blockA + count, cj.pconj(A));
2666
+ count += HalfPacketSize;
2667
+ }
2668
+ if (PanelMode) count += (HalfPacketSize) * (stride - offset - depth);
2669
+ }
2670
+ }
2671
+ // Pack quarter packets
2672
+ if (HasQuarter && Pack1 >= QuarterPacketSize) {
2673
+ for (; i < peeled_mc_quarter; i += QuarterPacketSize) {
2674
+ if (PanelMode) count += (QuarterPacketSize)*offset;
2675
+
2676
+ for (Index k = 0; k < depth; k++) {
2677
+ QuarterPacket A;
2678
+ A = lhs.template loadPacket<QuarterPacket>(i + 0 * (QuarterPacketSize), k);
2679
+ pstoreu(blockA + count, cj.pconj(A));
2680
+ count += QuarterPacketSize;
2681
+ }
2682
+ if (PanelMode) count += (QuarterPacketSize) * (stride - offset - depth);
2683
+ }
2684
+ }
2685
+ // Pack2 may be *smaller* than PacketSize—that happens for
2686
+ // products like real * complex, where we have to go half the
2687
+ // progress on the lhs in order to duplicate those operands to
2688
+ // address both real & imaginary parts on the rhs. This portion will
2689
+ // pack those half ones until they match the number expected on the
2690
+ // last peeling loop at this point (for the rhs).
2691
+ if (Pack2 < PacketSize && Pack2 > 1) {
2692
+ for (; i < peeled_mc0; i += last_lhs_progress) {
2693
+ if (PanelMode) count += last_lhs_progress * offset;
2694
+
2695
+ for (Index k = 0; k < depth; k++)
2696
+ for (Index w = 0; w < last_lhs_progress; w++) blockA[count++] = cj(lhs(i + w, k));
2697
+
2698
+ if (PanelMode) count += last_lhs_progress * (stride - offset - depth);
1793
2699
  }
1794
2700
  }
1795
- for(; i<rows; i++)
1796
- {
1797
- if(PanelMode) count += offset;
1798
- for(Index k=0; k<depth; k++)
1799
- blockA[count++] = cj(lhs(i, k));
1800
- if(PanelMode) count += (stride-offset-depth);
2701
+ // Pack scalars
2702
+ for (; i < rows; i++) {
2703
+ if (PanelMode) count += offset;
2704
+ for (Index k = 0; k < depth; k++) blockA[count++] = cj(lhs(i, k));
2705
+ if (PanelMode) count += (stride - offset - depth);
1801
2706
  }
1802
2707
  }
1803
2708
 
1804
- template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1805
- struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
1806
- {
2709
+ template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
2710
+ bool PanelMode>
2711
+ struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> {
1807
2712
  typedef typename DataMapper::LinearMapper LinearMapper;
1808
- EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2713
+ EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
2714
+ Index offset = 0);
1809
2715
  };
1810
2716
 
1811
- template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1812
- EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
1813
- ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
1814
- {
1815
- typedef typename packet_traits<Scalar>::type Packet;
1816
- enum { PacketSize = packet_traits<Scalar>::size };
2717
+ template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
2718
+ bool PanelMode>
2719
+ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate,
2720
+ PanelMode>::operator()(Scalar* blockA, const DataMapper& lhs, Index depth,
2721
+ Index rows, Index stride, Index offset) {
2722
+ typedef typename unpacket_traits<Packet>::half HalfPacket;
2723
+ typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2724
+ enum {
2725
+ PacketSize = unpacket_traits<Packet>::size,
2726
+ HalfPacketSize = unpacket_traits<HalfPacket>::size,
2727
+ QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2728
+ HasHalf = (int)HalfPacketSize < (int)PacketSize,
2729
+ HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize
2730
+ };
1817
2731
 
1818
2732
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
1819
2733
  EIGEN_UNUSED_VARIABLE(stride);
1820
2734
  EIGEN_UNUSED_VARIABLE(offset);
1821
- eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2735
+ eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
1822
2736
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
1823
2737
  Index count = 0;
2738
+ bool gone_half = false, gone_quarter = false, gone_last = false;
1824
2739
 
1825
- // const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
1826
- // const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
1827
- // const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
1828
-
1829
- int pack = Pack1;
1830
2740
  Index i = 0;
1831
- while(pack>0)
1832
- {
1833
- Index remaining_rows = rows-i;
1834
- Index peeled_mc = i+(remaining_rows/pack)*pack;
1835
- for(; i<peeled_mc; i+=pack)
1836
- {
1837
- if(PanelMode) count += pack * offset;
1838
-
1839
- const Index peeled_k = (depth/PacketSize)*PacketSize;
1840
- Index k=0;
1841
- if(pack>=PacketSize)
1842
- {
1843
- for(; k<peeled_k; k+=PacketSize)
1844
- {
1845
- for (Index m = 0; m < pack; m += PacketSize)
1846
- {
1847
- PacketBlock<Packet> kernel;
1848
- for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
1849
- ptranspose(kernel);
1850
- for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
2741
+ Index pack = Pack1;
2742
+ Index psize = PacketSize;
2743
+ while (pack > 0) {
2744
+ Index remaining_rows = rows - i;
2745
+ Index peeled_mc = gone_last ? Pack2 > 1 ? (rows / pack) * pack : 0 : i + (remaining_rows / pack) * pack;
2746
+ Index starting_pos = i;
2747
+ for (; i < peeled_mc; i += pack) {
2748
+ if (PanelMode) count += pack * offset;
2749
+
2750
+ Index k = 0;
2751
+ if (pack >= psize && psize >= QuarterPacketSize) {
2752
+ const Index peeled_k = (depth / psize) * psize;
2753
+ for (; k < peeled_k; k += psize) {
2754
+ for (Index m = 0; m < pack; m += psize) {
2755
+ if (psize == PacketSize) {
2756
+ PacketBlock<Packet> kernel;
2757
+ for (Index p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i + p + m, k);
2758
+ ptranspose(kernel);
2759
+ for (Index p = 0; p < psize; ++p) pstore(blockA + count + m + (pack)*p, cj.pconj(kernel.packet[p]));
2760
+ } else if (HasHalf && psize == HalfPacketSize) {
2761
+ gone_half = true;
2762
+ PacketBlock<HalfPacket> kernel_half;
2763
+ for (Index p = 0; p < psize; ++p)
2764
+ kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i + p + m, k);
2765
+ ptranspose(kernel_half);
2766
+ for (Index p = 0; p < psize; ++p) pstore(blockA + count + m + (pack)*p, cj.pconj(kernel_half.packet[p]));
2767
+ } else if (HasQuarter && psize == QuarterPacketSize) {
2768
+ gone_quarter = true;
2769
+ PacketBlock<QuarterPacket> kernel_quarter;
2770
+ for (Index p = 0; p < psize; ++p)
2771
+ kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i + p + m, k);
2772
+ ptranspose(kernel_quarter);
2773
+ for (Index p = 0; p < psize; ++p)
2774
+ pstore(blockA + count + m + (pack)*p, cj.pconj(kernel_quarter.packet[p]));
2775
+ }
1851
2776
  }
1852
- count += PacketSize*pack;
2777
+ count += psize * pack;
1853
2778
  }
1854
2779
  }
1855
- for(; k<depth; k++)
1856
- {
1857
- Index w=0;
1858
- for(; w<pack-3; w+=4)
1859
- {
1860
- Scalar a(cj(lhs(i+w+0, k))),
1861
- b(cj(lhs(i+w+1, k))),
1862
- c(cj(lhs(i+w+2, k))),
1863
- d(cj(lhs(i+w+3, k)));
2780
+
2781
+ for (; k < depth; k++) {
2782
+ Index w = 0;
2783
+ for (; w < pack - 3; w += 4) {
2784
+ Scalar a(cj(lhs(i + w + 0, k))), b(cj(lhs(i + w + 1, k))), c(cj(lhs(i + w + 2, k))), d(cj(lhs(i + w + 3, k)));
1864
2785
  blockA[count++] = a;
1865
2786
  blockA[count++] = b;
1866
2787
  blockA[count++] = c;
1867
2788
  blockA[count++] = d;
1868
2789
  }
1869
- if(pack%4)
1870
- for(;w<pack;++w)
1871
- blockA[count++] = cj(lhs(i+w, k));
2790
+ if (pack % 4)
2791
+ for (; w < pack; ++w) blockA[count++] = cj(lhs(i + w, k));
1872
2792
  }
1873
2793
 
1874
- if(PanelMode) count += pack * (stride-offset-depth);
2794
+ if (PanelMode) count += pack * (stride - offset - depth);
1875
2795
  }
1876
2796
 
1877
- pack -= PacketSize;
1878
- if(pack<Pack2 && (pack+PacketSize)!=Pack2)
1879
- pack = Pack2;
2797
+ pack -= psize;
2798
+ Index left = rows - i;
2799
+ if (pack <= 0) {
2800
+ if (!gone_last && (starting_pos == i || left >= psize / 2 || left >= psize / 4) &&
2801
+ ((psize / 2 == HalfPacketSize && HasHalf && !gone_half) ||
2802
+ (psize / 2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
2803
+ psize /= 2;
2804
+ pack = psize;
2805
+ continue;
2806
+ }
2807
+ // Pack2 may be *smaller* than PacketSize—that happens for
2808
+ // products like real * complex, where we have to go half the
2809
+ // progress on the lhs in order to duplicate those operands to
2810
+ // address both real & imaginary parts on the rhs. This portion will
2811
+ // pack those half ones until they match the number expected on the
2812
+ // last peeling loop at this point (for the rhs).
2813
+ if (Pack2 < PacketSize && !gone_last) {
2814
+ gone_last = true;
2815
+ psize = pack = left & ~1;
2816
+ }
2817
+ }
1880
2818
  }
1881
2819
 
1882
- for(; i<rows; i++)
1883
- {
1884
- if(PanelMode) count += offset;
1885
- for(Index k=0; k<depth; k++)
1886
- blockA[count++] = cj(lhs(i, k));
1887
- if(PanelMode) count += (stride-offset-depth);
2820
+ for (; i < rows; i++) {
2821
+ if (PanelMode) count += offset;
2822
+ for (Index k = 0; k < depth; k++) blockA[count++] = cj(lhs(i, k));
2823
+ if (PanelMode) count += (stride - offset - depth);
1888
2824
  }
1889
2825
  }
1890
2826
 
@@ -1895,263 +2831,323 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Ro
1895
2831
  // 4 5 6 7 16 17 18 19 25 28
1896
2832
  // 8 9 10 11 20 21 22 23 26 29
1897
2833
  // . . . . . . . . . .
1898
- template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
1899
- struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
1900
- {
2834
+ template <typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2835
+ struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
1901
2836
  typedef typename packet_traits<Scalar>::type Packet;
1902
2837
  typedef typename DataMapper::LinearMapper LinearMapper;
1903
2838
  enum { PacketSize = packet_traits<Scalar>::size };
1904
- EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2839
+ EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
2840
+ Index offset = 0);
1905
2841
  };
1906
2842
 
1907
- template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
1908
- EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
1909
- ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
1910
- {
2843
+ template <typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2844
+ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::operator()(
2845
+ Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
1911
2846
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
1912
2847
  EIGEN_UNUSED_VARIABLE(stride);
1913
2848
  EIGEN_UNUSED_VARIABLE(offset);
1914
- eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2849
+ eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
1915
2850
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
1916
- Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
1917
- Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2851
+ Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
2852
+ Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
1918
2853
  Index count = 0;
1919
- const Index peeled_k = (depth/PacketSize)*PacketSize;
1920
- // if(nr>=8)
1921
- // {
1922
- // for(Index j2=0; j2<packet_cols8; j2+=8)
1923
- // {
1924
- // // skip what we have before
1925
- // if(PanelMode) count += 8 * offset;
1926
- // const Scalar* b0 = &rhs[(j2+0)*rhsStride];
1927
- // const Scalar* b1 = &rhs[(j2+1)*rhsStride];
1928
- // const Scalar* b2 = &rhs[(j2+2)*rhsStride];
1929
- // const Scalar* b3 = &rhs[(j2+3)*rhsStride];
1930
- // const Scalar* b4 = &rhs[(j2+4)*rhsStride];
1931
- // const Scalar* b5 = &rhs[(j2+5)*rhsStride];
1932
- // const Scalar* b6 = &rhs[(j2+6)*rhsStride];
1933
- // const Scalar* b7 = &rhs[(j2+7)*rhsStride];
1934
- // Index k=0;
1935
- // if(PacketSize==8) // TODO enbale vectorized transposition for PacketSize==4
1936
- // {
1937
- // for(; k<peeled_k; k+=PacketSize) {
1938
- // PacketBlock<Packet> kernel;
1939
- // for (int p = 0; p < PacketSize; ++p) {
1940
- // kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
1941
- // }
1942
- // ptranspose(kernel);
1943
- // for (int p = 0; p < PacketSize; ++p) {
1944
- // pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
1945
- // count+=PacketSize;
1946
- // }
1947
- // }
1948
- // }
1949
- // for(; k<depth; k++)
1950
- // {
1951
- // blockB[count+0] = cj(b0[k]);
1952
- // blockB[count+1] = cj(b1[k]);
1953
- // blockB[count+2] = cj(b2[k]);
1954
- // blockB[count+3] = cj(b3[k]);
1955
- // blockB[count+4] = cj(b4[k]);
1956
- // blockB[count+5] = cj(b5[k]);
1957
- // blockB[count+6] = cj(b6[k]);
1958
- // blockB[count+7] = cj(b7[k]);
1959
- // count += 8;
1960
- // }
1961
- // // skip what we have after
1962
- // if(PanelMode) count += 8 * (stride-offset-depth);
1963
- // }
1964
- // }
1965
-
1966
- if(nr>=4)
1967
- {
1968
- for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
1969
- {
2854
+ const Index peeled_k = (depth / PacketSize) * PacketSize;
2855
+
2856
+ #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
2857
+ EIGEN_IF_CONSTEXPR(nr >= 8) {
2858
+ for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
1970
2859
  // skip what we have before
1971
- if(PanelMode) count += 4 * offset;
2860
+ if (PanelMode) count += 8 * offset;
1972
2861
  const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
1973
2862
  const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
1974
2863
  const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
1975
2864
  const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
2865
+ const LinearMapper dm4 = rhs.getLinearMapper(0, j2 + 4);
2866
+ const LinearMapper dm5 = rhs.getLinearMapper(0, j2 + 5);
2867
+ const LinearMapper dm6 = rhs.getLinearMapper(0, j2 + 6);
2868
+ const LinearMapper dm7 = rhs.getLinearMapper(0, j2 + 7);
2869
+ Index k = 0;
2870
+ if (PacketSize % 2 == 0 && PacketSize <= 8) // 2 4 8
2871
+ {
2872
+ for (; k < peeled_k; k += PacketSize) {
2873
+ if (PacketSize == 2) {
2874
+ PacketBlock<Packet, PacketSize == 2 ? 2 : PacketSize> kernel0, kernel1, kernel2, kernel3;
2875
+ kernel0.packet[0 % PacketSize] = dm0.template loadPacket<Packet>(k);
2876
+ kernel0.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
2877
+ kernel1.packet[0 % PacketSize] = dm2.template loadPacket<Packet>(k);
2878
+ kernel1.packet[1 % PacketSize] = dm3.template loadPacket<Packet>(k);
2879
+ kernel2.packet[0 % PacketSize] = dm4.template loadPacket<Packet>(k);
2880
+ kernel2.packet[1 % PacketSize] = dm5.template loadPacket<Packet>(k);
2881
+ kernel3.packet[0 % PacketSize] = dm6.template loadPacket<Packet>(k);
2882
+ kernel3.packet[1 % PacketSize] = dm7.template loadPacket<Packet>(k);
2883
+ ptranspose(kernel0);
2884
+ ptranspose(kernel1);
2885
+ ptranspose(kernel2);
2886
+ ptranspose(kernel3);
2887
+
2888
+ pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel0.packet[0 % PacketSize]));
2889
+ pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel1.packet[0 % PacketSize]));
2890
+ pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel2.packet[0 % PacketSize]));
2891
+ pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel3.packet[0 % PacketSize]));
2892
+
2893
+ pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel0.packet[1 % PacketSize]));
2894
+ pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel1.packet[1 % PacketSize]));
2895
+ pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel2.packet[1 % PacketSize]));
2896
+ pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel3.packet[1 % PacketSize]));
2897
+ count += 8 * PacketSize;
2898
+ } else if (PacketSize == 4) {
2899
+ PacketBlock<Packet, PacketSize == 4 ? 4 : PacketSize> kernel0, kernel1;
2900
+
2901
+ kernel0.packet[0 % PacketSize] = dm0.template loadPacket<Packet>(k);
2902
+ kernel0.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
2903
+ kernel0.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
2904
+ kernel0.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
2905
+ kernel1.packet[0 % PacketSize] = dm4.template loadPacket<Packet>(k);
2906
+ kernel1.packet[1 % PacketSize] = dm5.template loadPacket<Packet>(k);
2907
+ kernel1.packet[2 % PacketSize] = dm6.template loadPacket<Packet>(k);
2908
+ kernel1.packet[3 % PacketSize] = dm7.template loadPacket<Packet>(k);
2909
+ ptranspose(kernel0);
2910
+ ptranspose(kernel1);
2911
+
2912
+ pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel0.packet[0 % PacketSize]));
2913
+ pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel1.packet[0 % PacketSize]));
2914
+ pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel0.packet[1 % PacketSize]));
2915
+ pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel1.packet[1 % PacketSize]));
2916
+ pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel0.packet[2 % PacketSize]));
2917
+ pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel1.packet[2 % PacketSize]));
2918
+ pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel0.packet[3 % PacketSize]));
2919
+ pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel1.packet[3 % PacketSize]));
2920
+ count += 8 * PacketSize;
2921
+ } else if (PacketSize == 8) {
2922
+ PacketBlock<Packet, PacketSize == 8 ? 8 : PacketSize> kernel0;
2923
+
2924
+ kernel0.packet[0 % PacketSize] = dm0.template loadPacket<Packet>(k);
2925
+ kernel0.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
2926
+ kernel0.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
2927
+ kernel0.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
2928
+ kernel0.packet[4 % PacketSize] = dm4.template loadPacket<Packet>(k);
2929
+ kernel0.packet[5 % PacketSize] = dm5.template loadPacket<Packet>(k);
2930
+ kernel0.packet[6 % PacketSize] = dm6.template loadPacket<Packet>(k);
2931
+ kernel0.packet[7 % PacketSize] = dm7.template loadPacket<Packet>(k);
2932
+ ptranspose(kernel0);
2933
+
2934
+ pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel0.packet[0 % PacketSize]));
2935
+ pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel0.packet[1 % PacketSize]));
2936
+ pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel0.packet[2 % PacketSize]));
2937
+ pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel0.packet[3 % PacketSize]));
2938
+ pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel0.packet[4 % PacketSize]));
2939
+ pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel0.packet[5 % PacketSize]));
2940
+ pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel0.packet[6 % PacketSize]));
2941
+ pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel0.packet[7 % PacketSize]));
2942
+ count += 8 * PacketSize;
2943
+ }
2944
+ }
2945
+ }
1976
2946
 
1977
- Index k=0;
1978
- if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
2947
+ for (; k < depth; k++) {
2948
+ blockB[count + 0] = cj(dm0(k));
2949
+ blockB[count + 1] = cj(dm1(k));
2950
+ blockB[count + 2] = cj(dm2(k));
2951
+ blockB[count + 3] = cj(dm3(k));
2952
+ blockB[count + 4] = cj(dm4(k));
2953
+ blockB[count + 5] = cj(dm5(k));
2954
+ blockB[count + 6] = cj(dm6(k));
2955
+ blockB[count + 7] = cj(dm7(k));
2956
+ count += 8;
2957
+ }
2958
+ // skip what we have after
2959
+ if (PanelMode) count += 8 * (stride - offset - depth);
2960
+ }
2961
+ }
2962
+ #endif
2963
+
2964
+ EIGEN_IF_CONSTEXPR(nr >= 4) {
2965
+ for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
2966
+ // skip what we have before
2967
+ if (PanelMode) count += 4 * offset;
2968
+ const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
2969
+ const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
2970
+ const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
2971
+ const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
2972
+
2973
+ Index k = 0;
2974
+ if ((PacketSize % 4) == 0) // TODO enable vectorized transposition for PacketSize==2 ??
1979
2975
  {
1980
- for(; k<peeled_k; k+=PacketSize) {
1981
- PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
1982
- kernel.packet[0] = dm0.loadPacket(k);
1983
- kernel.packet[1%PacketSize] = dm1.loadPacket(k);
1984
- kernel.packet[2%PacketSize] = dm2.loadPacket(k);
1985
- kernel.packet[3%PacketSize] = dm3.loadPacket(k);
2976
+ for (; k < peeled_k; k += PacketSize) {
2977
+ PacketBlock<Packet, (PacketSize % 4) == 0 ? 4 : PacketSize> kernel;
2978
+ kernel.packet[0] = dm0.template loadPacket<Packet>(k);
2979
+ kernel.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
2980
+ kernel.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
2981
+ kernel.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
1986
2982
  ptranspose(kernel);
1987
- pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
1988
- pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
1989
- pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
1990
- pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
1991
- count+=4*PacketSize;
2983
+ pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel.packet[0]));
2984
+ pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel.packet[1 % PacketSize]));
2985
+ pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel.packet[2 % PacketSize]));
2986
+ pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel.packet[3 % PacketSize]));
2987
+ count += 4 * PacketSize;
1992
2988
  }
1993
2989
  }
1994
- for(; k<depth; k++)
1995
- {
1996
- blockB[count+0] = cj(dm0(k));
1997
- blockB[count+1] = cj(dm1(k));
1998
- blockB[count+2] = cj(dm2(k));
1999
- blockB[count+3] = cj(dm3(k));
2990
+ for (; k < depth; k++) {
2991
+ blockB[count + 0] = cj(dm0(k));
2992
+ blockB[count + 1] = cj(dm1(k));
2993
+ blockB[count + 2] = cj(dm2(k));
2994
+ blockB[count + 3] = cj(dm3(k));
2000
2995
  count += 4;
2001
2996
  }
2002
2997
  // skip what we have after
2003
- if(PanelMode) count += 4 * (stride-offset-depth);
2998
+ if (PanelMode) count += 4 * (stride - offset - depth);
2004
2999
  }
2005
3000
  }
2006
3001
 
2007
3002
  // copy the remaining columns one at a time (nr==1)
2008
- for(Index j2=packet_cols4; j2<cols; ++j2)
2009
- {
2010
- if(PanelMode) count += offset;
3003
+ for (Index j2 = packet_cols4; j2 < cols; ++j2) {
3004
+ if (PanelMode) count += offset;
2011
3005
  const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
2012
- for(Index k=0; k<depth; k++)
2013
- {
3006
+ for (Index k = 0; k < depth; k++) {
2014
3007
  blockB[count] = cj(dm0(k));
2015
3008
  count += 1;
2016
3009
  }
2017
- if(PanelMode) count += (stride-offset-depth);
3010
+ if (PanelMode) count += (stride - offset - depth);
2018
3011
  }
2019
3012
  }
2020
3013
 
2021
3014
  // this version is optimized for row major matrices
2022
- template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2023
- struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2024
- {
3015
+ template <typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3016
+ struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> {
2025
3017
  typedef typename packet_traits<Scalar>::type Packet;
3018
+ typedef typename unpacket_traits<Packet>::half HalfPacket;
3019
+ typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2026
3020
  typedef typename DataMapper::LinearMapper LinearMapper;
2027
- enum { PacketSize = packet_traits<Scalar>::size };
2028
- EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2029
- };
2030
-
2031
- template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2032
- EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2033
- ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2034
- {
2035
- EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
2036
- EIGEN_UNUSED_VARIABLE(stride);
2037
- EIGEN_UNUSED_VARIABLE(offset);
2038
- eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2039
- conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2040
- Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2041
- Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2042
- Index count = 0;
3021
+ enum {
3022
+ PacketSize = packet_traits<Scalar>::size,
3023
+ HalfPacketSize = unpacket_traits<HalfPacket>::size,
3024
+ QuarterPacketSize = unpacket_traits<QuarterPacket>::size
3025
+ };
3026
+ EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
3027
+ Index offset = 0) {
3028
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
3029
+ EIGEN_UNUSED_VARIABLE(stride);
3030
+ EIGEN_UNUSED_VARIABLE(offset);
3031
+ eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
3032
+ const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
3033
+ const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
3034
+ conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
3035
+ Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
3036
+ Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
3037
+ Index count = 0;
3038
+
3039
+ #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
3040
+ EIGEN_IF_CONSTEXPR(nr >= 8) {
3041
+ for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
3042
+ // skip what we have before
3043
+ if (PanelMode) count += 8 * offset;
3044
+ for (Index k = 0; k < depth; k++) {
3045
+ if (PacketSize == 8) {
3046
+ Packet A = rhs.template loadPacket<Packet>(k, j2);
3047
+ pstoreu(blockB + count, cj.pconj(A));
3048
+ count += PacketSize;
3049
+ } else if (PacketSize == 4) {
3050
+ Packet A = rhs.template loadPacket<Packet>(k, j2);
3051
+ Packet B = rhs.template loadPacket<Packet>(k, j2 + 4);
3052
+ pstoreu(blockB + count, cj.pconj(A));
3053
+ pstoreu(blockB + count + PacketSize, cj.pconj(B));
3054
+ count += 2 * PacketSize;
3055
+ } else {
3056
+ const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
3057
+ blockB[count + 0] = cj(dm0(0));
3058
+ blockB[count + 1] = cj(dm0(1));
3059
+ blockB[count + 2] = cj(dm0(2));
3060
+ blockB[count + 3] = cj(dm0(3));
3061
+ blockB[count + 4] = cj(dm0(4));
3062
+ blockB[count + 5] = cj(dm0(5));
3063
+ blockB[count + 6] = cj(dm0(6));
3064
+ blockB[count + 7] = cj(dm0(7));
3065
+ count += 8;
3066
+ }
3067
+ }
3068
+ // skip what we have after
3069
+ if (PanelMode) count += 8 * (stride - offset - depth);
3070
+ }
3071
+ }
3072
+ #endif
2043
3073
 
2044
- // if(nr>=8)
2045
- // {
2046
- // for(Index j2=0; j2<packet_cols8; j2+=8)
2047
- // {
2048
- // // skip what we have before
2049
- // if(PanelMode) count += 8 * offset;
2050
- // for(Index k=0; k<depth; k++)
2051
- // {
2052
- // if (PacketSize==8) {
2053
- // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2054
- // pstoreu(blockB+count, cj.pconj(A));
2055
- // } else if (PacketSize==4) {
2056
- // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2057
- // Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
2058
- // pstoreu(blockB+count, cj.pconj(A));
2059
- // pstoreu(blockB+count+PacketSize, cj.pconj(B));
2060
- // } else {
2061
- // const Scalar* b0 = &rhs[k*rhsStride + j2];
2062
- // blockB[count+0] = cj(b0[0]);
2063
- // blockB[count+1] = cj(b0[1]);
2064
- // blockB[count+2] = cj(b0[2]);
2065
- // blockB[count+3] = cj(b0[3]);
2066
- // blockB[count+4] = cj(b0[4]);
2067
- // blockB[count+5] = cj(b0[5]);
2068
- // blockB[count+6] = cj(b0[6]);
2069
- // blockB[count+7] = cj(b0[7]);
2070
- // }
2071
- // count += 8;
2072
- // }
2073
- // // skip what we have after
2074
- // if(PanelMode) count += 8 * (stride-offset-depth);
2075
- // }
2076
- // }
2077
- if(nr>=4)
2078
- {
2079
- for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2080
- {
2081
- // skip what we have before
2082
- if(PanelMode) count += 4 * offset;
2083
- for(Index k=0; k<depth; k++)
2084
- {
2085
- if (PacketSize==4) {
2086
- Packet A = rhs.loadPacket(k, j2);
2087
- pstoreu(blockB+count, cj.pconj(A));
2088
- count += PacketSize;
2089
- } else {
2090
- const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
2091
- blockB[count+0] = cj(dm0(0));
2092
- blockB[count+1] = cj(dm0(1));
2093
- blockB[count+2] = cj(dm0(2));
2094
- blockB[count+3] = cj(dm0(3));
2095
- count += 4;
3074
+ if (nr >= 4) {
3075
+ for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
3076
+ // skip what we have before
3077
+ if (PanelMode) count += 4 * offset;
3078
+ for (Index k = 0; k < depth; k++) {
3079
+ if (PacketSize == 4) {
3080
+ Packet A = rhs.template loadPacket<Packet>(k, j2);
3081
+ pstoreu(blockB + count, cj.pconj(A));
3082
+ count += PacketSize;
3083
+ } else if (HasHalf && HalfPacketSize == 4) {
3084
+ HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
3085
+ pstoreu(blockB + count, cj.pconj(A));
3086
+ count += HalfPacketSize;
3087
+ } else if (HasQuarter && QuarterPacketSize == 4) {
3088
+ QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
3089
+ pstoreu(blockB + count, cj.pconj(A));
3090
+ count += QuarterPacketSize;
3091
+ } else {
3092
+ const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
3093
+ blockB[count + 0] = cj(dm0(0));
3094
+ blockB[count + 1] = cj(dm0(1));
3095
+ blockB[count + 2] = cj(dm0(2));
3096
+ blockB[count + 3] = cj(dm0(3));
3097
+ count += 4;
3098
+ }
2096
3099
  }
3100
+ // skip what we have after
3101
+ if (PanelMode) count += 4 * (stride - offset - depth);
2097
3102
  }
2098
- // skip what we have after
2099
- if(PanelMode) count += 4 * (stride-offset-depth);
2100
3103
  }
2101
- }
2102
- // copy the remaining columns one at a time (nr==1)
2103
- for(Index j2=packet_cols4; j2<cols; ++j2)
2104
- {
2105
- if(PanelMode) count += offset;
2106
- for(Index k=0; k<depth; k++)
2107
- {
2108
- blockB[count] = cj(rhs(k, j2));
2109
- count += 1;
3104
+ // copy the remaining columns one at a time (nr==1)
3105
+ for (Index j2 = packet_cols4; j2 < cols; ++j2) {
3106
+ if (PanelMode) count += offset;
3107
+ for (Index k = 0; k < depth; k++) {
3108
+ blockB[count] = cj(rhs(k, j2));
3109
+ count += 1;
3110
+ }
3111
+ if (PanelMode) count += stride - offset - depth;
2110
3112
  }
2111
- if(PanelMode) count += stride-offset-depth;
2112
3113
  }
2113
- }
3114
+ };
2114
3115
 
2115
- } // end namespace internal
3116
+ } // end namespace internal
2116
3117
 
2117
3118
  /** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
2118
- * \sa setCpuCacheSize */
2119
- inline std::ptrdiff_t l1CacheSize()
2120
- {
3119
+ * \sa setCpuCacheSize */
3120
+ inline std::ptrdiff_t l1CacheSize() {
2121
3121
  std::ptrdiff_t l1, l2, l3;
2122
3122
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2123
3123
  return l1;
2124
3124
  }
2125
3125
 
2126
3126
  /** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
2127
- * \sa setCpuCacheSize */
2128
- inline std::ptrdiff_t l2CacheSize()
2129
- {
3127
+ * \sa setCpuCacheSize */
3128
+ inline std::ptrdiff_t l2CacheSize() {
2130
3129
  std::ptrdiff_t l1, l2, l3;
2131
3130
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2132
3131
  return l2;
2133
3132
  }
2134
3133
 
2135
- /** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size paramete\
2136
- rs.
2137
- * \sa setCpuCacheSize */
2138
- inline std::ptrdiff_t l3CacheSize()
2139
- {
3134
+ /** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
3135
+ * \sa setCpuCacheSize */
3136
+ inline std::ptrdiff_t l3CacheSize() {
2140
3137
  std::ptrdiff_t l1, l2, l3;
2141
3138
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2142
3139
  return l3;
2143
3140
  }
2144
3141
 
2145
3142
  /** Set the cpu L1 and L2 cache sizes (in bytes).
2146
- * These values are use to adjust the size of the blocks
2147
- * for the algorithms working per blocks.
2148
- *
2149
- * \sa computeProductBlockingSizes */
2150
- inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
2151
- {
3143
+ * These values are use to adjust the size of the blocks
3144
+ * for the algorithms working per blocks.
3145
+ *
3146
+ * \sa computeProductBlockingSizes */
3147
+ inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3) {
2152
3148
  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
2153
3149
  }
2154
3150
 
2155
- } // end namespace Eigen
3151
+ } // end namespace Eigen
2156
3152
 
2157
- #endif // EIGEN_GENERAL_BLOCK_PANEL_H
3153
+ #endif // EIGEN_GENERAL_BLOCK_PANEL_H