@smake/eigen 1.1.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431)
  1. package/README.md +1 -1
  2. package/eigen/Eigen/AccelerateSupport +52 -0
  3. package/eigen/Eigen/Cholesky +18 -20
  4. package/eigen/Eigen/CholmodSupport +28 -28
  5. package/eigen/Eigen/Core +187 -120
  6. package/eigen/Eigen/Eigenvalues +16 -13
  7. package/eigen/Eigen/Geometry +18 -18
  8. package/eigen/Eigen/Householder +9 -7
  9. package/eigen/Eigen/IterativeLinearSolvers +8 -4
  10. package/eigen/Eigen/Jacobi +14 -13
  11. package/eigen/Eigen/KLUSupport +23 -21
  12. package/eigen/Eigen/LU +15 -16
  13. package/eigen/Eigen/MetisSupport +12 -12
  14. package/eigen/Eigen/OrderingMethods +54 -51
  15. package/eigen/Eigen/PaStiXSupport +23 -21
  16. package/eigen/Eigen/PardisoSupport +17 -14
  17. package/eigen/Eigen/QR +18 -20
  18. package/eigen/Eigen/QtAlignedMalloc +5 -12
  19. package/eigen/Eigen/SPQRSupport +21 -14
  20. package/eigen/Eigen/SVD +23 -17
  21. package/eigen/Eigen/Sparse +1 -2
  22. package/eigen/Eigen/SparseCholesky +18 -15
  23. package/eigen/Eigen/SparseCore +18 -17
  24. package/eigen/Eigen/SparseLU +9 -9
  25. package/eigen/Eigen/SparseQR +16 -14
  26. package/eigen/Eigen/StdDeque +5 -2
  27. package/eigen/Eigen/StdList +5 -2
  28. package/eigen/Eigen/StdVector +5 -2
  29. package/eigen/Eigen/SuperLUSupport +30 -24
  30. package/eigen/Eigen/ThreadPool +80 -0
  31. package/eigen/Eigen/UmfPackSupport +19 -17
  32. package/eigen/Eigen/Version +14 -0
  33. package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
  34. package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
  35. package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
  36. package/eigen/Eigen/src/Cholesky/LDLT.h +366 -405
  37. package/eigen/Eigen/src/Cholesky/LLT.h +323 -367
  38. package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
  39. package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +585 -529
  40. package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
  41. package/eigen/Eigen/src/Core/ArithmeticSequence.h +143 -317
  42. package/eigen/Eigen/src/Core/Array.h +329 -370
  43. package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
  44. package/eigen/Eigen/src/Core/ArrayWrapper.h +126 -170
  45. package/eigen/Eigen/src/Core/Assign.h +30 -40
  46. package/eigen/Eigen/src/Core/AssignEvaluator.h +651 -604
  47. package/eigen/Eigen/src/Core/Assign_MKL.h +125 -120
  48. package/eigen/Eigen/src/Core/BandMatrix.h +267 -282
  49. package/eigen/Eigen/src/Core/Block.h +371 -390
  50. package/eigen/Eigen/src/Core/CommaInitializer.h +85 -100
  51. package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
  52. package/eigen/Eigen/src/Core/CoreEvaluators.h +1214 -937
  53. package/eigen/Eigen/src/Core/CoreIterators.h +72 -63
  54. package/eigen/Eigen/src/Core/CwiseBinaryOp.h +112 -129
  55. package/eigen/Eigen/src/Core/CwiseNullaryOp.h +676 -702
  56. package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
  57. package/eigen/Eigen/src/Core/CwiseUnaryOp.h +55 -67
  58. package/eigen/Eigen/src/Core/CwiseUnaryView.h +127 -92
  59. package/eigen/Eigen/src/Core/DenseBase.h +630 -658
  60. package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -628
  61. package/eigen/Eigen/src/Core/DenseStorage.h +511 -590
  62. package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
  63. package/eigen/Eigen/src/Core/Diagonal.h +168 -207
  64. package/eigen/Eigen/src/Core/DiagonalMatrix.h +346 -317
  65. package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
  66. package/eigen/Eigen/src/Core/Dot.h +167 -217
  67. package/eigen/Eigen/src/Core/EigenBase.h +74 -85
  68. package/eigen/Eigen/src/Core/Fill.h +138 -0
  69. package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
  70. package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -113
  71. package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
  72. package/eigen/Eigen/src/Core/GeneralProduct.h +315 -261
  73. package/eigen/Eigen/src/Core/GenericPacketMath.h +1182 -520
  74. package/eigen/Eigen/src/Core/GlobalFunctions.h +193 -157
  75. package/eigen/Eigen/src/Core/IO.h +131 -156
  76. package/eigen/Eigen/src/Core/IndexedView.h +209 -125
  77. package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
  78. package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
  79. package/eigen/Eigen/src/Core/Inverse.h +50 -59
  80. package/eigen/Eigen/src/Core/Map.h +123 -141
  81. package/eigen/Eigen/src/Core/MapBase.h +255 -282
  82. package/eigen/Eigen/src/Core/MathFunctions.h +1247 -1201
  83. package/eigen/Eigen/src/Core/MathFunctionsImpl.h +162 -99
  84. package/eigen/Eigen/src/Core/Matrix.h +463 -494
  85. package/eigen/Eigen/src/Core/MatrixBase.h +468 -470
  86. package/eigen/Eigen/src/Core/NestByValue.h +58 -52
  87. package/eigen/Eigen/src/Core/NoAlias.h +79 -86
  88. package/eigen/Eigen/src/Core/NumTraits.h +206 -206
  89. package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +163 -142
  90. package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
  91. package/eigen/Eigen/src/Core/PlainObjectBase.h +858 -972
  92. package/eigen/Eigen/src/Core/Product.h +246 -130
  93. package/eigen/Eigen/src/Core/ProductEvaluators.h +779 -671
  94. package/eigen/Eigen/src/Core/Random.h +153 -164
  95. package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
  96. package/eigen/Eigen/src/Core/RealView.h +250 -0
  97. package/eigen/Eigen/src/Core/Redux.h +334 -314
  98. package/eigen/Eigen/src/Core/Ref.h +259 -257
  99. package/eigen/Eigen/src/Core/Replicate.h +92 -104
  100. package/eigen/Eigen/src/Core/Reshaped.h +215 -271
  101. package/eigen/Eigen/src/Core/ReturnByValue.h +47 -55
  102. package/eigen/Eigen/src/Core/Reverse.h +133 -148
  103. package/eigen/Eigen/src/Core/Select.h +68 -140
  104. package/eigen/Eigen/src/Core/SelfAdjointView.h +254 -290
  105. package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
  106. package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
  107. package/eigen/Eigen/src/Core/Solve.h +88 -102
  108. package/eigen/Eigen/src/Core/SolveTriangular.h +126 -124
  109. package/eigen/Eigen/src/Core/SolverBase.h +132 -133
  110. package/eigen/Eigen/src/Core/StableNorm.h +113 -147
  111. package/eigen/Eigen/src/Core/StlIterators.h +404 -248
  112. package/eigen/Eigen/src/Core/Stride.h +90 -92
  113. package/eigen/Eigen/src/Core/Swap.h +70 -39
  114. package/eigen/Eigen/src/Core/Transpose.h +258 -295
  115. package/eigen/Eigen/src/Core/Transpositions.h +270 -333
  116. package/eigen/Eigen/src/Core/TriangularMatrix.h +642 -743
  117. package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
  118. package/eigen/Eigen/src/Core/VectorwiseOp.h +653 -704
  119. package/eigen/Eigen/src/Core/Visitor.h +464 -308
  120. package/eigen/Eigen/src/Core/arch/AVX/Complex.h +380 -187
  121. package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +65 -163
  122. package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2145 -638
  123. package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
  124. package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +253 -60
  125. package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +278 -228
  126. package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
  127. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +48 -269
  128. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
  129. package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1597 -754
  130. package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
  131. package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
  132. package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
  133. package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
  134. package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +229 -41
  135. package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
  136. package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +420 -184
  137. package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +40 -49
  138. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2962 -2213
  139. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +196 -212
  140. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +713 -441
  141. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
  142. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
  143. package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2380 -1362
  144. package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
  145. package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +390 -224
  146. package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +78 -67
  147. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1784 -799
  148. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +167 -50
  149. package/eigen/Eigen/src/Core/arch/Default/Half.h +528 -379
  150. package/eigen/Eigen/src/Core/arch/Default/Settings.h +10 -12
  151. package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
  152. package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +41 -40
  153. package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +550 -523
  154. package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
  155. package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +27 -30
  156. package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +8 -8
  157. package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
  158. package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
  159. package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
  160. package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
  161. package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
  162. package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
  163. package/eigen/Eigen/src/Core/arch/MSA/Complex.h +54 -82
  164. package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +84 -92
  165. package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +51 -47
  166. package/eigen/Eigen/src/Core/arch/NEON/Complex.h +454 -306
  167. package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +175 -115
  168. package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +23 -30
  169. package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4366 -2857
  170. package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +616 -393
  171. package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
  172. package/eigen/Eigen/src/Core/arch/SSE/Complex.h +350 -198
  173. package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +38 -149
  174. package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +1791 -912
  175. package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
  176. package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +128 -40
  177. package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +10 -6
  178. package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +156 -234
  179. package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +6 -3
  180. package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +27 -32
  181. package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +119 -117
  182. package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +325 -419
  183. package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +15 -17
  184. package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +325 -181
  185. package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +94 -83
  186. package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +811 -458
  187. package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +121 -124
  188. package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +576 -370
  189. package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +194 -109
  190. package/eigen/Eigen/src/Core/functors/StlFunctors.h +95 -112
  191. package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
  192. package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1038 -749
  193. package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1883 -1375
  194. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +312 -370
  195. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +189 -176
  196. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +84 -81
  197. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
  198. package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +292 -337
  199. package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
  200. package/eigen/Eigen/src/Core/products/Parallelizer.h +207 -105
  201. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +327 -388
  202. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
  203. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +138 -147
  204. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
  205. package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
  206. package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -47
  207. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
  208. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
  209. package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
  210. package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
  211. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -277
  212. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
  213. package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +68 -94
  214. package/eigen/Eigen/src/Core/util/Assert.h +158 -0
  215. package/eigen/Eigen/src/Core/util/BlasUtil.h +342 -303
  216. package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +348 -317
  217. package/eigen/Eigen/src/Core/util/Constants.h +297 -262
  218. package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -90
  219. package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
  220. package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +449 -247
  221. package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
  222. package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
  223. package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +417 -116
  224. package/eigen/Eigen/src/Core/util/IntegralConstant.h +211 -204
  225. package/eigen/Eigen/src/Core/util/MKL_support.h +39 -37
  226. package/eigen/Eigen/src/Core/util/Macros.h +655 -773
  227. package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
  228. package/eigen/Eigen/src/Core/util/Memory.h +970 -748
  229. package/eigen/Eigen/src/Core/util/Meta.h +581 -633
  230. package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
  231. package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
  232. package/eigen/Eigen/src/Core/util/ReshapedHelper.h +17 -17
  233. package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
  234. package/eigen/Eigen/src/Core/util/StaticAssert.h +50 -166
  235. package/eigen/Eigen/src/Core/util/SymbolicIndex.h +377 -225
  236. package/eigen/Eigen/src/Core/util/XprHelper.h +784 -547
  237. package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
  238. package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
  239. package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
  240. package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
  241. package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
  242. package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
  243. package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
  244. package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
  245. package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +89 -105
  246. package/eigen/Eigen/src/Eigenvalues/RealQZ.h +537 -607
  247. package/eigen/Eigen/src/Eigenvalues/RealSchur.h +342 -381
  248. package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
  249. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +541 -595
  250. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
  251. package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +430 -462
  252. package/eigen/Eigen/src/Geometry/AlignedBox.h +226 -227
  253. package/eigen/Eigen/src/Geometry/AngleAxis.h +131 -133
  254. package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
  255. package/eigen/Eigen/src/Geometry/Homogeneous.h +285 -333
  256. package/eigen/Eigen/src/Geometry/Hyperplane.h +151 -160
  257. package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
  258. package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -146
  259. package/eigen/Eigen/src/Geometry/ParametrizedLine.h +127 -127
  260. package/eigen/Eigen/src/Geometry/Quaternion.h +566 -506
  261. package/eigen/Eigen/src/Geometry/Rotation2D.h +107 -105
  262. package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
  263. package/eigen/Eigen/src/Geometry/Scaling.h +113 -106
  264. package/eigen/Eigen/src/Geometry/Transform.h +858 -936
  265. package/eigen/Eigen/src/Geometry/Translation.h +94 -92
  266. package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
  267. package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +90 -104
  268. package/eigen/Eigen/src/Householder/BlockHouseholder.h +51 -46
  269. package/eigen/Eigen/src/Householder/Householder.h +102 -124
  270. package/eigen/Eigen/src/Householder/HouseholderSequence.h +412 -453
  271. package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
  272. package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +149 -162
  273. package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +124 -119
  274. package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +92 -104
  275. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +251 -243
  276. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +224 -228
  277. package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
  278. package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +178 -227
  279. package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +79 -84
  280. package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +54 -60
  281. package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
  282. package/eigen/Eigen/src/Jacobi/Jacobi.h +252 -308
  283. package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
  284. package/eigen/Eigen/src/KLUSupport/KLUSupport.h +208 -227
  285. package/eigen/Eigen/src/LU/Determinant.h +50 -69
  286. package/eigen/Eigen/src/LU/FullPivLU.h +545 -596
  287. package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
  288. package/eigen/Eigen/src/LU/InverseImpl.h +206 -285
  289. package/eigen/Eigen/src/LU/PartialPivLU.h +390 -428
  290. package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
  291. package/eigen/Eigen/src/LU/arch/InverseSize4.h +72 -70
  292. package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
  293. package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
  294. package/eigen/Eigen/src/OrderingMethods/Amd.h +243 -265
  295. package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +831 -1004
  296. package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
  297. package/eigen/Eigen/src/OrderingMethods/Ordering.h +112 -119
  298. package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
  299. package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
  300. package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
  301. package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -430
  302. package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +479 -479
  303. package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
  304. package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +166 -153
  305. package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +495 -475
  306. package/eigen/Eigen/src/QR/HouseholderQR.h +394 -285
  307. package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
  308. package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
  309. package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
  310. package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +244 -264
  311. package/eigen/Eigen/src/SVD/BDCSVD.h +817 -713
  312. package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
  313. package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
  314. package/eigen/Eigen/src/SVD/JacobiSVD.h +577 -543
  315. package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
  316. package/eigen/Eigen/src/SVD/SVDBase.h +242 -182
  317. package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +200 -235
  318. package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
  319. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +765 -594
  320. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +308 -94
  321. package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
  322. package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -252
  323. package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +134 -178
  324. package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
  325. package/eigen/Eigen/src/SparseCore/SparseAssign.h +149 -140
  326. package/eigen/Eigen/src/SparseCore/SparseBlock.h +403 -440
  327. package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
  328. package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +525 -303
  329. package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +555 -339
  330. package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
  331. package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +169 -197
  332. package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
  333. package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
  334. package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
  335. package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
  336. package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1603 -1245
  337. package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -350
  338. package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
  339. package/eigen/Eigen/src/SparseCore/SparseProduct.h +94 -97
  340. package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
  341. package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
  342. package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +370 -416
  343. package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
  344. package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
  345. package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
  346. package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
  347. package/eigen/Eigen/src/SparseCore/SparseUtil.h +138 -115
  348. package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
  349. package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
  350. package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
  351. package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
  352. package/eigen/Eigen/src/SparseLU/SparseLU.h +756 -710
  353. package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
  354. package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
  355. package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
  356. package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +245 -301
  357. package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
  358. package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
  359. package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +89 -100
  360. package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
  361. package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
  362. package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
  363. package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +124 -132
  364. package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
  365. package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
  366. package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
  367. package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
  368. package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
  369. package/eigen/Eigen/src/SparseQR/SparseQR.h +450 -502
  370. package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -93
  371. package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
  372. package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
  373. package/eigen/Eigen/src/StlSupport/details.h +48 -50
  374. package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
  375. package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -730
  376. package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
  377. package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
  378. package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
  379. package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
  380. package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
  381. package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
  382. package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
  383. package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
  384. package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
  385. package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
  386. package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
  387. package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
  388. package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
  389. package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +428 -464
  390. package/eigen/Eigen/src/misc/Image.h +41 -43
  391. package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
  392. package/eigen/Eigen/src/misc/Kernel.h +39 -41
  393. package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
  394. package/eigen/Eigen/src/misc/blas.h +83 -426
  395. package/eigen/Eigen/src/misc/lapacke.h +9972 -16179
  396. package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
  397. package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
  398. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
  399. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
  400. package/eigen/Eigen/src/plugins/{BlockMethods.h → BlockMethods.inc} +434 -506
  401. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
  402. package/eigen/Eigen/src/plugins/{CommonCwiseUnaryOps.h → CommonCwiseUnaryOps.inc} +58 -68
  403. package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
  404. package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
  405. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
  406. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
  407. package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
  408. package/package.json +1 -1
  409. package/eigen/COPYING.APACHE +0 -203
  410. package/eigen/COPYING.BSD +0 -26
  411. package/eigen/COPYING.GPL +0 -674
  412. package/eigen/COPYING.LGPL +0 -502
  413. package/eigen/COPYING.MINPACK +0 -51
  414. package/eigen/COPYING.MPL2 +0 -373
  415. package/eigen/COPYING.README +0 -18
  416. package/eigen/Eigen/src/Core/BooleanRedux.h +0 -162
  417. package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -258
  418. package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +0 -120
  419. package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +0 -694
  420. package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
  421. package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
  422. package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
  423. package/eigen/Eigen/src/misc/lapack.h +0 -152
  424. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -358
  425. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -696
  426. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
  427. package/eigen/Eigen/src/plugins/IndexedViewMethods.h +0 -262
  428. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
  429. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -95
  430. package/eigen/Eigen/src/plugins/ReshapedMethods.h +0 -149
  431. package/eigen/README.md +0 -5
@@ -10,57 +10,57 @@
10
10
  #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
11
11
  #define EIGEN_GENERAL_BLOCK_PANEL_H
12
12
 
13
+ // IWYU pragma: private
14
+ #include "../InternalHeaderCheck.h"
13
15
 
14
16
  namespace Eigen {
15
17
 
16
18
  namespace internal {
17
19
 
18
- enum GEBPPacketSizeType {
19
- GEBPPacketFull = 0,
20
- GEBPPacketHalf,
21
- GEBPPacketQuarter
22
- };
20
+ enum GEBPPacketSizeType { GEBPPacketFull = 0, GEBPPacketHalf, GEBPPacketQuarter };
23
21
 
24
- template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target, int _PacketSize=GEBPPacketFull>
22
+ template <typename LhsScalar_, typename RhsScalar_, bool ConjLhs_ = false, bool ConjRhs_ = false,
23
+ int Arch = Architecture::Target, int PacketSize_ = GEBPPacketFull>
25
24
  class gebp_traits;
26
25
 
27
-
28
26
  /** \internal \returns b if a<=0, and returns a otherwise. */
29
- inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
30
- {
31
- return a<=0 ? b : a;
32
- }
27
+ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b) { return a <= 0 ? b : a; }
33
28
 
34
29
  #if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
35
30
  #define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
36
31
  #else
37
32
  #define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
38
- #endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
33
+ #endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
39
34
 
40
35
  #if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
41
36
  #define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
42
37
  #else
43
38
  #define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
44
- #endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
39
+ #endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
45
40
 
46
41
  #if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
47
42
  #define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
48
43
  #else
49
44
  #define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
50
- #endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
51
-
45
+ #endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
46
+
52
47
  #if EIGEN_ARCH_i386_OR_x86_64
53
- const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024);
54
- const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024);
55
- const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024);
48
+ const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32 * 1024);
49
+ const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256 * 1024);
50
+ const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2 * 1024 * 1024);
56
51
  #elif EIGEN_ARCH_PPC
57
- const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024);
58
- const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
59
- const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024);
52
+ const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64 * 1024);
53
+ #ifdef _ARCH_PWR10
54
+ const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(2 * 1024 * 1024);
55
+ const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(8 * 1024 * 1024);
56
+ #else
57
+ const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512 * 1024);
58
+ const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4 * 1024 * 1024);
59
+ #endif
60
60
  #else
61
- const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024);
62
- const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
63
- const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024);
61
+ const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16 * 1024);
62
+ const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512 * 1024);
63
+ const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512 * 1024);
64
64
  #endif
65
65
 
66
66
  #undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
@@ -69,7 +69,7 @@ const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*10
69
69
 
70
70
  /** \internal */
71
71
  struct CacheSizes {
72
- CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
72
+ CacheSizes() : m_l1(-1), m_l2(-1), m_l3(-1) {
73
73
  int l1CacheSize, l2CacheSize, l3CacheSize;
74
74
  queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
75
75
  m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
@@ -83,27 +83,21 @@ struct CacheSizes {
83
83
  };
84
84
 
85
85
  /** \internal */
86
- inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
87
- {
86
+ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) {
88
87
  static CacheSizes m_cacheSizes;
89
88
 
90
- if(action==SetAction)
91
- {
89
+ if (action == SetAction) {
92
90
  // set the cpu cache size and cache all block sizes from a global cache size in byte
93
- eigen_internal_assert(l1!=0 && l2!=0);
91
+ eigen_internal_assert(l1 != 0 && l2 != 0);
94
92
  m_cacheSizes.m_l1 = *l1;
95
93
  m_cacheSizes.m_l2 = *l2;
96
94
  m_cacheSizes.m_l3 = *l3;
97
- }
98
- else if(action==GetAction)
99
- {
100
- eigen_internal_assert(l1!=0 && l2!=0);
95
+ } else if (action == GetAction) {
96
+ eigen_internal_assert(l1 != 0 && l2 != 0);
101
97
  *l1 = m_cacheSizes.m_l1;
102
98
  *l2 = m_cacheSizes.m_l2;
103
99
  *l3 = m_cacheSizes.m_l3;
104
- }
105
- else
106
- {
100
+ } else {
107
101
  eigen_internal_assert(false);
108
102
  }
109
103
  }
@@ -120,10 +114,9 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff
120
114
  *
121
115
  * \sa setCpuCacheSizes */
122
116
 
123
- template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
124
- void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
125
- {
126
- typedef gebp_traits<LhsScalar,RhsScalar> Traits;
117
+ template <typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
118
+ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1) {
119
+ typedef gebp_traits<LhsScalar, RhsScalar> Traits;
127
120
 
128
121
  // Explanations:
129
122
  // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
@@ -132,7 +125,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
132
125
  // at the register level. This small horizontal panel has to stay within L1 cache.
133
126
  std::ptrdiff_t l1, l2, l3;
134
127
  manage_caching_sizes(GetAction, &l1, &l2, &l3);
135
- #ifdef EIGEN_VECTORIZE_AVX512
128
+ #ifdef EIGEN_VECTORIZE_AVX512
136
129
  // We need to find a rationale for that, but without this adjustment,
137
130
  // performance with AVX512 is pretty bad, like -20% slower.
138
131
  // One reason is that with increasing packet-size, the blocking size k
@@ -141,13 +134,13 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
141
134
  // k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144.
142
135
  // This is quite small for a good reuse of the accumulation registers.
143
136
  l1 *= 4;
144
- #endif
137
+ #endif
145
138
 
146
139
  if (num_threads > 1) {
147
140
  typedef typename Traits::ResScalar ResScalar;
148
141
  enum {
149
142
  kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
150
- ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
143
+ ksub = Traits::mr * (Traits::nr * sizeof(ResScalar)),
151
144
  kr = 8,
152
145
  mr = Traits::mr,
153
146
  nr = Traits::nr
@@ -157,13 +150,13 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
157
150
  // increasing the value of k, so we'll cap it at 320 (value determined
158
151
  // experimentally).
159
152
  // To avoid that k vanishes, we make k_cache at least as big as kr
160
- const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
153
+ const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1 - ksub) / kdiv, 320));
161
154
  if (k_cache < k) {
162
155
  k = k_cache - (k_cache % kr);
163
156
  eigen_internal_assert(k > 0);
164
157
  }
165
158
 
166
- const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
159
+ const Index n_cache = (l2 - l1) / (nr * sizeof(RhsScalar) * k);
167
160
  const Index n_per_thread = numext::div_ceil(n, num_threads);
168
161
  if (n_cache <= n_per_thread) {
169
162
  // Don't exceed the capacity of the l2 cache.
@@ -176,37 +169,35 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
176
169
 
177
170
  if (l3 > l2) {
178
171
  // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
179
- const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
172
+ const Index m_cache = (l3 - l2) / (sizeof(LhsScalar) * k * num_threads);
180
173
  const Index m_per_thread = numext::div_ceil(m, num_threads);
181
- if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
174
+ if (m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
182
175
  m = m_cache - (m_cache % mr);
183
176
  eigen_internal_assert(m > 0);
184
177
  } else {
185
178
  m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
186
179
  }
187
180
  }
188
- }
189
- else {
181
+ } else {
190
182
  // In unit tests we do not want to use extra large matrices,
191
183
  // so we reduce the cache size to check the blocking strategy is not flawed
192
184
  #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
193
- l1 = 9*1024;
194
- l2 = 32*1024;
195
- l3 = 512*1024;
185
+ l1 = 9 * 1024;
186
+ l2 = 32 * 1024;
187
+ l3 = 512 * 1024;
196
188
  #endif
197
189
 
198
190
  // Early return for small problems because the computation below are time consuming for small problems.
199
191
  // Perhaps it would make more sense to consider k*n*m??
200
192
  // Note that for very tiny problem, this function should be bypassed anyway
201
193
  // because we use the coefficient-based implementation for them.
202
- if((numext::maxi)(k,(numext::maxi)(m,n))<48)
203
- return;
194
+ if ((numext::maxi)(k, (numext::maxi)(m, n)) < 48) return;
204
195
 
205
196
  typedef typename Traits::ResScalar ResScalar;
206
197
  enum {
207
198
  k_peeling = 8,
208
199
  k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
209
- k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
200
+ k_sub = Traits::mr * (Traits::nr * sizeof(ResScalar))
210
201
  };
211
202
 
212
203
  // ---- 1st level of blocking on L1, yields kc ----
@@ -216,30 +207,29 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
216
207
  // We also include a register-level block of the result (mx x nr).
217
208
  // (In an ideal world only the lhs panel would stay in L1)
218
209
  // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
219
- const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
210
+ const Index max_kc = numext::maxi<Index>(((l1 - k_sub) / k_div) & (~(k_peeling - 1)), 1);
220
211
  const Index old_k = k;
221
- if(k>max_kc)
222
- {
212
+ if (k > max_kc) {
223
213
  // We are really blocking on the third dimension:
224
214
  // -> reduce blocking size to make sure the last block is as large as possible
225
215
  // while keeping the same number of sweeps over the result.
226
- k = (k%max_kc)==0 ? max_kc
227
- : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
216
+ k = (k % max_kc) == 0 ? max_kc
217
+ : max_kc - k_peeling * ((max_kc - 1 - (k % max_kc)) / (k_peeling * (k / max_kc + 1)));
228
218
 
229
- eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
219
+ eigen_internal_assert(((old_k / k) == (old_k / max_kc)) && "the number of sweeps has to remain the same");
230
220
  }
231
221
 
232
- // ---- 2nd level of blocking on max(L2,L3), yields nc ----
222
+ // ---- 2nd level of blocking on max(L2,L3), yields nc ----
233
223
 
234
- // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
235
- // actual_l2 = max(l2, l3/nb_core_sharing_l3)
236
- // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it)
237
- // For instance, it corresponds to 6MB of L3 shared among 4 cores.
238
- #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
224
+ // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
225
+ // actual_l2 = max(l2, l3/nb_core_sharing_l3)
226
+ // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it)
227
+ // For instance, it corresponds to 6MB of L3 shared among 4 cores.
228
+ #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
239
229
  const Index actual_l2 = l3;
240
- #else
241
- const Index actual_l2 = 1572864; // == 1.5 MB
242
- #endif
230
+ #else
231
+ const Index actual_l2 = 1572864; // == 1.5 MB
232
+ #endif
243
233
 
244
234
  // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2.
245
235
  // The second half is implicitly reserved to access the result and lhs coefficients.
@@ -249,61 +239,52 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
249
239
  // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
250
240
  Index max_nc;
251
241
  const Index lhs_bytes = m * k * sizeof(LhsScalar);
252
- const Index remaining_l1 = l1- k_sub - lhs_bytes;
253
- if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
254
- {
242
+ const Index remaining_l1 = l1 - k_sub - lhs_bytes;
243
+ if (remaining_l1 >= Index(Traits::nr * sizeof(RhsScalar)) * k) {
255
244
  // L1 blocking
256
- max_nc = remaining_l1 / (k*sizeof(RhsScalar));
257
- }
258
- else
259
- {
245
+ max_nc = remaining_l1 / (k * sizeof(RhsScalar));
246
+ } else {
260
247
  // L2 blocking
261
- max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
248
+ max_nc = (3 * actual_l2) / (2 * 2 * max_kc * sizeof(RhsScalar));
262
249
  }
263
250
  // WARNING Below, we assume that Traits::nr is a power of two.
264
- Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
265
- if(n>nc)
266
- {
251
+ Index nc = numext::mini<Index>(actual_l2 / (2 * k * sizeof(RhsScalar)), max_nc) & (~(Traits::nr - 1));
252
+ if (n > nc) {
267
253
  // We are really blocking over the columns:
268
254
  // -> reduce blocking size to make sure the last block is as large as possible
269
255
  // while keeping the same number of sweeps over the packed lhs.
270
256
  // Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
271
- n = (n%nc)==0 ? nc
272
- : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
273
- }
274
- else if(old_k==k)
275
- {
257
+ n = (n % nc) == 0 ? nc : (nc - Traits::nr * ((nc /*-1*/ - (n % nc)) / (Traits::nr * (n / nc + 1))));
258
+ } else if (old_k == k) {
276
259
  // So far, no blocking at all, i.e., kc==k, and nc==n.
277
260
  // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
278
- // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
279
- Index problem_size = k*n*sizeof(LhsScalar);
261
+ // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic
262
+ // here should be obsolete.
263
+ Index problem_size = k * n * sizeof(LhsScalar);
280
264
  Index actual_lm = actual_l2;
281
265
  Index max_mc = m;
282
- if(problem_size<=1024)
283
- {
266
+ if (problem_size <= 1024) {
284
267
  // problem is small enough to keep in L1
285
268
  // Let's choose m such that lhs's block fit in 1/3 of L1
286
269
  actual_lm = l1;
287
- }
288
- else if(l3!=0 && problem_size<=32768)
289
- {
270
+ } else if (l3 != 0 && problem_size <= 32768) {
290
271
  // we have both L2 and L3, and problem is small enough to be kept in L2
291
272
  // Let's choose m such that lhs's block fit in 1/3 of L2
292
273
  actual_lm = l2;
293
- max_mc = (numext::mini<Index>)(576,max_mc);
274
+ max_mc = (numext::mini<Index>)(576, max_mc);
294
275
  }
295
- Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
296
- if (mc > Traits::mr) mc -= mc % Traits::mr;
297
- else if (mc==0) return;
298
- m = (m%mc)==0 ? mc
299
- : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
276
+ Index mc = (numext::mini<Index>)(actual_lm / (3 * k * sizeof(LhsScalar)), max_mc);
277
+ if (mc > Traits::mr)
278
+ mc -= mc % Traits::mr;
279
+ else if (mc == 0)
280
+ return;
281
+ m = (m % mc) == 0 ? mc : (mc - Traits::mr * ((mc /*-1*/ - (m % mc)) / (Traits::mr * (m / mc + 1))));
300
282
  }
301
283
  }
302
284
  }
303
285
 
304
286
  template <typename Index>
305
- inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
306
- {
287
+ inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) {
307
288
  #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
308
289
  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
309
290
  k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
@@ -320,46 +301,47 @@ inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
320
301
  }
321
302
 
322
303
  /** \brief Computes the blocking parameters for a m x k times k x n matrix product
323
- *
324
- * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
325
- * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
326
- * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
327
- *
328
- * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
329
- * this function computes the blocking size parameters along the respective dimensions
330
- * for matrix products and related algorithms.
331
- *
332
- * The blocking size parameters may be evaluated:
333
- * - either by a heuristic based on cache sizes;
334
- * - or using fixed prescribed values (for testing purposes).
335
- *
336
- * \sa setCpuCacheSizes */
337
-
338
- template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
339
- void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
340
- {
304
+ *
305
+ * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
306
+ * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
307
+ * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same
308
+ * dimension.
309
+ * \param[in] num_threads Input: the number of threads used for the computation.
310
+ *
311
+ * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
312
+ * this function computes the blocking size parameters along the respective dimensions
313
+ * for matrix products and related algorithms.
314
+ *
315
+ * The blocking size parameters may be evaluated:
316
+ * - either by a heuristic based on cache sizes;
317
+ * - or using fixed prescribed values (for testing purposes).
318
+ *
319
+ * \sa setCpuCacheSizes */
320
+
321
+ template <typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
322
+ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) {
341
323
  if (!useSpecificBlockingSizes(k, m, n)) {
342
324
  evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
343
325
  }
344
326
  }
345
327
 
346
- template<typename LhsScalar, typename RhsScalar, typename Index>
347
- inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
348
- {
349
- computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
328
+ template <typename LhsScalar, typename RhsScalar, typename Index>
329
+ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) {
330
+ computeProductBlockingSizes<LhsScalar, RhsScalar, 1, Index>(k, m, n, num_threads);
350
331
  }
351
332
 
352
333
  template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
353
334
  struct RhsPanelHelper {
354
335
  private:
355
- static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken;
336
+ static constexpr int remaining_registers =
337
+ (std::max)(int(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS) - registers_taken, 0);
338
+
356
339
  public:
357
- typedef typename conditional<remaining_registers>=4, RhsPacketx4, RhsPacket>::type type;
340
+ typedef std::conditional_t<remaining_registers >= 4, RhsPacketx4, RhsPacket> type;
358
341
  };
359
342
 
360
343
  template <typename Packet>
361
- struct QuadPacket
362
- {
344
+ struct QuadPacket {
363
345
  Packet B_0, B1, B2, B3;
364
346
  const Packet& get(const FixedInt<0>&) const { return B_0; }
365
347
  const Packet& get(const FixedInt<1>&) const { return B1; }
@@ -368,329 +350,295 @@ struct QuadPacket
368
350
  };
369
351
 
370
352
  template <int N, typename T1, typename T2, typename T3>
371
- struct packet_conditional { typedef T3 type; };
353
+ struct packet_conditional {
354
+ typedef T3 type;
355
+ };
372
356
 
373
357
  template <typename T1, typename T2, typename T3>
374
- struct packet_conditional<GEBPPacketFull, T1, T2, T3> { typedef T1 type; };
358
+ struct packet_conditional<GEBPPacketFull, T1, T2, T3> {
359
+ typedef T1 type;
360
+ };
375
361
 
376
362
  template <typename T1, typename T2, typename T3>
377
- struct packet_conditional<GEBPPacketHalf, T1, T2, T3> { typedef T2 type; };
378
-
379
- #define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \
380
- typedef typename packet_conditional<packet_size, \
381
- typename packet_traits<name ## Scalar>::type, \
382
- typename packet_traits<name ## Scalar>::half, \
383
- typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
384
- prefix ## name ## Packet
385
-
386
- #define PACKET_DECL_COND(name, packet_size) \
387
- typedef typename packet_conditional<packet_size, \
388
- typename packet_traits<name ## Scalar>::type, \
389
- typename packet_traits<name ## Scalar>::half, \
390
- typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
391
- name ## Packet
392
-
393
- #define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \
394
- typedef typename packet_conditional<packet_size, \
395
- typename packet_traits<Scalar>::type, \
396
- typename packet_traits<Scalar>::half, \
397
- typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
398
- prefix ## ScalarPacket
399
-
400
- #define PACKET_DECL_COND_SCALAR(packet_size) \
401
- typedef typename packet_conditional<packet_size, \
402
- typename packet_traits<Scalar>::type, \
403
- typename packet_traits<Scalar>::half, \
404
- typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
405
- ScalarPacket
363
+ struct packet_conditional<GEBPPacketHalf, T1, T2, T3> {
364
+ typedef T2 type;
365
+ };
366
+
367
+ #define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size) \
368
+ typedef typename packet_conditional< \
369
+ packet_size, typename packet_traits<name##Scalar>::type, typename packet_traits<name##Scalar>::half, \
370
+ typename unpacket_traits<typename packet_traits<name##Scalar>::half>::half>::type name##Packet##postfix
371
+
372
+ #define PACKET_DECL_COND(name, packet_size) \
373
+ typedef typename packet_conditional< \
374
+ packet_size, typename packet_traits<name##Scalar>::type, typename packet_traits<name##Scalar>::half, \
375
+ typename unpacket_traits<typename packet_traits<name##Scalar>::half>::half>::type name##Packet
376
+
377
+ #define PACKET_DECL_COND_SCALAR_POSTFIX(postfix, packet_size) \
378
+ typedef typename packet_conditional< \
379
+ packet_size, typename packet_traits<Scalar>::type, typename packet_traits<Scalar>::half, \
380
+ typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type ScalarPacket##postfix
381
+
382
+ #define PACKET_DECL_COND_SCALAR(packet_size) \
383
+ typedef typename packet_conditional< \
384
+ packet_size, typename packet_traits<Scalar>::type, typename packet_traits<Scalar>::half, \
385
+ typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type ScalarPacket
406
386
 
407
387
  /* Vectorization logic
408
388
  * real*real: unpack rhs to constant packets, ...
409
- *
389
+ *
410
390
  * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
411
391
  * storing each res packet into two packets (2x2),
412
- * at the end combine them: swap the second and addsub them
392
+ * at the end combine them: swap the second and addsub them
413
393
  * cf*cf : same but with 2x4 blocks
414
394
  * cplx*real : unpack rhs to constant packets, ...
415
395
  * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
416
396
  */
417
- template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
418
- class gebp_traits
419
- {
420
- public:
421
- typedef _LhsScalar LhsScalar;
422
- typedef _RhsScalar RhsScalar;
397
+ template <typename LhsScalar_, typename RhsScalar_, bool ConjLhs_, bool ConjRhs_, int Arch, int PacketSize_>
398
+ class gebp_traits {
399
+ public:
400
+ typedef LhsScalar_ LhsScalar;
401
+ typedef RhsScalar_ RhsScalar;
423
402
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
424
403
 
425
- PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
426
- PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
427
- PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
404
+ PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
405
+ PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
406
+ PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
428
407
 
429
408
  enum {
430
- ConjLhs = _ConjLhs,
431
- ConjRhs = _ConjRhs,
432
- Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
433
- LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
434
- RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
435
- ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
436
-
409
+ ConjLhs = ConjLhs_,
410
+ ConjRhs = ConjRhs_,
411
+ Vectorizable = unpacket_traits<LhsPacket_>::vectorizable && unpacket_traits<RhsPacket_>::vectorizable,
412
+ LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
413
+ RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
414
+ ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,
415
+
437
416
  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
438
417
 
439
418
  // register block size along the N direction must be 1 or 4
440
419
  nr = 4,
441
420
 
442
421
  // register block size along the M direction (currently, this one cannot be modified)
443
- default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
444
- #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \
445
- && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914))
422
+ default_mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * LhsPacketSize,
423
+ #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && \
424
+ !defined(EIGEN_VECTORIZE_VSX) && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC >= 1914))
446
425
  // we assume 16 registers or more
447
426
  // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
448
427
  // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
449
428
  // Bug 1515: MSVC prior to v19.14 yields to register spilling.
450
- mr = Vectorizable ? 3*LhsPacketSize : default_mr,
429
+ mr = Vectorizable ? 3 * LhsPacketSize : default_mr,
451
430
  #else
452
431
  mr = default_mr,
453
432
  #endif
454
-
433
+
455
434
  LhsProgress = LhsPacketSize,
456
435
  RhsProgress = 1
457
436
  };
458
437
 
459
-
460
- typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
461
- typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
462
- typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
438
+ typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
439
+ typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
440
+ typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
463
441
  typedef LhsPacket LhsPacket4Packing;
464
442
 
465
443
  typedef QuadPacket<RhsPacket> RhsPacketx4;
466
444
  typedef ResPacket AccPacket;
467
-
468
- EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
469
- {
470
- p = pset1<ResPacket>(ResScalar(0));
471
- }
472
445
 
473
- template<typename RhsPacketType>
474
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
475
- {
446
+ EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }
447
+
448
+ template <typename RhsPacketType>
449
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const {
476
450
  dest = pset1<RhsPacketType>(*b);
477
451
  }
478
452
 
479
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
480
- {
453
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
481
454
  pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
482
455
  }
483
456
 
484
- template<typename RhsPacketType>
485
- EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
486
- {
457
+ template <typename RhsPacketType>
458
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const {
487
459
  loadRhs(b, dest);
488
460
  }
489
461
 
490
- EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
491
- {
492
- }
462
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
493
463
 
494
- EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
495
- {
496
- dest = ploadquad<RhsPacket>(b);
497
- }
464
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad<RhsPacket>(b); }
498
465
 
499
- template<typename LhsPacketType>
500
- EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
501
- {
466
+ template <typename LhsPacketType>
467
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const {
502
468
  dest = pload<LhsPacketType>(a);
503
469
  }
504
470
 
505
- template<typename LhsPacketType>
506
- EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
507
- {
471
+ template <typename LhsPacketType>
472
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const {
508
473
  dest = ploadu<LhsPacketType>(a);
509
474
  }
510
475
 
511
- template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
512
- EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
513
- {
514
- conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
476
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
477
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
478
+ const LaneIdType&) const {
479
+ conj_helper<LhsPacketType, RhsPacketType, ConjLhs, ConjRhs> cj;
515
480
  // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
516
481
  // let gcc allocate the register in which to store the result of the pmul
517
482
  // (in the case where there is no FMA) gcc fails to figure out how to avoid
518
483
  // spilling register.
519
484
  #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
520
485
  EIGEN_UNUSED_VARIABLE(tmp);
521
- c = cj.pmadd(a,b,c);
486
+ c = cj.pmadd(a, b, c);
522
487
  #else
523
- tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
488
+ tmp = b;
489
+ tmp = cj.pmul(a, tmp);
490
+ c = padd(c, tmp);
524
491
  #endif
525
492
  }
526
493
 
527
- template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
528
- EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
529
- {
494
+ template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
495
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
496
+ const LaneIdType& lane) const {
530
497
  madd(a, b.get(lane), c, tmp, lane);
531
498
  }
532
499
 
533
- EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
534
- {
535
- r = pmadd(c,alpha,r);
536
- }
537
-
538
- template<typename ResPacketHalf>
539
- EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
540
- {
541
- r = pmadd(c,alpha,r);
500
+ EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const {
501
+ r = pmadd(c, alpha, r);
542
502
  }
543
503
 
504
+ template <typename ResPacketHalf>
505
+ EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const {
506
+ r = pmadd(c, alpha, r);
507
+ }
544
508
  };
545
509
 
546
- template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize>
547
- class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize>
548
- {
549
- public:
510
+ template <typename RealScalar, bool ConjLhs_, int Arch, int PacketSize_>
511
+ class gebp_traits<std::complex<RealScalar>, RealScalar, ConjLhs_, false, Arch, PacketSize_> {
512
+ public:
550
513
  typedef std::complex<RealScalar> LhsScalar;
551
514
  typedef RealScalar RhsScalar;
552
515
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
553
516
 
554
- PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
555
- PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
556
- PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
517
+ PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
518
+ PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
519
+ PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
557
520
 
558
521
  enum {
559
- ConjLhs = _ConjLhs,
522
+ ConjLhs = ConjLhs_,
560
523
  ConjRhs = false,
561
- Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
562
- LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
563
- RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
564
- ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
565
-
524
+ Vectorizable = unpacket_traits<LhsPacket_>::vectorizable && unpacket_traits<RhsPacket_>::vectorizable,
525
+ LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
526
+ RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
527
+ ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,
528
+
566
529
  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
567
530
  nr = 4,
568
531
  #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
569
532
  // we assume 16 registers
570
- mr = 3*LhsPacketSize,
533
+ mr = 3 * LhsPacketSize,
571
534
  #else
572
- mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
535
+ mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * LhsPacketSize,
573
536
  #endif
574
537
 
575
538
  LhsProgress = LhsPacketSize,
576
539
  RhsProgress = 1
577
540
  };
578
541
 
579
- typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
580
- typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
581
- typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
542
+ typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
543
+ typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
544
+ typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
582
545
  typedef LhsPacket LhsPacket4Packing;
583
546
 
584
547
  typedef QuadPacket<RhsPacket> RhsPacketx4;
585
548
 
586
549
  typedef ResPacket AccPacket;
587
550
 
588
- EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
589
- {
590
- p = pset1<ResPacket>(ResScalar(0));
591
- }
551
+ EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }
592
552
 
593
- template<typename RhsPacketType>
594
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
595
- {
553
+ template <typename RhsPacketType>
554
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const {
596
555
  dest = pset1<RhsPacketType>(*b);
597
556
  }
598
557
 
599
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
600
- {
558
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
601
559
  pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
602
560
  }
603
561
 
604
- template<typename RhsPacketType>
605
- EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
606
- {
562
+ template <typename RhsPacketType>
563
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const {
607
564
  loadRhs(b, dest);
608
565
  }
609
566
 
610
- EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
611
- {}
612
-
613
- EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
614
- {
615
- loadRhsQuad_impl(b,dest, typename conditional<RhsPacketSize==16,true_type,false_type>::type());
567
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
568
+
569
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const {
570
+ loadRhsQuad_impl(b, dest, std::conditional_t<RhsPacketSize == 16, true_type, false_type>());
616
571
  }
617
572
 
618
- EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const
619
- {
573
+ EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const {
620
574
  // FIXME we can do better!
621
575
  // what we want here is a ploadheight
622
- RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
576
+ RhsScalar tmp[4] = {b[0], b[0], b[1], b[1]};
623
577
  dest = ploadquad<RhsPacket>(tmp);
624
578
  }
625
579
 
626
- EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const
627
- {
628
- eigen_internal_assert(RhsPacketSize<=8);
580
+ EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const {
581
+ eigen_internal_assert(RhsPacketSize <= 8);
629
582
  dest = pset1<RhsPacket>(*b);
630
583
  }
631
584
 
632
- EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
633
- {
634
- dest = pload<LhsPacket>(a);
635
- }
585
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = pload<LhsPacket>(a); }
636
586
 
637
- template<typename LhsPacketType>
638
- EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
639
- {
587
+ template <typename LhsPacketType>
588
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const {
640
589
  dest = ploadu<LhsPacketType>(a);
641
590
  }
642
591
 
643
592
  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
644
- EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
645
- {
646
- madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
593
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
594
+ const LaneIdType&) const {
595
+ madd_impl(a, b, c, tmp, std::conditional_t<Vectorizable, true_type, false_type>());
647
596
  }
648
597
 
649
598
  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
650
- EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
651
- {
599
+ EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c,
600
+ RhsPacketType& tmp, const true_type&) const {
652
601
  #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
653
602
  EIGEN_UNUSED_VARIABLE(tmp);
654
- c.v = pmadd(a.v,b,c.v);
603
+ c.v = pmadd(a.v, b, c.v);
655
604
  #else
656
- tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
605
+ tmp = b;
606
+ tmp = pmul(a.v, tmp);
607
+ c.v = padd(c.v, tmp);
657
608
  #endif
658
609
  }
659
610
 
660
- EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
661
- {
611
+ EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/,
612
+ const false_type&) const {
662
613
  c += a * b;
663
614
  }
664
615
 
665
- template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
666
- EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
667
- {
616
+ template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
617
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
618
+ const LaneIdType& lane) const {
668
619
  madd(a, b.get(lane), c, tmp, lane);
669
620
  }
670
621
 
671
622
  template <typename ResPacketType, typename AccPacketType>
672
- EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
673
- {
674
- conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
675
- r = cj.pmadd(c,alpha,r);
623
+ EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const {
624
+ conj_helper<ResPacketType, ResPacketType, ConjLhs, false> cj;
625
+ r = cj.pmadd(c, alpha, r);
676
626
  }
677
627
 
678
- protected:
628
+ protected:
679
629
  };
680
630
 
681
- template<typename Packet>
682
- struct DoublePacket
683
- {
631
+ template <typename Packet>
632
+ struct DoublePacket {
684
633
  Packet first;
685
634
  Packet second;
686
635
  };
687
636
 
688
- template<typename Packet>
689
- DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
690
- {
637
+ template <typename Packet>
638
+ DoublePacket<Packet> padd(const DoublePacket<Packet>& a, const DoublePacket<Packet>& b) {
691
639
  DoublePacket<Packet> res;
692
- res.first = padd(a.first, b.first);
693
- res.second = padd(a.second,b.second);
640
+ res.first = padd(a.first, b.first);
641
+ res.second = padd(a.second, b.second);
694
642
  return res;
695
643
  }
696
644
 
@@ -698,52 +646,47 @@ DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Pack
698
646
  // corresponds to the number of complexes, so it means "8"
699
647
  // it terms of real coefficients.
700
648
 
701
- template<typename Packet>
702
- const DoublePacket<Packet>&
703
- predux_half_dowto4(const DoublePacket<Packet> &a,
704
- typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0)
705
- {
649
+ template <typename Packet>
650
+ const DoublePacket<Packet>& predux_half_dowto4(const DoublePacket<Packet>& a,
651
+ std::enable_if_t<unpacket_traits<Packet>::size <= 8>* = 0) {
706
652
  return a;
707
653
  }
708
654
 
709
- template<typename Packet>
710
- DoublePacket<typename unpacket_traits<Packet>::half>
711
- predux_half_dowto4(const DoublePacket<Packet> &a,
712
- typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0)
713
- {
655
+ template <typename Packet>
656
+ DoublePacket<typename unpacket_traits<Packet>::half> predux_half_dowto4(
657
+ const DoublePacket<Packet>& a, std::enable_if_t<unpacket_traits<Packet>::size == 16>* = 0) {
714
658
  // yes, that's pretty hackish :(
715
659
  DoublePacket<typename unpacket_traits<Packet>::half> res;
716
660
  typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
717
661
  typedef typename packet_traits<Cplx>::type CplxPacket;
718
- res.first = predux_half_dowto4(CplxPacket(a.first)).v;
662
+ res.first = predux_half_dowto4(CplxPacket(a.first)).v;
719
663
  res.second = predux_half_dowto4(CplxPacket(a.second)).v;
720
664
  return res;
721
665
  }
722
666
 
723
667
  // same here, "quad" actually means "8" in terms of real coefficients
724
- template<typename Scalar, typename RealPacket>
668
+ template <typename Scalar, typename RealPacket>
725
669
  void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
726
- typename enable_if<unpacket_traits<RealPacket>::size<=8>::type* = 0)
727
- {
728
- dest.first = pset1<RealPacket>(numext::real(*b));
670
+ std::enable_if_t<unpacket_traits<RealPacket>::size <= 8>* = 0) {
671
+ dest.first = pset1<RealPacket>(numext::real(*b));
729
672
  dest.second = pset1<RealPacket>(numext::imag(*b));
730
673
  }
731
674
 
732
- template<typename Scalar, typename RealPacket>
675
+ template <typename Scalar, typename RealPacket>
733
676
  void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
734
- typename enable_if<unpacket_traits<RealPacket>::size==16>::type* = 0)
735
- {
677
+ std::enable_if_t<unpacket_traits<RealPacket>::size == 16>* = 0) {
736
678
  // yes, that's pretty hackish too :(
737
679
  typedef typename NumTraits<Scalar>::Real RealScalar;
738
680
  RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
739
681
  RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
740
- dest.first = ploadquad<RealPacket>(r);
682
+ dest.first = ploadquad<RealPacket>(r);
741
683
  dest.second = ploadquad<RealPacket>(i);
742
684
  }
743
685
 
744
-
745
- template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > {
686
+ template <typename Packet>
687
+ struct unpacket_traits<DoublePacket<Packet> > {
746
688
  typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
689
+ enum { size = 2 * unpacket_traits<Packet>::size };
747
690
  };
748
691
  // template<typename Packet>
749
692
  // DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
@@ -754,74 +697,66 @@ template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > {
754
697
  // return res;
755
698
  // }
756
699
 
757
- template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
758
- class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize >
759
- {
760
- public:
761
- typedef std::complex<RealScalar> Scalar;
762
- typedef std::complex<RealScalar> LhsScalar;
763
- typedef std::complex<RealScalar> RhsScalar;
764
- typedef std::complex<RealScalar> ResScalar;
765
-
766
- PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
767
- PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
768
- PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
769
- PACKET_DECL_COND(Real, _PacketSize);
770
- PACKET_DECL_COND_SCALAR(_PacketSize);
700
+ template <typename RealScalar, bool ConjLhs_, bool ConjRhs_, int Arch, int PacketSize_>
701
+ class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, ConjLhs_, ConjRhs_, Arch, PacketSize_> {
702
+ public:
703
+ typedef std::complex<RealScalar> Scalar;
704
+ typedef std::complex<RealScalar> LhsScalar;
705
+ typedef std::complex<RealScalar> RhsScalar;
706
+ typedef std::complex<RealScalar> ResScalar;
707
+
708
+ PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
709
+ PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
710
+ PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
711
+ PACKET_DECL_COND(Real, PacketSize_);
712
+ PACKET_DECL_COND_SCALAR(PacketSize_);
771
713
 
772
714
  enum {
773
- ConjLhs = _ConjLhs,
774
- ConjRhs = _ConjRhs,
775
- Vectorizable = unpacket_traits<RealPacket>::vectorizable
776
- && unpacket_traits<ScalarPacket>::vectorizable,
777
- ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
778
- LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
715
+ ConjLhs = ConjLhs_,
716
+ ConjRhs = ConjRhs_,
717
+ Vectorizable = unpacket_traits<RealPacket>::vectorizable && unpacket_traits<ScalarPacket>::vectorizable,
718
+ ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,
719
+ LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
779
720
  RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
780
- RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
721
+ RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
722
+ NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
781
723
 
782
- // FIXME: should depend on NumberOfRegisters
783
724
  nr = 4,
784
- mr = ResPacketSize,
725
+ mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * ResPacketSize,
785
726
 
786
727
  LhsProgress = ResPacketSize,
787
728
  RhsProgress = 1
788
729
  };
789
-
790
- typedef DoublePacket<RealPacket> DoublePacketType;
791
730
 
792
- typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type LhsPacket4Packing;
793
- typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket;
794
- typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
795
- typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
796
- typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
731
+ typedef DoublePacket<RealPacket> DoublePacketType;
797
732
 
798
- // this actualy holds 8 packets!
733
+ typedef std::conditional_t<Vectorizable, ScalarPacket, Scalar> LhsPacket4Packing;
734
+ typedef std::conditional_t<Vectorizable, RealPacket, Scalar> LhsPacket;
735
+ typedef std::conditional_t<Vectorizable, DoublePacketType, Scalar> RhsPacket;
736
+ typedef std::conditional_t<Vectorizable, ScalarPacket, Scalar> ResPacket;
737
+ typedef std::conditional_t<Vectorizable, DoublePacketType, Scalar> AccPacket;
738
+
739
+ // this actually holds 8 packets!
799
740
  typedef QuadPacket<RhsPacket> RhsPacketx4;
800
-
741
+
801
742
  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
802
743
 
803
- EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
804
- {
805
- p.first = pset1<RealPacket>(RealScalar(0));
806
- p.second = pset1<RealPacket>(RealScalar(0));
744
+ EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p) {
745
+ p.first = pset1<RealPacket>(RealScalar(0));
746
+ p.second = pset1<RealPacket>(RealScalar(0));
807
747
  }
808
748
 
809
749
  // Scalar path
810
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const
811
- {
812
- dest = pset1<ScalarPacket>(*b);
813
- }
750
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const { dest = pset1<ScalarPacket>(*b); }
814
751
 
815
752
  // Vectorized path
816
- template<typename RealPacketType>
817
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
818
- {
819
- dest.first = pset1<RealPacketType>(numext::real(*b));
753
+ template <typename RealPacketType>
754
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const {
755
+ dest.first = pset1<RealPacketType>(numext::real(*b));
820
756
  dest.second = pset1<RealPacketType>(numext::imag(*b));
821
757
  }
822
758
 
823
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
824
- {
759
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
825
760
  loadRhs(b, dest.B_0);
826
761
  loadRhs(b + 1, dest.B1);
827
762
  loadRhs(b + 2, dest.B2);
@@ -829,221 +764,189 @@ public:
829
764
  }
830
765
 
831
766
  // Scalar path
832
- EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const
833
- {
834
- loadRhs(b, dest);
835
- }
767
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const { loadRhs(b, dest); }
836
768
 
837
769
  // Vectorized path
838
- template<typename RealPacketType>
839
- EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
840
- {
770
+ template <typename RealPacketType>
771
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const {
841
772
  loadRhs(b, dest);
842
773
  }
843
774
 
844
775
  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
845
-
846
- EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
847
- {
848
- loadRhs(b,dest);
849
- }
850
- EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
851
- {
852
- loadQuadToDoublePacket(b,dest);
776
+
777
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const { loadRhs(b, dest); }
778
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const {
779
+ loadQuadToDoublePacket(b, dest);
853
780
  }
854
781
 
855
782
  // nothing special here
856
- EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
857
- {
783
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const {
858
784
  dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
859
785
  }
860
786
 
861
- template<typename LhsPacketType>
862
- EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
863
- {
787
+ template <typename LhsPacketType>
788
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const {
864
789
  dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
865
790
  }
866
791
 
867
- template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType>
868
- EIGEN_STRONG_INLINE
869
- typename enable_if<!is_same<RhsPacketType,RhsPacketx4>::value>::type
870
- madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const
871
- {
872
- c.first = padd(pmul(a,b.first), c.first);
873
- c.second = padd(pmul(a,b.second),c.second);
792
+ template <typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType,
793
+ typename LaneIdType>
794
+ EIGEN_STRONG_INLINE std::enable_if_t<!is_same<RhsPacketType, RhsPacketx4>::value> madd(const LhsPacketType& a,
795
+ const RhsPacketType& b,
796
+ DoublePacket<ResPacketType>& c,
797
+ TmpType& /*tmp*/,
798
+ const LaneIdType&) const {
799
+ c.first = pmadd(a, b.first, c.first);
800
+ c.second = pmadd(a, b.second, c.second);
874
801
  }
875
802
 
876
- template<typename LaneIdType>
877
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const
878
- {
879
- c = cj.pmadd(a,b,c);
803
+ template <typename LaneIdType>
804
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/,
805
+ const LaneIdType&) const {
806
+ c = cj.pmadd(a, b, c);
880
807
  }
881
808
 
882
- template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
883
- EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
884
- {
809
+ template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
810
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
811
+ const LaneIdType& lane) const {
885
812
  madd(a, b.get(lane), c, tmp, lane);
886
813
  }
887
-
814
+
888
815
  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
889
-
890
- template<typename RealPacketType, typename ResPacketType>
891
- EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha, ResPacketType& r) const
892
- {
816
+
817
+ template <typename RealPacketType, typename ResPacketType>
818
+ EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha,
819
+ ResPacketType& r) const {
893
820
  // assemble c
894
821
  ResPacketType tmp;
895
- if((!ConjLhs)&&(!ConjRhs))
896
- {
822
+ if ((!ConjLhs) && (!ConjRhs)) {
897
823
  tmp = pcplxflip(pconj(ResPacketType(c.second)));
898
- tmp = padd(ResPacketType(c.first),tmp);
899
- }
900
- else if((!ConjLhs)&&(ConjRhs))
901
- {
824
+ tmp = padd(ResPacketType(c.first), tmp);
825
+ } else if ((!ConjLhs) && (ConjRhs)) {
902
826
  tmp = pconj(pcplxflip(ResPacketType(c.second)));
903
- tmp = padd(ResPacketType(c.first),tmp);
904
- }
905
- else if((ConjLhs)&&(!ConjRhs))
906
- {
827
+ tmp = padd(ResPacketType(c.first), tmp);
828
+ } else if ((ConjLhs) && (!ConjRhs)) {
907
829
  tmp = pcplxflip(ResPacketType(c.second));
908
- tmp = padd(pconj(ResPacketType(c.first)),tmp);
909
- }
910
- else if((ConjLhs)&&(ConjRhs))
911
- {
830
+ tmp = padd(pconj(ResPacketType(c.first)), tmp);
831
+ } else if ((ConjLhs) && (ConjRhs)) {
912
832
  tmp = pcplxflip(ResPacketType(c.second));
913
- tmp = psub(pconj(ResPacketType(c.first)),tmp);
833
+ tmp = psub(pconj(ResPacketType(c.first)), tmp);
914
834
  }
915
-
916
- r = pmadd(tmp,alpha,r);
835
+
836
+ r = pmadd(tmp, alpha, r);
917
837
  }
918
838
 
919
- protected:
920
- conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
839
+ protected:
840
+ conj_helper<LhsScalar, RhsScalar, ConjLhs, ConjRhs> cj;
921
841
  };
922
842
 
923
- template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize>
924
- class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize >
925
- {
926
- public:
927
- typedef std::complex<RealScalar> Scalar;
928
- typedef RealScalar LhsScalar;
929
- typedef Scalar RhsScalar;
930
- typedef Scalar ResScalar;
931
-
932
- PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
933
- PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
934
- PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
935
- PACKET_DECL_COND_PREFIX(_, Real, _PacketSize);
936
- PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize);
937
-
938
- #undef PACKET_DECL_COND_SCALAR_PREFIX
939
- #undef PACKET_DECL_COND_PREFIX
843
+ template <typename RealScalar, bool ConjRhs_, int Arch, int PacketSize_>
844
+ class gebp_traits<RealScalar, std::complex<RealScalar>, false, ConjRhs_, Arch, PacketSize_> {
845
+ public:
846
+ typedef std::complex<RealScalar> Scalar;
847
+ typedef RealScalar LhsScalar;
848
+ typedef Scalar RhsScalar;
849
+ typedef Scalar ResScalar;
850
+
851
+ PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
852
+ PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
853
+ PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
854
+ PACKET_DECL_COND_POSTFIX(_, Real, PacketSize_);
855
+ PACKET_DECL_COND_SCALAR_POSTFIX(_, PacketSize_);
856
+
857
+ #undef PACKET_DECL_COND_SCALAR_POSTFIX
858
+ #undef PACKET_DECL_COND_POSTFIX
940
859
  #undef PACKET_DECL_COND_SCALAR
941
860
  #undef PACKET_DECL_COND
942
861
 
943
862
  enum {
944
863
  ConjLhs = false,
945
- ConjRhs = _ConjRhs,
946
- Vectorizable = unpacket_traits<_RealPacket>::vectorizable
947
- && unpacket_traits<_ScalarPacket>::vectorizable,
948
- LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
949
- RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
950
- ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
951
-
864
+ ConjRhs = ConjRhs_,
865
+ Vectorizable = unpacket_traits<RealPacket_>::vectorizable && unpacket_traits<ScalarPacket_>::vectorizable,
866
+ LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
867
+ RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
868
+ ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,
869
+
952
870
  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
953
871
  // FIXME: should depend on NumberOfRegisters
954
872
  nr = 4,
955
- mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,
873
+ mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * ResPacketSize,
956
874
 
957
875
  LhsProgress = ResPacketSize,
958
876
  RhsProgress = 1
959
877
  };
960
878
 
961
- typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
962
- typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
963
- typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
879
+ typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
880
+ typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
881
+ typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
964
882
  typedef LhsPacket LhsPacket4Packing;
965
883
  typedef QuadPacket<RhsPacket> RhsPacketx4;
966
884
  typedef ResPacket AccPacket;
967
885
 
968
- EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
969
- {
970
- p = pset1<ResPacket>(ResScalar(0));
971
- }
886
+ EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }
972
887
 
973
- template<typename RhsPacketType>
974
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
975
- {
888
+ template <typename RhsPacketType>
889
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const {
976
890
  dest = pset1<RhsPacketType>(*b);
977
891
  }
978
892
 
979
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
980
- {
893
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
981
894
  pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
982
895
  }
983
896
 
984
- template<typename RhsPacketType>
985
- EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
986
- {
897
+ template <typename RhsPacketType>
898
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const {
987
899
  loadRhs(b, dest);
988
900
  }
989
901
 
990
- EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
991
- {}
902
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
992
903
 
993
- EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
994
- {
995
- dest = ploaddup<LhsPacket>(a);
996
- }
997
-
998
- EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
999
- {
1000
- dest = ploadquad<RhsPacket>(b);
1001
- }
904
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = ploaddup<LhsPacket>(a); }
905
+
906
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad<RhsPacket>(b); }
1002
907
 
1003
- template<typename LhsPacketType>
1004
- EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
1005
- {
908
+ template <typename LhsPacketType>
909
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const {
1006
910
  dest = ploaddup<LhsPacketType>(a);
1007
911
  }
1008
912
 
1009
913
  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
1010
- EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
1011
- {
1012
- madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
914
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
915
+ const LaneIdType&) const {
916
+ madd_impl(a, b, c, tmp, std::conditional_t<Vectorizable, true_type, false_type>());
1013
917
  }
1014
918
 
1015
919
  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
1016
- EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
1017
- {
920
+ EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c,
921
+ RhsPacketType& tmp, const true_type&) const {
1018
922
  #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
1019
923
  EIGEN_UNUSED_VARIABLE(tmp);
1020
- c.v = pmadd(a,b.v,c.v);
924
+ c.v = pmadd(a, b.v, c.v);
1021
925
  #else
1022
- tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
926
+ tmp = b;
927
+ tmp.v = pmul(a, tmp.v);
928
+ c = padd(c, tmp);
1023
929
  #endif
1024
-
1025
930
  }
1026
931
 
1027
- EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
1028
- {
932
+ EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/,
933
+ const false_type&) const {
1029
934
  c += a * b;
1030
935
  }
1031
936
 
1032
- template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
1033
- EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
1034
- {
937
+ template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
938
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
939
+ const LaneIdType& lane) const {
1035
940
  madd(a, b.get(lane), c, tmp, lane);
1036
941
  }
1037
942
 
1038
943
  template <typename ResPacketType, typename AccPacketType>
1039
- EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
1040
- {
1041
- conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
1042
- r = cj.pmadd(alpha,c,r);
944
+ EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const {
945
+ conj_helper<ResPacketType, ResPacketType, false, ConjRhs> cj;
946
+ r = cj.pmadd(alpha, c, r);
1043
947
  }
1044
948
 
1045
- protected:
1046
-
949
+ protected:
1047
950
  };
1048
951
 
1049
952
  /* optimized General packed Block * packed Panel product kernel
@@ -1053,13 +956,15 @@ protected:
1053
956
  * |real |cplx | no vectorization yet, would require to pack A with duplication
1054
957
  * |cplx |real | easy vectorization
1055
958
  */
1056
- template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
1057
- struct gebp_kernel
1058
- {
1059
- typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
1060
- typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketHalf> HalfTraits;
1061
- typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketQuarter> QuarterTraits;
1062
-
959
+ template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
960
+ bool ConjugateLhs, bool ConjugateRhs>
961
+ struct gebp_kernel {
962
+ typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target> Traits;
963
+ typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target, GEBPPacketHalf>
964
+ HalfTraits;
965
+ typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target, GEBPPacketQuarter>
966
+ QuarterTraits;
967
+
1063
968
  typedef typename Traits::ResScalar ResScalar;
1064
969
  typedef typename Traits::LhsPacket LhsPacket;
1065
970
  typedef typename Traits::RhsPacket RhsPacket;
@@ -1068,8 +973,9 @@ struct gebp_kernel
1068
973
  typedef typename Traits::RhsPacketx4 RhsPacketx4;
1069
974
 
1070
975
  typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
976
+ typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 27>::type RhsPanel27;
1071
977
 
1072
- typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
978
+ typedef gebp_traits<RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target> SwappedTraits;
1073
979
 
1074
980
  typedef typename SwappedTraits::ResScalar SResScalar;
1075
981
  typedef typename SwappedTraits::LhsPacket SLhsPacket;
@@ -1090,28 +996,28 @@ struct gebp_kernel
1090
996
  typedef typename DataMapper::LinearMapper LinearMapper;
1091
997
 
1092
998
  enum {
1093
- Vectorizable = Traits::Vectorizable,
1094
- LhsProgress = Traits::LhsProgress,
1095
- LhsProgressHalf = HalfTraits::LhsProgress,
1096
- LhsProgressQuarter = QuarterTraits::LhsProgress,
1097
- RhsProgress = Traits::RhsProgress,
1098
- RhsProgressHalf = HalfTraits::RhsProgress,
1099
- RhsProgressQuarter = QuarterTraits::RhsProgress,
999
+ Vectorizable = Traits::Vectorizable,
1000
+ LhsProgress = Traits::LhsProgress,
1001
+ LhsProgressHalf = HalfTraits::LhsProgress,
1002
+ LhsProgressQuarter = QuarterTraits::LhsProgress,
1003
+ RhsProgress = Traits::RhsProgress,
1004
+ RhsProgressHalf = HalfTraits::RhsProgress,
1005
+ RhsProgressQuarter = QuarterTraits::RhsProgress,
1100
1006
  ResPacketSize = Traits::ResPacketSize
1101
1007
  };
1102
1008
 
1103
- EIGEN_DONT_INLINE
1104
- void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
1105
- Index rows, Index depth, Index cols, ResScalar alpha,
1106
- Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
1009
+ EIGEN_DONT_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, Index rows,
1010
+ Index depth, Index cols, ResScalar alpha, Index strideA = -1, Index strideB = -1,
1011
+ Index offsetA = 0, Index offsetB = 0);
1107
1012
  };
1108
1013
 
1109
- template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs,
1110
- int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress>
1111
- struct last_row_process_16_packets
1112
- {
1113
- typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
1114
- typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
1014
+ template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
1015
+ bool ConjugateLhs, bool ConjugateRhs,
1016
+ int SwappedLhsProgress =
1017
+ gebp_traits<RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target>::LhsProgress>
1018
+ struct last_row_process_16_packets {
1019
+ typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target> Traits;
1020
+ typedef gebp_traits<RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target> SwappedTraits;
1115
1021
 
1116
1022
  typedef typename Traits::ResScalar ResScalar;
1117
1023
  typedef typename SwappedTraits::LhsPacket SLhsPacket;
@@ -1119,28 +1025,27 @@ struct last_row_process_16_packets
1119
1025
  typedef typename SwappedTraits::ResPacket SResPacket;
1120
1026
  typedef typename SwappedTraits::AccPacket SAccPacket;
1121
1027
 
1122
- EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
1123
- const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
1124
- ResScalar alpha, SAccPacket &C0)
1125
- {
1126
- EIGEN_UNUSED_VARIABLE(res);
1127
- EIGEN_UNUSED_VARIABLE(straits);
1128
- EIGEN_UNUSED_VARIABLE(blA);
1129
- EIGEN_UNUSED_VARIABLE(blB);
1130
- EIGEN_UNUSED_VARIABLE(depth);
1131
- EIGEN_UNUSED_VARIABLE(endk);
1132
- EIGEN_UNUSED_VARIABLE(i);
1133
- EIGEN_UNUSED_VARIABLE(j2);
1134
- EIGEN_UNUSED_VARIABLE(alpha);
1135
- EIGEN_UNUSED_VARIABLE(C0);
1136
- }
1028
+ EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits& straits, const LhsScalar* blA,
1029
+ const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
1030
+ ResScalar alpha, SAccPacket& C0) {
1031
+ EIGEN_UNUSED_VARIABLE(res);
1032
+ EIGEN_UNUSED_VARIABLE(straits);
1033
+ EIGEN_UNUSED_VARIABLE(blA);
1034
+ EIGEN_UNUSED_VARIABLE(blB);
1035
+ EIGEN_UNUSED_VARIABLE(depth);
1036
+ EIGEN_UNUSED_VARIABLE(endk);
1037
+ EIGEN_UNUSED_VARIABLE(i);
1038
+ EIGEN_UNUSED_VARIABLE(j2);
1039
+ EIGEN_UNUSED_VARIABLE(alpha);
1040
+ EIGEN_UNUSED_VARIABLE(C0);
1041
+ }
1137
1042
  };
1138
1043
 
1139
-
1140
- template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
1141
- struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> {
1142
- typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
1143
- typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
1044
+ template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
1045
+ bool ConjugateLhs, bool ConjugateRhs>
1046
+ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> {
1047
+ typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target> Traits;
1048
+ typedef gebp_traits<RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target> SwappedTraits;
1144
1049
 
1145
1050
  typedef typename Traits::ResScalar ResScalar;
1146
1051
  typedef typename SwappedTraits::LhsPacket SLhsPacket;
@@ -1148,10 +1053,9 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr,
1148
1053
  typedef typename SwappedTraits::ResPacket SResPacket;
1149
1054
  typedef typename SwappedTraits::AccPacket SAccPacket;
1150
1055
 
1151
- EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
1152
- const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
1153
- ResScalar alpha, SAccPacket &C0)
1154
- {
1056
+ EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits& straits, const LhsScalar* blA,
1057
+ const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
1058
+ ResScalar alpha, SAccPacket& C0) {
1155
1059
  typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
1156
1060
  typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
1157
1061
  typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
@@ -1160,71 +1064,190 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr,
1160
1064
  SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
1161
1065
  SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);
1162
1066
 
1163
- if (depth - endk > 0)
1164
- {
1165
- // We have to handle the last row(s) of the rhs, which
1166
- // correspond to a half-packet
1167
- SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));
1168
-
1169
- for (Index kk = endk; kk < depth; kk++)
1170
- {
1171
- SLhsPacketQuarter a0;
1172
- SRhsPacketQuarter b0;
1173
- straits.loadLhsUnaligned(blB, a0);
1174
- straits.loadRhs(blA, b0);
1175
- straits.madd(a0,b0,c0,b0, fix<0>);
1176
- blB += SwappedTraits::LhsProgress/4;
1177
- blA += 1;
1178
- }
1179
- straits.acc(c0, alphav, R);
1180
- }
1181
- else
1182
- {
1183
- straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
1067
+ if (depth - endk > 0) {
1068
+ // We have to handle the last row(s) of the rhs, which
1069
+ // correspond to a half-packet
1070
+ SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));
1071
+
1072
+ for (Index kk = endk; kk < depth; kk++) {
1073
+ SLhsPacketQuarter a0;
1074
+ SRhsPacketQuarter b0;
1075
+ straits.loadLhsUnaligned(blB, a0);
1076
+ straits.loadRhs(blA, b0);
1077
+ straits.madd(a0, b0, c0, b0, fix<0>);
1078
+ blB += SwappedTraits::LhsProgress / 4;
1079
+ blA += 1;
1184
1080
  }
1081
+ straits.acc(c0, alphav, R);
1082
+ } else {
1083
+ straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
1084
+ }
1185
1085
  res.scatterPacket(i, j2, R);
1186
1086
  }
1187
1087
  };
1188
1088
 
1189
- template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
1190
- struct lhs_process_one_packet
1191
- {
1089
+ template <int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
1090
+ typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits,
1091
+ typename LinearMapper, typename DataMapper>
1092
+ struct lhs_process_one_packet {
1192
1093
  typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
1193
1094
 
1194
- EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
1195
- {
1095
+ EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits,
1096
+ LhsPacket* A0, RhsPacketx4* rhs_panel, RhsPacket* T0, AccPacket* C0,
1097
+ AccPacket* C1, AccPacket* C2, AccPacket* C3) {
1196
1098
  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
1197
1099
  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
1198
- traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
1199
- traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
1100
+ traits.loadLhs(&blA[(0 + 1 * K) * LhsProgress], *A0);
1101
+ traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], *rhs_panel);
1200
1102
  traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
1201
1103
  traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
1202
1104
  traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
1203
1105
  traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
1204
- #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
1205
- __asm__ ("" : "+x,m" (*A0));
1206
- #endif
1106
+ #if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
1107
+ __asm__("" : "+x,m"(*A0));
1108
+ #endif
1207
1109
  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
1208
1110
  }
1209
1111
 
1210
- EIGEN_STRONG_INLINE void operator()(
1211
- const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
1212
- Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB,
1213
- int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
1214
- {
1112
+ EIGEN_STRONG_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
1113
+ ResScalar alpha, Index peelStart, Index peelEnd, Index strideA, Index strideB,
1114
+ Index offsetA, Index offsetB, int prefetch_res_offset, Index peeled_kc, Index pk,
1115
+ Index cols, Index depth, Index packet_cols4) {
1215
1116
  GEBPTraits traits;
1216
-
1117
+ Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
1217
1118
  // loops on each largest micro horizontal panel of lhs
1218
1119
  // (LhsProgress x depth)
1219
- for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
1220
- {
1120
+ for (Index i = peelStart; i < peelEnd; i += LhsProgress) {
1121
+ #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
1122
+ EIGEN_IF_CONSTEXPR(nr >= 8) {
1123
+ for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
1124
+ const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
1125
+ prefetch(&blA[0]);
1126
+
1127
+ // gets res block as register
1128
+ AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
1129
+ traits.initAcc(C0);
1130
+ traits.initAcc(C1);
1131
+ traits.initAcc(C2);
1132
+ traits.initAcc(C3);
1133
+ traits.initAcc(C4);
1134
+ traits.initAcc(C5);
1135
+ traits.initAcc(C6);
1136
+ traits.initAcc(C7);
1137
+
1138
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1139
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1140
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1141
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1142
+ LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
1143
+ LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
1144
+ LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
1145
+ LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
1146
+ r0.prefetch(prefetch_res_offset);
1147
+ r1.prefetch(prefetch_res_offset);
1148
+ r2.prefetch(prefetch_res_offset);
1149
+ r3.prefetch(prefetch_res_offset);
1150
+ r4.prefetch(prefetch_res_offset);
1151
+ r5.prefetch(prefetch_res_offset);
1152
+ r6.prefetch(prefetch_res_offset);
1153
+ r7.prefetch(prefetch_res_offset);
1154
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
1155
+ prefetch(&blB[0]);
1156
+
1157
+ LhsPacket A0;
1158
+ for (Index k = 0; k < peeled_kc; k += pk) {
1159
+ RhsPacketx4 rhs_panel;
1160
+ RhsPacket T0;
1161
+ #define EIGEN_GEBGP_ONESTEP(K) \
1162
+ do { \
1163
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX8"); \
1164
+ traits.loadLhs(&blA[(0 + 1 * K) * LhsProgress], A0); \
1165
+ traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel); \
1166
+ traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1167
+ traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel); \
1168
+ traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1169
+ traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel); \
1170
+ traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1171
+ traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel); \
1172
+ traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1173
+ traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel); \
1174
+ traits.madd(A0, rhs_panel, C4, T0, fix<0>); \
1175
+ traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel); \
1176
+ traits.madd(A0, rhs_panel, C5, T0, fix<1>); \
1177
+ traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel); \
1178
+ traits.madd(A0, rhs_panel, C6, T0, fix<2>); \
1179
+ traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel); \
1180
+ traits.madd(A0, rhs_panel, C7, T0, fix<3>); \
1181
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX8"); \
1182
+ } while (false)
1183
+
1184
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX8");
1185
+
1186
+ EIGEN_GEBGP_ONESTEP(0);
1187
+ EIGEN_GEBGP_ONESTEP(1);
1188
+ EIGEN_GEBGP_ONESTEP(2);
1189
+ EIGEN_GEBGP_ONESTEP(3);
1190
+ EIGEN_GEBGP_ONESTEP(4);
1191
+ EIGEN_GEBGP_ONESTEP(5);
1192
+ EIGEN_GEBGP_ONESTEP(6);
1193
+ EIGEN_GEBGP_ONESTEP(7);
1194
+
1195
+ blB += pk * 8 * RhsProgress;
1196
+ blA += pk * (1 * LhsProgress);
1197
+
1198
+ EIGEN_ASM_COMMENT("end gebp micro kernel 1pX8");
1199
+ }
1200
+ // process remaining peeled loop
1201
+ for (Index k = peeled_kc; k < depth; k++) {
1202
+ RhsPacketx4 rhs_panel;
1203
+ RhsPacket T0;
1204
+ EIGEN_GEBGP_ONESTEP(0);
1205
+ blB += 8 * RhsProgress;
1206
+ blA += 1 * LhsProgress;
1207
+ }
1208
+
1209
+ #undef EIGEN_GEBGP_ONESTEP
1210
+
1211
+ ResPacket R0, R1;
1212
+ ResPacket alphav = pset1<ResPacket>(alpha);
1213
+
1214
+ R0 = r0.template loadPacket<ResPacket>(0);
1215
+ R1 = r1.template loadPacket<ResPacket>(0);
1216
+ traits.acc(C0, alphav, R0);
1217
+ traits.acc(C1, alphav, R1);
1218
+ r0.storePacket(0, R0);
1219
+ r1.storePacket(0, R1);
1220
+
1221
+ R0 = r2.template loadPacket<ResPacket>(0);
1222
+ R1 = r3.template loadPacket<ResPacket>(0);
1223
+ traits.acc(C2, alphav, R0);
1224
+ traits.acc(C3, alphav, R1);
1225
+ r2.storePacket(0, R0);
1226
+ r3.storePacket(0, R1);
1227
+
1228
+ R0 = r4.template loadPacket<ResPacket>(0);
1229
+ R1 = r5.template loadPacket<ResPacket>(0);
1230
+ traits.acc(C4, alphav, R0);
1231
+ traits.acc(C5, alphav, R1);
1232
+ r4.storePacket(0, R0);
1233
+ r5.storePacket(0, R1);
1234
+
1235
+ R0 = r6.template loadPacket<ResPacket>(0);
1236
+ R1 = r7.template loadPacket<ResPacket>(0);
1237
+ traits.acc(C6, alphav, R0);
1238
+ traits.acc(C7, alphav, R1);
1239
+ r6.storePacket(0, R0);
1240
+ r7.storePacket(0, R1);
1241
+ }
1242
+ }
1243
+ #endif
1244
+
1221
1245
  // loops on each largest micro vertical panel of rhs (depth * nr)
1222
- for(Index j2=0; j2<packet_cols4; j2+=nr)
1223
- {
1246
+ for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
1224
1247
  // We select a LhsProgress x nr micro block of res
1225
1248
  // which is entirely stored into 1 x nr registers.
1226
1249
 
1227
- const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
1250
+ const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
1228
1251
  prefetch(&blA[0]);
1229
1252
 
1230
1253
  // gets res block as register
@@ -1235,7 +1258,7 @@ struct lhs_process_one_packet
1235
1258
  traits.initAcc(C3);
1236
1259
  // To improve instruction pipelining, let's double the accumulation registers:
1237
1260
  // even k will accumulate in C*, while odd k will accumulate in D*.
1238
- // This trick is crutial to get good performance with FMA, otherwise it is
1261
+ // This trick is crucial to get good performance with FMA, otherwise it is
1239
1262
  // actually faster to perform separated MUL+ADD because of a naturally
1240
1263
  // better instruction-level parallelism.
1241
1264
  AccPacket D0, D1, D2, D3;
@@ -1255,44 +1278,42 @@ struct lhs_process_one_packet
1255
1278
  r3.prefetch(prefetch_res_offset);
1256
1279
 
1257
1280
  // performs "inner" products
1258
- const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1281
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
1259
1282
  prefetch(&blB[0]);
1260
1283
  LhsPacket A0, A1;
1261
1284
 
1262
- for(Index k=0; k<peeled_kc; k+=pk)
1263
- {
1285
+ for (Index k = 0; k < peeled_kc; k += pk) {
1264
1286
  EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
1265
1287
  RhsPacketx4 rhs_panel;
1266
1288
  RhsPacket T0;
1267
1289
 
1268
- internal::prefetch(blB+(48+0));
1290
+ internal::prefetch(blB + (48 + 0));
1269
1291
  peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1270
1292
  peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1271
1293
  peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1272
1294
  peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1273
- internal::prefetch(blB+(48+16));
1295
+ internal::prefetch(blB + (48 + 16));
1274
1296
  peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1275
1297
  peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1276
1298
  peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1277
1299
  peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1278
1300
 
1279
- blB += pk*4*RhsProgress;
1280
- blA += pk*LhsProgress;
1301
+ blB += pk * 4 * RhsProgress;
1302
+ blA += pk * LhsProgress;
1281
1303
 
1282
1304
  EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
1283
1305
  }
1284
- C0 = padd(C0,D0);
1285
- C1 = padd(C1,D1);
1286
- C2 = padd(C2,D2);
1287
- C3 = padd(C3,D3);
1306
+ C0 = padd(C0, D0);
1307
+ C1 = padd(C1, D1);
1308
+ C2 = padd(C2, D2);
1309
+ C3 = padd(C3, D3);
1288
1310
 
1289
1311
  // process remaining peeled loop
1290
- for(Index k=peeled_kc; k<depth; k++)
1291
- {
1312
+ for (Index k = peeled_kc; k < depth; k++) {
1292
1313
  RhsPacketx4 rhs_panel;
1293
1314
  RhsPacket T0;
1294
1315
  peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1295
- blB += 4*RhsProgress;
1316
+ blB += 4 * RhsProgress;
1296
1317
  blA += LhsProgress;
1297
1318
  }
1298
1319
 
@@ -1302,23 +1323,22 @@ struct lhs_process_one_packet
1302
1323
  R0 = r0.template loadPacket<ResPacket>(0);
1303
1324
  R1 = r1.template loadPacket<ResPacket>(0);
1304
1325
  traits.acc(C0, alphav, R0);
1305
- traits.acc(C1, alphav, R1);
1326
+ traits.acc(C1, alphav, R1);
1306
1327
  r0.storePacket(0, R0);
1307
1328
  r1.storePacket(0, R1);
1308
1329
 
1309
1330
  R0 = r2.template loadPacket<ResPacket>(0);
1310
1331
  R1 = r3.template loadPacket<ResPacket>(0);
1311
- traits.acc(C2, alphav, R0);
1312
- traits.acc(C3, alphav, R1);
1332
+ traits.acc(C2, alphav, R0);
1333
+ traits.acc(C3, alphav, R1);
1313
1334
  r2.storePacket(0, R0);
1314
1335
  r3.storePacket(0, R1);
1315
1336
  }
1316
1337
 
1317
1338
  // Deal with remaining columns of the rhs
1318
- for(Index j2=packet_cols4; j2<cols; j2++)
1319
- {
1339
+ for (Index j2 = packet_cols4; j2 < cols; j2++) {
1320
1340
  // One column at a time
1321
- const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
1341
+ const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
1322
1342
  prefetch(&blA[0]);
1323
1343
 
1324
1344
  // gets res block as register
@@ -1328,24 +1348,23 @@ struct lhs_process_one_packet
1328
1348
  LinearMapper r0 = res.getLinearMapper(i, j2);
1329
1349
 
1330
1350
  // performs "inner" products
1331
- const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1351
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
1332
1352
  LhsPacket A0;
1333
1353
 
1334
- for(Index k= 0; k<peeled_kc; k+=pk)
1335
- {
1354
+ for (Index k = 0; k < peeled_kc; k += pk) {
1336
1355
  EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
1337
1356
  RhsPacket B_0;
1338
1357
 
1339
- #define EIGEN_GEBGP_ONESTEP(K) \
1340
- do { \
1341
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
1342
- EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1343
- /* FIXME: why unaligned???? */ \
1344
- traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
1345
- traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1346
- traits.madd(A0, B_0, C0, B_0, fix<0>); \
1347
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
1348
- } while(false);
1358
+ #define EIGEN_GEBGP_ONESTEP(K) \
1359
+ do { \
1360
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
1361
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1362
+ /* FIXME: why unaligned???? */ \
1363
+ traits.loadLhsUnaligned(&blA[(0 + 1 * K) * LhsProgress], A0); \
1364
+ traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
1365
+ traits.madd(A0, B_0, C0, B_0, fix<0>); \
1366
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
1367
+ } while (false);
1349
1368
 
1350
1369
  EIGEN_GEBGP_ONESTEP(0);
1351
1370
  EIGEN_GEBGP_ONESTEP(1);
@@ -1356,15 +1375,14 @@ struct lhs_process_one_packet
1356
1375
  EIGEN_GEBGP_ONESTEP(6);
1357
1376
  EIGEN_GEBGP_ONESTEP(7);
1358
1377
 
1359
- blB += pk*RhsProgress;
1360
- blA += pk*LhsProgress;
1378
+ blB += pk * RhsProgress;
1379
+ blA += pk * LhsProgress;
1361
1380
 
1362
1381
  EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
1363
1382
  }
1364
1383
 
1365
1384
  // process remaining peeled loop
1366
- for(Index k=peeled_kc; k<depth; k++)
1367
- {
1385
+ for (Index k = peeled_kc; k < depth; k++) {
1368
1386
  RhsPacket B_0;
1369
1387
  EIGEN_GEBGP_ONESTEP(0);
1370
1388
  blB += RhsProgress;
@@ -1381,84 +1399,321 @@ struct lhs_process_one_packet
1381
1399
  }
1382
1400
  };
1383
1401
 
1384
- template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
1385
- struct lhs_process_fraction_of_packet : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
1386
- {
1387
-
1388
- EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
1389
- {
1390
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
1391
- EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
1392
- traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
1393
- traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
1394
- traits.madd(*A0, *B_0, *C0, *B_0);
1395
- traits.madd(*A0, *B1, *C1, *B1);
1396
- traits.madd(*A0, *B2, *C2, *B2);
1397
- traits.madd(*A0, *B3, *C3, *B3);
1398
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
1402
+ template <int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
1403
+ typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits,
1404
+ typename LinearMapper, typename DataMapper>
1405
+ struct lhs_process_fraction_of_packet
1406
+ : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket,
1407
+ RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper> {
1408
+ EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits,
1409
+ LhsPacket* A0, RhsPacket* B_0, RhsPacket* B1, RhsPacket* B2, RhsPacket* B3,
1410
+ AccPacket* C0, AccPacket* C1, AccPacket* C2, AccPacket* C3) {
1411
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
1412
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
1413
+ traits.loadLhsUnaligned(&blA[(0 + 1 * K) * (LhsProgress)], *A0);
1414
+ traits.broadcastRhs(&blB[(0 + 4 * K) * RhsProgress], *B_0, *B1, *B2, *B3);
1415
+ traits.madd(*A0, *B_0, *C0, *B_0);
1416
+ traits.madd(*A0, *B1, *C1, *B1);
1417
+ traits.madd(*A0, *B2, *C2, *B2);
1418
+ traits.madd(*A0, *B3, *C3, *B3);
1419
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
1399
1420
  }
1400
1421
  };
1401
1422
 
1402
- template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
1403
- EIGEN_DONT_INLINE
1404
- void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
1405
- ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
1406
- Index rows, Index depth, Index cols, ResScalar alpha,
1407
- Index strideA, Index strideB, Index offsetA, Index offsetB)
1408
- {
1409
- Traits traits;
1410
- SwappedTraits straits;
1411
-
1412
- if(strideA==-1) strideA = depth;
1413
- if(strideB==-1) strideB = depth;
1414
- conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
1415
- Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
1416
- const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
1417
- const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
1418
- const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
1419
- const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
1420
- const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
1421
- enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
1422
- const Index peeled_kc = depth & ~(pk-1);
1423
- const int prefetch_res_offset = 32/sizeof(ResScalar);
1424
- // const Index depth2 = depth & ~1;
1425
-
1426
- //---------- Process 3 * LhsProgress rows at once ----------
1427
- // This corresponds to 3*LhsProgress x nr register blocks.
1428
- // Usually, make sense only with FMA
1429
- if(mr>=3*Traits::LhsProgress)
1430
- {
1431
- // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
1432
- // and on each largest micro vertical panel of the rhs (depth * nr).
1433
- // Blocking sizes, i.e., 'depth' has been computed so that the micro horizontal panel of the lhs fit in L1.
1434
- // However, if depth is too small, we can extend the number of rows of these horizontal panels.
1435
- // This actual number of rows is computed as follow:
1436
- const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
1437
- // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
1438
- // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
1439
- // or because we are testing specific blocking sizes.
1440
- const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
1441
- for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
1442
- {
1443
- const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
1444
- for(Index j2=0; j2<packet_cols4; j2+=nr)
1445
- {
1446
- for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1447
- {
1448
-
1423
+ template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
1424
+ bool ConjugateLhs, bool ConjugateRhs>
1425
+ EIGEN_DONT_INLINE void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs,
1426
+ ConjugateRhs>::operator()(const DataMapper& res, const LhsScalar* blockA,
1427
+ const RhsScalar* blockB, Index rows, Index depth,
1428
+ Index cols, ResScalar alpha, Index strideA, Index strideB,
1429
+ Index offsetA, Index offsetB) {
1430
+ Traits traits;
1431
+ SwappedTraits straits;
1432
+
1433
+ if (strideA == -1) strideA = depth;
1434
+ if (strideB == -1) strideB = depth;
1435
+ conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
1436
+ Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
1437
+ Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
1438
+ const Index peeled_mc3 = mr >= 3 * Traits::LhsProgress ? (rows / (3 * LhsProgress)) * (3 * LhsProgress) : 0;
1439
+ const Index peeled_mc2 =
1440
+ mr >= 2 * Traits::LhsProgress ? peeled_mc3 + ((rows - peeled_mc3) / (2 * LhsProgress)) * (2 * LhsProgress) : 0;
1441
+ const Index peeled_mc1 =
1442
+ mr >= 1 * Traits::LhsProgress ? peeled_mc2 + ((rows - peeled_mc2) / (1 * LhsProgress)) * (1 * LhsProgress) : 0;
1443
+ const Index peeled_mc_half =
1444
+ mr >= LhsProgressHalf ? peeled_mc1 + ((rows - peeled_mc1) / (LhsProgressHalf)) * (LhsProgressHalf) : 0;
1445
+ const Index peeled_mc_quarter =
1446
+ mr >= LhsProgressQuarter
1447
+ ? peeled_mc_half + ((rows - peeled_mc_half) / (LhsProgressQuarter)) * (LhsProgressQuarter)
1448
+ : 0;
1449
+ enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
1450
+ const Index peeled_kc = depth & ~(pk - 1);
1451
+ const int prefetch_res_offset = 32 / sizeof(ResScalar);
1452
+ // const Index depth2 = depth & ~1;
1453
+
1454
+ //---------- Process 3 * LhsProgress rows at once ----------
1455
+ // This corresponds to 3*LhsProgress x nr register blocks.
1456
+ // Usually, make sense only with FMA
1457
+ if (mr >= 3 * Traits::LhsProgress) {
1458
+ // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x
1459
+ // depth) and on each largest micro vertical panel of the rhs (depth * nr). Blocking sizes, i.e., 'depth' has been
1460
+ // computed so that the micro horizontal panel of the lhs fit in L1. However, if depth is too small, we can extend
1461
+ // the number of rows of these horizontal panels. This actual number of rows is computed as follow:
1462
+ const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
1463
+ // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
1464
+ // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only
1465
+ // guess), or because we are testing specific blocking sizes.
1466
+ const Index actual_panel_rows =
1467
+ (3 * LhsProgress) * std::max<Index>(1, ((l1 - sizeof(ResScalar) * mr * nr - depth * nr * sizeof(RhsScalar)) /
1468
+ (depth * sizeof(LhsScalar) * 3 * LhsProgress)));
1469
+ for (Index i1 = 0; i1 < peeled_mc3; i1 += actual_panel_rows) {
1470
+ const Index actual_panel_end = (std::min)(i1 + actual_panel_rows, peeled_mc3);
1471
+ #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
1472
+ EIGEN_IF_CONSTEXPR(nr >= 8) {
1473
+ for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
1474
+ for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
1475
+ const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * LhsProgress)];
1476
+ prefetch(&blA[0]);
1477
+ // gets res block as register
1478
+ AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11, C12, C13, C14, C15, C16, C17, C18, C19, C20,
1479
+ C21, C22, C23;
1480
+ traits.initAcc(C0);
1481
+ traits.initAcc(C1);
1482
+ traits.initAcc(C2);
1483
+ traits.initAcc(C3);
1484
+ traits.initAcc(C4);
1485
+ traits.initAcc(C5);
1486
+ traits.initAcc(C6);
1487
+ traits.initAcc(C7);
1488
+ traits.initAcc(C8);
1489
+ traits.initAcc(C9);
1490
+ traits.initAcc(C10);
1491
+ traits.initAcc(C11);
1492
+ traits.initAcc(C12);
1493
+ traits.initAcc(C13);
1494
+ traits.initAcc(C14);
1495
+ traits.initAcc(C15);
1496
+ traits.initAcc(C16);
1497
+ traits.initAcc(C17);
1498
+ traits.initAcc(C18);
1499
+ traits.initAcc(C19);
1500
+ traits.initAcc(C20);
1501
+ traits.initAcc(C21);
1502
+ traits.initAcc(C22);
1503
+ traits.initAcc(C23);
1504
+
1505
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1506
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1507
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1508
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1509
+ LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
1510
+ LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
1511
+ LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
1512
+ LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
1513
+
1514
+ r0.prefetch(0);
1515
+ r1.prefetch(0);
1516
+ r2.prefetch(0);
1517
+ r3.prefetch(0);
1518
+ r4.prefetch(0);
1519
+ r5.prefetch(0);
1520
+ r6.prefetch(0);
1521
+ r7.prefetch(0);
1522
+
1523
+ // performs "inner" products
1524
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
1525
+ prefetch(&blB[0]);
1526
+ LhsPacket A0, A1;
1527
+ for (Index k = 0; k < peeled_kc; k += pk) {
1528
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX8");
1529
+ // 27 registers are taken (24 for acc, 3 for lhs).
1530
+ RhsPanel27 rhs_panel;
1531
+ RhsPacket T0;
1532
+ LhsPacket A2;
1533
+ #if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
1534
+ // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
1535
+ // without this workaround A0, A1, and A2 are loaded in the same register,
1536
+ // which is not good for pipelining
1537
+ #define EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND __asm__("" : "+w,m"(A0), "+w,m"(A1), "+w,m"(A2));
1538
+ #else
1539
+ #define EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND
1540
+ #endif
1541
+
1542
+ #define EIGEN_GEBP_ONESTEP(K) \
1543
+ do { \
1544
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX8"); \
1545
+ traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1546
+ traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1547
+ traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1548
+ EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND traits.loadRhs(blB + (0 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1549
+ traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1550
+ traits.madd(A1, rhs_panel, C8, T0, fix<0>); \
1551
+ traits.madd(A2, rhs_panel, C16, T0, fix<0>); \
1552
+ traits.updateRhs(blB + (1 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1553
+ traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1554
+ traits.madd(A1, rhs_panel, C9, T0, fix<1>); \
1555
+ traits.madd(A2, rhs_panel, C17, T0, fix<1>); \
1556
+ traits.updateRhs(blB + (2 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1557
+ traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1558
+ traits.madd(A1, rhs_panel, C10, T0, fix<2>); \
1559
+ traits.madd(A2, rhs_panel, C18, T0, fix<2>); \
1560
+ traits.updateRhs(blB + (3 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1561
+ traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1562
+ traits.madd(A1, rhs_panel, C11, T0, fix<3>); \
1563
+ traits.madd(A2, rhs_panel, C19, T0, fix<3>); \
1564
+ traits.loadRhs(blB + (4 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1565
+ traits.madd(A0, rhs_panel, C4, T0, fix<0>); \
1566
+ traits.madd(A1, rhs_panel, C12, T0, fix<0>); \
1567
+ traits.madd(A2, rhs_panel, C20, T0, fix<0>); \
1568
+ traits.updateRhs(blB + (5 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1569
+ traits.madd(A0, rhs_panel, C5, T0, fix<1>); \
1570
+ traits.madd(A1, rhs_panel, C13, T0, fix<1>); \
1571
+ traits.madd(A2, rhs_panel, C21, T0, fix<1>); \
1572
+ traits.updateRhs(blB + (6 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1573
+ traits.madd(A0, rhs_panel, C6, T0, fix<2>); \
1574
+ traits.madd(A1, rhs_panel, C14, T0, fix<2>); \
1575
+ traits.madd(A2, rhs_panel, C22, T0, fix<2>); \
1576
+ traits.updateRhs(blB + (7 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1577
+ traits.madd(A0, rhs_panel, C7, T0, fix<3>); \
1578
+ traits.madd(A1, rhs_panel, C15, T0, fix<3>); \
1579
+ traits.madd(A2, rhs_panel, C23, T0, fix<3>); \
1580
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX8"); \
1581
+ } while (false)
1582
+
1583
+ EIGEN_GEBP_ONESTEP(0);
1584
+ EIGEN_GEBP_ONESTEP(1);
1585
+ EIGEN_GEBP_ONESTEP(2);
1586
+ EIGEN_GEBP_ONESTEP(3);
1587
+ EIGEN_GEBP_ONESTEP(4);
1588
+ EIGEN_GEBP_ONESTEP(5);
1589
+ EIGEN_GEBP_ONESTEP(6);
1590
+ EIGEN_GEBP_ONESTEP(7);
1591
+
1592
+ blB += pk * 8 * RhsProgress;
1593
+ blA += pk * 3 * Traits::LhsProgress;
1594
+ EIGEN_ASM_COMMENT("end gebp micro kernel 3pX8");
1595
+ }
1596
+
1597
+ // process remaining peeled loop
1598
+ for (Index k = peeled_kc; k < depth; k++) {
1599
+ RhsPanel27 rhs_panel;
1600
+ RhsPacket T0;
1601
+ LhsPacket A2;
1602
+ EIGEN_GEBP_ONESTEP(0);
1603
+ blB += 8 * RhsProgress;
1604
+ blA += 3 * Traits::LhsProgress;
1605
+ }
1606
+
1607
+ #undef EIGEN_GEBP_ONESTEP
1608
+
1609
+ ResPacket R0, R1, R2;
1610
+ ResPacket alphav = pset1<ResPacket>(alpha);
1611
+
1612
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1613
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1614
+ R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1615
+ traits.acc(C0, alphav, R0);
1616
+ traits.acc(C8, alphav, R1);
1617
+ traits.acc(C16, alphav, R2);
1618
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
1619
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
1620
+ r0.storePacket(2 * Traits::ResPacketSize, R2);
1621
+
1622
+ R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1623
+ R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1624
+ R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1625
+ traits.acc(C1, alphav, R0);
1626
+ traits.acc(C9, alphav, R1);
1627
+ traits.acc(C17, alphav, R2);
1628
+ r1.storePacket(0 * Traits::ResPacketSize, R0);
1629
+ r1.storePacket(1 * Traits::ResPacketSize, R1);
1630
+ r1.storePacket(2 * Traits::ResPacketSize, R2);
1631
+
1632
+ R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1633
+ R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1634
+ R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1635
+ traits.acc(C2, alphav, R0);
1636
+ traits.acc(C10, alphav, R1);
1637
+ traits.acc(C18, alphav, R2);
1638
+ r2.storePacket(0 * Traits::ResPacketSize, R0);
1639
+ r2.storePacket(1 * Traits::ResPacketSize, R1);
1640
+ r2.storePacket(2 * Traits::ResPacketSize, R2);
1641
+
1642
+ R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1643
+ R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1644
+ R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1645
+ traits.acc(C3, alphav, R0);
1646
+ traits.acc(C11, alphav, R1);
1647
+ traits.acc(C19, alphav, R2);
1648
+ r3.storePacket(0 * Traits::ResPacketSize, R0);
1649
+ r3.storePacket(1 * Traits::ResPacketSize, R1);
1650
+ r3.storePacket(2 * Traits::ResPacketSize, R2);
1651
+
1652
+ R0 = r4.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1653
+ R1 = r4.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1654
+ R2 = r4.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1655
+ traits.acc(C4, alphav, R0);
1656
+ traits.acc(C12, alphav, R1);
1657
+ traits.acc(C20, alphav, R2);
1658
+ r4.storePacket(0 * Traits::ResPacketSize, R0);
1659
+ r4.storePacket(1 * Traits::ResPacketSize, R1);
1660
+ r4.storePacket(2 * Traits::ResPacketSize, R2);
1661
+
1662
+ R0 = r5.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1663
+ R1 = r5.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1664
+ R2 = r5.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1665
+ traits.acc(C5, alphav, R0);
1666
+ traits.acc(C13, alphav, R1);
1667
+ traits.acc(C21, alphav, R2);
1668
+ r5.storePacket(0 * Traits::ResPacketSize, R0);
1669
+ r5.storePacket(1 * Traits::ResPacketSize, R1);
1670
+ r5.storePacket(2 * Traits::ResPacketSize, R2);
1671
+
1672
+ R0 = r6.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1673
+ R1 = r6.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1674
+ R2 = r6.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1675
+ traits.acc(C6, alphav, R0);
1676
+ traits.acc(C14, alphav, R1);
1677
+ traits.acc(C22, alphav, R2);
1678
+ r6.storePacket(0 * Traits::ResPacketSize, R0);
1679
+ r6.storePacket(1 * Traits::ResPacketSize, R1);
1680
+ r6.storePacket(2 * Traits::ResPacketSize, R2);
1681
+
1682
+ R0 = r7.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1683
+ R1 = r7.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1684
+ R2 = r7.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1685
+ traits.acc(C7, alphav, R0);
1686
+ traits.acc(C15, alphav, R1);
1687
+ traits.acc(C23, alphav, R2);
1688
+ r7.storePacket(0 * Traits::ResPacketSize, R0);
1689
+ r7.storePacket(1 * Traits::ResPacketSize, R1);
1690
+ r7.storePacket(2 * Traits::ResPacketSize, R2);
1691
+ }
1692
+ }
1693
+ }
1694
+ #endif
1695
+ for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
1696
+ for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
1449
1697
  // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
1450
1698
  // stored into 3 x nr registers.
1451
-
1452
- const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
1699
+
1700
+ const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * LhsProgress)];
1453
1701
  prefetch(&blA[0]);
1454
1702
 
1455
1703
  // gets res block as register
1456
- AccPacket C0, C1, C2, C3,
1457
- C4, C5, C6, C7,
1458
- C8, C9, C10, C11;
1459
- traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
1460
- traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
1461
- traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
1704
+ AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11;
1705
+ traits.initAcc(C0);
1706
+ traits.initAcc(C1);
1707
+ traits.initAcc(C2);
1708
+ traits.initAcc(C3);
1709
+ traits.initAcc(C4);
1710
+ traits.initAcc(C5);
1711
+ traits.initAcc(C6);
1712
+ traits.initAcc(C7);
1713
+ traits.initAcc(C8);
1714
+ traits.initAcc(C9);
1715
+ traits.initAcc(C10);
1716
+ traits.initAcc(C11);
1462
1717
 
1463
1718
  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1464
1719
  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
@@ -1471,55 +1726,54 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1471
1726
  r3.prefetch(0);
1472
1727
 
1473
1728
  // performs "inner" products
1474
- const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1729
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
1475
1730
  prefetch(&blB[0]);
1476
1731
  LhsPacket A0, A1;
1477
1732
 
1478
- for(Index k=0; k<peeled_kc; k+=pk)
1479
- {
1733
+ for (Index k = 0; k < peeled_kc; k += pk) {
1480
1734
  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
1481
- // 15 registers are taken (12 for acc, 2 for lhs).
1735
+ // 15 registers are taken (12 for acc, 3 for lhs).
1482
1736
  RhsPanel15 rhs_panel;
1483
1737
  RhsPacket T0;
1484
1738
  LhsPacket A2;
1485
- #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
1486
- // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
1487
- // without this workaround A0, A1, and A2 are loaded in the same register,
1488
- // which is not good for pipelining
1489
- #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
1490
- #else
1491
- #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
1492
- #endif
1493
- #define EIGEN_GEBP_ONESTEP(K) \
1494
- do { \
1495
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
1496
- EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1497
- internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
1498
- if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
1499
- internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
1500
- } /* Bug 953 */ \
1501
- traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1502
- traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1503
- traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1504
- EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
1505
- traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \
1506
- traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1507
- traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1508
- traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
1509
- traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \
1510
- traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1511
- traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1512
- traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
1513
- traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \
1514
- traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1515
- traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1516
- traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
1517
- traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \
1518
- traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1519
- traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1520
- traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
1521
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
1522
- } while (false)
1739
+ #if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
1740
+ // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
1741
+ // without this workaround A0, A1, and A2 are loaded in the same register,
1742
+ // which is not good for pipelining
1743
+ #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__("" : "+w,m"(A0), "+w,m"(A1), "+w,m"(A2));
1744
+ #else
1745
+ #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
1746
+ #endif
1747
+ #define EIGEN_GEBP_ONESTEP(K) \
1748
+ do { \
1749
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
1750
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1751
+ internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
1752
+ if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
1753
+ internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
1754
+ } /* Bug 953 */ \
1755
+ traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1756
+ traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1757
+ traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1758
+ EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
1759
+ traits.loadRhs(blB + (0 + 4 * K) * Traits::RhsProgress, rhs_panel); \
1760
+ traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1761
+ traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1762
+ traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
1763
+ traits.updateRhs(blB + (1 + 4 * K) * Traits::RhsProgress, rhs_panel); \
1764
+ traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1765
+ traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1766
+ traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
1767
+ traits.updateRhs(blB + (2 + 4 * K) * Traits::RhsProgress, rhs_panel); \
1768
+ traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1769
+ traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1770
+ traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
1771
+ traits.updateRhs(blB + (3 + 4 * K) * Traits::RhsProgress, rhs_panel); \
1772
+ traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1773
+ traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1774
+ traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
1775
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
1776
+ } while (false)
1523
1777
 
1524
1778
  internal::prefetch(blB);
1525
1779
  EIGEN_GEBP_ONESTEP(0);
@@ -1531,20 +1785,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1531
1785
  EIGEN_GEBP_ONESTEP(6);
1532
1786
  EIGEN_GEBP_ONESTEP(7);
1533
1787
 
1534
- blB += pk*4*RhsProgress;
1535
- blA += pk*3*Traits::LhsProgress;
1788
+ blB += pk * 4 * RhsProgress;
1789
+ blA += pk * 3 * Traits::LhsProgress;
1536
1790
 
1537
1791
  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
1538
1792
  }
1539
1793
  // process remaining peeled loop
1540
- for(Index k=peeled_kc; k<depth; k++)
1541
- {
1794
+ for (Index k = peeled_kc; k < depth; k++) {
1542
1795
  RhsPanel15 rhs_panel;
1543
1796
  RhsPacket T0;
1544
1797
  LhsPacket A2;
1545
1798
  EIGEN_GEBP_ONESTEP(0);
1546
- blB += 4*RhsProgress;
1547
- blA += 3*Traits::LhsProgress;
1799
+ blB += 4 * RhsProgress;
1800
+ blA += 3 * Traits::LhsProgress;
1548
1801
  }
1549
1802
 
1550
1803
  #undef EIGEN_GEBP_ONESTEP
@@ -1590,17 +1843,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1590
1843
  traits.acc(C11, alphav, R2);
1591
1844
  r3.storePacket(0 * Traits::ResPacketSize, R0);
1592
1845
  r3.storePacket(1 * Traits::ResPacketSize, R1);
1593
- r3.storePacket(2 * Traits::ResPacketSize, R2);
1594
- }
1846
+ r3.storePacket(2 * Traits::ResPacketSize, R2);
1595
1847
  }
1848
+ }
1596
1849
 
1597
- // Deal with remaining columns of the rhs
1598
- for(Index j2=packet_cols4; j2<cols; j2++)
1599
- {
1600
- for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1601
- {
1850
+ // Deal with remaining columns of the rhs
1851
+ for (Index j2 = packet_cols4; j2 < cols; j2++) {
1852
+ for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
1602
1853
  // One column at a time
1603
- const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
1854
+ const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * Traits::LhsProgress)];
1604
1855
  prefetch(&blA[0]);
1605
1856
 
1606
1857
  // gets res block as register
@@ -1613,26 +1864,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1613
1864
  r0.prefetch(0);
1614
1865
 
1615
1866
  // performs "inner" products
1616
- const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1867
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
1617
1868
  LhsPacket A0, A1, A2;
1618
-
1619
- for(Index k=0; k<peeled_kc; k+=pk)
1620
- {
1869
+
1870
+ for (Index k = 0; k < peeled_kc; k += pk) {
1621
1871
  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
1622
1872
  RhsPacket B_0;
1623
- #define EIGEN_GEBGP_ONESTEP(K) \
1624
- do { \
1625
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1626
- EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1627
- traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1628
- traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1629
- traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1630
- traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
1631
- traits.madd(A0, B_0, C0, B_0, fix<0>); \
1632
- traits.madd(A1, B_0, C4, B_0, fix<0>); \
1633
- traits.madd(A2, B_0, C8, B_0, fix<0>); \
1634
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1635
- } while (false)
1873
+ #define EIGEN_GEBGP_ONESTEP(K) \
1874
+ do { \
1875
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1876
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1877
+ traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1878
+ traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1879
+ traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1880
+ traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
1881
+ traits.madd(A0, B_0, C0, B_0, fix<0>); \
1882
+ traits.madd(A1, B_0, C4, B_0, fix<0>); \
1883
+ traits.madd(A2, B_0, C8, B_0, fix<0>); \
1884
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1885
+ } while (false)
1636
1886
 
1637
1887
  EIGEN_GEBGP_ONESTEP(0);
1638
1888
  EIGEN_GEBGP_ONESTEP(1);
@@ -1650,12 +1900,11 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1650
1900
  }
1651
1901
 
1652
1902
  // process remaining peeled loop
1653
- for(Index k=peeled_kc; k<depth; k++)
1654
- {
1903
+ for (Index k = peeled_kc; k < depth; k++) {
1655
1904
  RhsPacket B_0;
1656
1905
  EIGEN_GEBGP_ONESTEP(0);
1657
1906
  blB += RhsProgress;
1658
- blA += 3*Traits::LhsProgress;
1907
+ blA += 3 * Traits::LhsProgress;
1659
1908
  }
1660
1909
  #undef EIGEN_GEBGP_ONESTEP
1661
1910
  ResPacket R0, R1, R2;
@@ -1669,40 +1918,214 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1669
1918
  traits.acc(C8, alphav, R2);
1670
1919
  r0.storePacket(0 * Traits::ResPacketSize, R0);
1671
1920
  r0.storePacket(1 * Traits::ResPacketSize, R1);
1672
- r0.storePacket(2 * Traits::ResPacketSize, R2);
1673
- }
1921
+ r0.storePacket(2 * Traits::ResPacketSize, R2);
1674
1922
  }
1675
1923
  }
1676
1924
  }
1925
+ }
1677
1926
 
1678
- //---------- Process 2 * LhsProgress rows at once ----------
1679
- if(mr>=2*Traits::LhsProgress)
1680
- {
1681
- const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
1682
- // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
1683
- // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
1684
- // or because we are testing specific blocking sizes.
1685
- Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
1927
+ //---------- Process 2 * LhsProgress rows at once ----------
1928
+ if (mr >= 2 * Traits::LhsProgress) {
1929
+ const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
1930
+ // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
1931
+ // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only
1932
+ // guess), or because we are testing specific blocking sizes.
1933
+ Index actual_panel_rows =
1934
+ (2 * LhsProgress) * std::max<Index>(1, ((l1 - sizeof(ResScalar) * mr * nr - depth * nr * sizeof(RhsScalar)) /
1935
+ (depth * sizeof(LhsScalar) * 2 * LhsProgress)));
1936
+
1937
+ for (Index i1 = peeled_mc3; i1 < peeled_mc2; i1 += actual_panel_rows) {
1938
+ Index actual_panel_end = (std::min)(i1 + actual_panel_rows, peeled_mc2);
1939
+ #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
1940
+ EIGEN_IF_CONSTEXPR(nr >= 8) {
1941
+ for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
1942
+ for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
1943
+ const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];
1944
+ prefetch(&blA[0]);
1945
+
1946
+ AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11, C12, C13, C14, C15;
1947
+ traits.initAcc(C0);
1948
+ traits.initAcc(C1);
1949
+ traits.initAcc(C2);
1950
+ traits.initAcc(C3);
1951
+ traits.initAcc(C4);
1952
+ traits.initAcc(C5);
1953
+ traits.initAcc(C6);
1954
+ traits.initAcc(C7);
1955
+ traits.initAcc(C8);
1956
+ traits.initAcc(C9);
1957
+ traits.initAcc(C10);
1958
+ traits.initAcc(C11);
1959
+ traits.initAcc(C12);
1960
+ traits.initAcc(C13);
1961
+ traits.initAcc(C14);
1962
+ traits.initAcc(C15);
1963
+
1964
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1965
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1966
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1967
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1968
+ LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
1969
+ LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
1970
+ LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
1971
+ LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
1972
+ r0.prefetch(prefetch_res_offset);
1973
+ r1.prefetch(prefetch_res_offset);
1974
+ r2.prefetch(prefetch_res_offset);
1975
+ r3.prefetch(prefetch_res_offset);
1976
+ r4.prefetch(prefetch_res_offset);
1977
+ r5.prefetch(prefetch_res_offset);
1978
+ r6.prefetch(prefetch_res_offset);
1979
+ r7.prefetch(prefetch_res_offset);
1980
+
1981
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
1982
+ prefetch(&blB[0]);
1983
+ LhsPacket A0, A1;
1984
+ for (Index k = 0; k < peeled_kc; k += pk) {
1985
+ RhsPacketx4 rhs_panel;
1986
+ RhsPacket T0;
1987
+ // NOTE: the begin/end asm comments below work around bug 935!
1988
+ // but they are not enough for gcc>=6 without FMA (bug 1637)
1989
+ #if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE)
1990
+ #define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND __asm__("" : [a0] "+x,m"(A0), [a1] "+x,m"(A1));
1991
+ #else
1992
+ #define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND
1993
+ #endif
1994
+ #define EIGEN_GEBGP_ONESTEP(K) \
1995
+ do { \
1996
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX8"); \
1997
+ traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
1998
+ traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
1999
+ traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel); \
2000
+ traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
2001
+ traits.madd(A1, rhs_panel, C8, T0, fix<0>); \
2002
+ traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel); \
2003
+ traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
2004
+ traits.madd(A1, rhs_panel, C9, T0, fix<1>); \
2005
+ traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel); \
2006
+ traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
2007
+ traits.madd(A1, rhs_panel, C10, T0, fix<2>); \
2008
+ traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel); \
2009
+ traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
2010
+ traits.madd(A1, rhs_panel, C11, T0, fix<3>); \
2011
+ traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel); \
2012
+ traits.madd(A0, rhs_panel, C4, T0, fix<0>); \
2013
+ traits.madd(A1, rhs_panel, C12, T0, fix<0>); \
2014
+ traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel); \
2015
+ traits.madd(A0, rhs_panel, C5, T0, fix<1>); \
2016
+ traits.madd(A1, rhs_panel, C13, T0, fix<1>); \
2017
+ traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel); \
2018
+ traits.madd(A0, rhs_panel, C6, T0, fix<2>); \
2019
+ traits.madd(A1, rhs_panel, C14, T0, fix<2>); \
2020
+ traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel); \
2021
+ traits.madd(A0, rhs_panel, C7, T0, fix<3>); \
2022
+ traits.madd(A1, rhs_panel, C15, T0, fix<3>); \
2023
+ EIGEN_GEBP_2Px8_SPILLING_WORKAROUND EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX8"); \
2024
+ } while (false)
2025
+
2026
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX8");
2027
+
2028
+ EIGEN_GEBGP_ONESTEP(0);
2029
+ EIGEN_GEBGP_ONESTEP(1);
2030
+ EIGEN_GEBGP_ONESTEP(2);
2031
+ EIGEN_GEBGP_ONESTEP(3);
2032
+ EIGEN_GEBGP_ONESTEP(4);
2033
+ EIGEN_GEBGP_ONESTEP(5);
2034
+ EIGEN_GEBGP_ONESTEP(6);
2035
+ EIGEN_GEBGP_ONESTEP(7);
2036
+
2037
+ blB += pk * 8 * RhsProgress;
2038
+ blA += pk * (2 * Traits::LhsProgress);
2039
+
2040
+ EIGEN_ASM_COMMENT("end gebp micro kernel 2pX8");
2041
+ }
2042
+ // process remaining peeled loop
2043
+ for (Index k = peeled_kc; k < depth; k++) {
2044
+ RhsPacketx4 rhs_panel;
2045
+ RhsPacket T0;
2046
+ EIGEN_GEBGP_ONESTEP(0);
2047
+ blB += 8 * RhsProgress;
2048
+ blA += 2 * Traits::LhsProgress;
2049
+ }
1686
2050
 
1687
- for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
1688
- {
1689
- Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
1690
- for(Index j2=0; j2<packet_cols4; j2+=nr)
1691
- {
1692
- for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1693
- {
1694
-
2051
+ #undef EIGEN_GEBGP_ONESTEP
2052
+
2053
+ ResPacket R0, R1, R2, R3;
2054
+ ResPacket alphav = pset1<ResPacket>(alpha);
2055
+
2056
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2057
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2058
+ R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2059
+ R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2060
+ traits.acc(C0, alphav, R0);
2061
+ traits.acc(C8, alphav, R1);
2062
+ traits.acc(C1, alphav, R2);
2063
+ traits.acc(C9, alphav, R3);
2064
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
2065
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
2066
+ r1.storePacket(0 * Traits::ResPacketSize, R2);
2067
+ r1.storePacket(1 * Traits::ResPacketSize, R3);
2068
+
2069
+ R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2070
+ R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2071
+ R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2072
+ R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2073
+ traits.acc(C2, alphav, R0);
2074
+ traits.acc(C10, alphav, R1);
2075
+ traits.acc(C3, alphav, R2);
2076
+ traits.acc(C11, alphav, R3);
2077
+ r2.storePacket(0 * Traits::ResPacketSize, R0);
2078
+ r2.storePacket(1 * Traits::ResPacketSize, R1);
2079
+ r3.storePacket(0 * Traits::ResPacketSize, R2);
2080
+ r3.storePacket(1 * Traits::ResPacketSize, R3);
2081
+
2082
+ R0 = r4.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2083
+ R1 = r4.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2084
+ R2 = r5.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2085
+ R3 = r5.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2086
+ traits.acc(C4, alphav, R0);
2087
+ traits.acc(C12, alphav, R1);
2088
+ traits.acc(C5, alphav, R2);
2089
+ traits.acc(C13, alphav, R3);
2090
+ r4.storePacket(0 * Traits::ResPacketSize, R0);
2091
+ r4.storePacket(1 * Traits::ResPacketSize, R1);
2092
+ r5.storePacket(0 * Traits::ResPacketSize, R2);
2093
+ r5.storePacket(1 * Traits::ResPacketSize, R3);
2094
+
2095
+ R0 = r6.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2096
+ R1 = r6.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2097
+ R2 = r7.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2098
+ R3 = r7.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2099
+ traits.acc(C6, alphav, R0);
2100
+ traits.acc(C14, alphav, R1);
2101
+ traits.acc(C7, alphav, R2);
2102
+ traits.acc(C15, alphav, R3);
2103
+ r6.storePacket(0 * Traits::ResPacketSize, R0);
2104
+ r6.storePacket(1 * Traits::ResPacketSize, R1);
2105
+ r7.storePacket(0 * Traits::ResPacketSize, R2);
2106
+ r7.storePacket(1 * Traits::ResPacketSize, R3);
2107
+ }
2108
+ }
2109
+ }
2110
+ #endif
2111
+ for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
2112
+ for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
1695
2113
  // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
1696
2114
  // stored into 2 x nr registers.
1697
-
1698
- const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
2115
+
2116
+ const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];
1699
2117
  prefetch(&blA[0]);
1700
2118
 
1701
2119
  // gets res block as register
1702
- AccPacket C0, C1, C2, C3,
1703
- C4, C5, C6, C7;
1704
- traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
1705
- traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
2120
+ AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
2121
+ traits.initAcc(C0);
2122
+ traits.initAcc(C1);
2123
+ traits.initAcc(C2);
2124
+ traits.initAcc(C3);
2125
+ traits.initAcc(C4);
2126
+ traits.initAcc(C5);
2127
+ traits.initAcc(C6);
2128
+ traits.initAcc(C7);
1706
2129
 
1707
2130
  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1708
2131
  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
@@ -1715,65 +2138,63 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1715
2138
  r3.prefetch(prefetch_res_offset);
1716
2139
 
1717
2140
  // performs "inner" products
1718
- const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
2141
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
1719
2142
  prefetch(&blB[0]);
1720
2143
  LhsPacket A0, A1;
1721
2144
 
1722
- for(Index k=0; k<peeled_kc; k+=pk)
1723
- {
2145
+ for (Index k = 0; k < peeled_kc; k += pk) {
1724
2146
  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
1725
2147
  RhsPacketx4 rhs_panel;
1726
2148
  RhsPacket T0;
1727
2149
 
1728
- // NOTE: the begin/end asm comments below work around bug 935!
1729
- // but they are not enough for gcc>=6 without FMA (bug 1637)
1730
- #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
1731
- #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
1732
- #else
1733
- #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
1734
- #endif
1735
- #define EIGEN_GEBGP_ONESTEP(K) \
1736
- do { \
1737
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
1738
- traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
1739
- traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
1740
- traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
1741
- traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1742
- traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1743
- traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1744
- traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1745
- traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1746
- traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1747
- traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1748
- traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1749
- EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
1750
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
1751
- } while (false)
1752
-
1753
- internal::prefetch(blB+(48+0));
2150
+ // NOTE: the begin/end asm comments below work around bug 935!
2151
+ // but they are not enough for gcc>=6 without FMA (bug 1637)
2152
+ #if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
2153
+ #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__("" : [a0] "+x,m"(A0), [a1] "+x,m"(A1));
2154
+ #else
2155
+ #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
2156
+ #endif
2157
+ #define EIGEN_GEBGP_ONESTEP(K) \
2158
+ do { \
2159
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
2160
+ traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
2161
+ traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
2162
+ traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
2163
+ traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
2164
+ traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
2165
+ traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
2166
+ traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
2167
+ traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
2168
+ traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
2169
+ traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
2170
+ traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
2171
+ EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
2172
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
2173
+ } while (false)
2174
+
2175
+ internal::prefetch(blB + (48 + 0));
1754
2176
  EIGEN_GEBGP_ONESTEP(0);
1755
2177
  EIGEN_GEBGP_ONESTEP(1);
1756
2178
  EIGEN_GEBGP_ONESTEP(2);
1757
2179
  EIGEN_GEBGP_ONESTEP(3);
1758
- internal::prefetch(blB+(48+16));
2180
+ internal::prefetch(blB + (48 + 16));
1759
2181
  EIGEN_GEBGP_ONESTEP(4);
1760
2182
  EIGEN_GEBGP_ONESTEP(5);
1761
2183
  EIGEN_GEBGP_ONESTEP(6);
1762
2184
  EIGEN_GEBGP_ONESTEP(7);
1763
2185
 
1764
- blB += pk*4*RhsProgress;
1765
- blA += pk*(2*Traits::LhsProgress);
2186
+ blB += pk * 4 * RhsProgress;
2187
+ blA += pk * (2 * Traits::LhsProgress);
1766
2188
 
1767
2189
  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
1768
2190
  }
1769
2191
  // process remaining peeled loop
1770
- for(Index k=peeled_kc; k<depth; k++)
1771
- {
2192
+ for (Index k = peeled_kc; k < depth; k++) {
1772
2193
  RhsPacketx4 rhs_panel;
1773
2194
  RhsPacket T0;
1774
2195
  EIGEN_GEBGP_ONESTEP(0);
1775
- blB += 4*RhsProgress;
1776
- blA += 2*Traits::LhsProgress;
2196
+ blB += 4 * RhsProgress;
2197
+ blA += 2 * Traits::LhsProgress;
1777
2198
  }
1778
2199
  #undef EIGEN_GEBGP_ONESTEP
1779
2200
 
@@ -1797,24 +2218,22 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1797
2218
  R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1798
2219
  R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1799
2220
  R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1800
- traits.acc(C2, alphav, R0);
1801
- traits.acc(C6, alphav, R1);
1802
- traits.acc(C3, alphav, R2);
1803
- traits.acc(C7, alphav, R3);
2221
+ traits.acc(C2, alphav, R0);
2222
+ traits.acc(C6, alphav, R1);
2223
+ traits.acc(C3, alphav, R2);
2224
+ traits.acc(C7, alphav, R3);
1804
2225
  r2.storePacket(0 * Traits::ResPacketSize, R0);
1805
2226
  r2.storePacket(1 * Traits::ResPacketSize, R1);
1806
2227
  r3.storePacket(0 * Traits::ResPacketSize, R2);
1807
2228
  r3.storePacket(1 * Traits::ResPacketSize, R3);
1808
- }
1809
2229
  }
1810
-
1811
- // Deal with remaining columns of the rhs
1812
- for(Index j2=packet_cols4; j2<cols; j2++)
1813
- {
1814
- for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1815
- {
2230
+ }
2231
+
2232
+ // Deal with remaining columns of the rhs
2233
+ for (Index j2 = packet_cols4; j2 < cols; j2++) {
2234
+ for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
1816
2235
  // One column at a time
1817
- const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
2236
+ const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];
1818
2237
  prefetch(&blA[0]);
1819
2238
 
1820
2239
  // gets res block as register
@@ -1826,26 +2245,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1826
2245
  r0.prefetch(prefetch_res_offset);
1827
2246
 
1828
2247
  // performs "inner" products
1829
- const RhsScalar* blB = &blockB[j2*strideB+offsetB];
2248
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
1830
2249
  LhsPacket A0, A1;
1831
2250
 
1832
- for(Index k=0; k<peeled_kc; k+=pk)
1833
- {
2251
+ for (Index k = 0; k < peeled_kc; k += pk) {
1834
2252
  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
1835
2253
  RhsPacket B_0, B1;
1836
-
1837
- #define EIGEN_GEBGP_ONESTEP(K) \
1838
- do { \
1839
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
1840
- EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1841
- traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1842
- traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1843
- traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1844
- traits.madd(A0, B_0, C0, B1, fix<0>); \
1845
- traits.madd(A1, B_0, C4, B_0, fix<0>); \
1846
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
1847
- } while(false)
1848
-
2254
+
2255
+ #define EIGEN_GEBGP_ONESTEP(K) \
2256
+ do { \
2257
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
2258
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
2259
+ traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
2260
+ traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
2261
+ traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
2262
+ traits.madd(A0, B_0, C0, B1, fix<0>); \
2263
+ traits.madd(A1, B_0, C4, B_0, fix<0>); \
2264
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
2265
+ } while (false)
2266
+
1849
2267
  EIGEN_GEBGP_ONESTEP(0);
1850
2268
  EIGEN_GEBGP_ONESTEP(1);
1851
2269
  EIGEN_GEBGP_ONESTEP(2);
@@ -1862,12 +2280,11 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1862
2280
  }
1863
2281
 
1864
2282
  // process remaining peeled loop
1865
- for(Index k=peeled_kc; k<depth; k++)
1866
- {
2283
+ for (Index k = peeled_kc; k < depth; k++) {
1867
2284
  RhsPacket B_0, B1;
1868
2285
  EIGEN_GEBGP_ONESTEP(0);
1869
2286
  blB += RhsProgress;
1870
- blA += 2*Traits::LhsProgress;
2287
+ blA += 2 * Traits::LhsProgress;
1871
2288
  }
1872
2289
  #undef EIGEN_GEBGP_ONESTEP
1873
2290
  ResPacket R0, R1;
@@ -1879,197 +2296,252 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
1879
2296
  traits.acc(C4, alphav, R1);
1880
2297
  r0.storePacket(0 * Traits::ResPacketSize, R0);
1881
2298
  r0.storePacket(1 * Traits::ResPacketSize, R1);
1882
- }
1883
2299
  }
1884
2300
  }
1885
2301
  }
1886
- //---------- Process 1 * LhsProgress rows at once ----------
1887
- if(mr>=1*Traits::LhsProgress)
1888
- {
1889
- lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
1890
- p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1891
- }
1892
- //---------- Process LhsProgressHalf rows at once ----------
1893
- if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
1894
- {
1895
- lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
1896
- p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1897
- }
1898
- //---------- Process LhsProgressQuarter rows at once ----------
1899
- if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
1900
- {
1901
- lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
1902
- p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1903
- }
1904
- //---------- Process remaining rows, 1 at once ----------
1905
- if(peeled_mc_quarter<rows)
1906
- {
2302
+ }
2303
+ //---------- Process 1 * LhsProgress rows at once ----------
2304
+ if (mr >= 1 * Traits::LhsProgress) {
2305
+ lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket,
2306
+ RhsPacket, ResPacket, Traits, LinearMapper, DataMapper>
2307
+ p;
2308
+ p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset,
2309
+ peeled_kc, pk, cols, depth, packet_cols4);
2310
+ }
2311
+ //---------- Process LhsProgressHalf rows at once ----------
2312
+ if ((LhsProgressHalf < LhsProgress) && mr >= LhsProgressHalf) {
2313
+ lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf,
2314
+ LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper>
2315
+ p;
2316
+ p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset,
2317
+ peeled_kc, pk, cols, depth, packet_cols4);
2318
+ }
2319
+ //---------- Process LhsProgressQuarter rows at once ----------
2320
+ if ((LhsProgressQuarter < LhsProgressHalf) && mr >= LhsProgressQuarter) {
2321
+ lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar,
2322
+ AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter,
2323
+ QuarterTraits, LinearMapper, DataMapper>
2324
+ p;
2325
+ p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB,
2326
+ prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
2327
+ }
2328
+ //---------- Process remaining rows, 1 at once ----------
2329
+ if (peeled_mc_quarter < rows) {
2330
+ #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
2331
+ EIGEN_IF_CONSTEXPR(nr >= 8) {
1907
2332
  // loop on each panel of the rhs
1908
- for(Index j2=0; j2<packet_cols4; j2+=nr)
1909
- {
2333
+ for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
1910
2334
  // loop on each row of the lhs (1*LhsProgress x depth)
1911
- for(Index i=peeled_mc_quarter; i<rows; i+=1)
1912
- {
1913
- const LhsScalar* blA = &blockA[i*strideA+offsetA];
2335
+ for (Index i = peeled_mc_quarter; i < rows; i += 1) {
2336
+ const LhsScalar* blA = &blockA[i * strideA + offsetA];
1914
2337
  prefetch(&blA[0]);
1915
- const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1916
-
1917
- // If LhsProgress is 8 or 16, it assumes that there is a
1918
- // half or quarter packet, respectively, of the same size as
1919
- // nr (which is currently 4) for the return type.
1920
- const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
1921
- const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
1922
- if ((SwappedTraits::LhsProgress % 4) == 0 &&
1923
- (SwappedTraits::LhsProgress<=16) &&
1924
- (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) &&
1925
- (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
1926
- {
1927
- SAccPacket C0, C1, C2, C3;
1928
- straits.initAcc(C0);
1929
- straits.initAcc(C1);
1930
- straits.initAcc(C2);
1931
- straits.initAcc(C3);
1932
-
1933
- const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
1934
- const Index endk = (depth/spk)*spk;
1935
- const Index endk4 = (depth/(spk*4))*(spk*4);
1936
-
1937
- Index k=0;
1938
- for(; k<endk4; k+=4*spk)
1939
- {
1940
- SLhsPacket A0,A1;
1941
- SRhsPacket B_0,B_1;
1942
-
1943
- straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
1944
- straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
1945
-
1946
- straits.loadRhsQuad(blA+0*spk, B_0);
1947
- straits.loadRhsQuad(blA+1*spk, B_1);
1948
- straits.madd(A0,B_0,C0,B_0, fix<0>);
1949
- straits.madd(A1,B_1,C1,B_1, fix<0>);
1950
-
1951
- straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
1952
- straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
1953
- straits.loadRhsQuad(blA+2*spk, B_0);
1954
- straits.loadRhsQuad(blA+3*spk, B_1);
1955
- straits.madd(A0,B_0,C2,B_0, fix<0>);
1956
- straits.madd(A1,B_1,C3,B_1, fix<0>);
1957
-
1958
- blB += 4*SwappedTraits::LhsProgress;
1959
- blA += 4*spk;
1960
- }
1961
- C0 = padd(padd(C0,C1),padd(C2,C3));
1962
- for(; k<endk; k+=spk)
1963
- {
1964
- SLhsPacket A0;
1965
- SRhsPacket B_0;
1966
-
1967
- straits.loadLhsUnaligned(blB, A0);
1968
- straits.loadRhsQuad(blA, B_0);
1969
- straits.madd(A0,B_0,C0,B_0, fix<0>);
1970
-
1971
- blB += SwappedTraits::LhsProgress;
1972
- blA += spk;
1973
- }
1974
- if(SwappedTraits::LhsProgress==8)
1975
- {
1976
- // Special case where we have to first reduce the accumulation register C0
1977
- typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
1978
- typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
1979
- typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
1980
- typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
1981
-
1982
- SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
1983
- SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
1984
-
1985
- if(depth-endk>0)
1986
- {
1987
- // We have to handle the last row of the rhs which corresponds to a half-packet
1988
- SLhsPacketHalf a0;
1989
- SRhsPacketHalf b0;
1990
- straits.loadLhsUnaligned(blB, a0);
1991
- straits.loadRhs(blA, b0);
1992
- SAccPacketHalf c0 = predux_half_dowto4(C0);
1993
- straits.madd(a0,b0,c0,b0, fix<0>);
1994
- straits.acc(c0, alphav, R);
1995
- }
1996
- else
1997
- {
1998
- straits.acc(predux_half_dowto4(C0), alphav, R);
1999
- }
2000
- res.scatterPacket(i, j2, R);
2001
- }
2002
- else if (SwappedTraits::LhsProgress==16)
2003
- {
2004
- // Special case where we have to first reduce the
2005
- // accumulation register C0. We specialize the block in
2006
- // template form, so that LhsProgress < 16 paths don't
2007
- // fail to compile
2008
- last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
2009
- p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0);
2010
- }
2011
- else
2012
- {
2013
- SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
2014
- SResPacket alphav = pset1<SResPacket>(alpha);
2015
- straits.acc(C0, alphav, R);
2016
- res.scatterPacket(i, j2, R);
2017
- }
2018
- }
2019
- else // scalar path
2020
- {
2021
- // get a 1 x 4 res block as registers
2022
- ResScalar C0(0), C1(0), C2(0), C3(0);
2338
+ // gets a 1 x 1 res block as registers
2339
+ ResScalar C0(0), C1(0), C2(0), C3(0), C4(0), C5(0), C6(0), C7(0);
2340
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
2341
+ for (Index k = 0; k < depth; k++) {
2342
+ LhsScalar A0 = blA[k];
2343
+ RhsScalar B_0;
2023
2344
 
2024
- for(Index k=0; k<depth; k++)
2025
- {
2026
- LhsScalar A0;
2027
- RhsScalar B_0, B_1;
2345
+ B_0 = blB[0];
2346
+ C0 = cj.pmadd(A0, B_0, C0);
2028
2347
 
2029
- A0 = blA[k];
2348
+ B_0 = blB[1];
2349
+ C1 = cj.pmadd(A0, B_0, C1);
2030
2350
 
2031
- B_0 = blB[0];
2032
- B_1 = blB[1];
2033
- C0 = cj.pmadd(A0,B_0,C0);
2034
- C1 = cj.pmadd(A0,B_1,C1);
2351
+ B_0 = blB[2];
2352
+ C2 = cj.pmadd(A0, B_0, C2);
2035
2353
 
2036
- B_0 = blB[2];
2037
- B_1 = blB[3];
2038
- C2 = cj.pmadd(A0,B_0,C2);
2039
- C3 = cj.pmadd(A0,B_1,C3);
2354
+ B_0 = blB[3];
2355
+ C3 = cj.pmadd(A0, B_0, C3);
2040
2356
 
2041
- blB += 4;
2042
- }
2043
- res(i, j2 + 0) += alpha * C0;
2044
- res(i, j2 + 1) += alpha * C1;
2045
- res(i, j2 + 2) += alpha * C2;
2046
- res(i, j2 + 3) += alpha * C3;
2357
+ B_0 = blB[4];
2358
+ C4 = cj.pmadd(A0, B_0, C4);
2359
+
2360
+ B_0 = blB[5];
2361
+ C5 = cj.pmadd(A0, B_0, C5);
2362
+
2363
+ B_0 = blB[6];
2364
+ C6 = cj.pmadd(A0, B_0, C6);
2365
+
2366
+ B_0 = blB[7];
2367
+ C7 = cj.pmadd(A0, B_0, C7);
2368
+
2369
+ blB += 8;
2047
2370
  }
2371
+ res(i, j2 + 0) += alpha * C0;
2372
+ res(i, j2 + 1) += alpha * C1;
2373
+ res(i, j2 + 2) += alpha * C2;
2374
+ res(i, j2 + 3) += alpha * C3;
2375
+ res(i, j2 + 4) += alpha * C4;
2376
+ res(i, j2 + 5) += alpha * C5;
2377
+ res(i, j2 + 6) += alpha * C6;
2378
+ res(i, j2 + 7) += alpha * C7;
2048
2379
  }
2049
2380
  }
2050
- // remaining columns
2051
- for(Index j2=packet_cols4; j2<cols; j2++)
2052
- {
2053
- // loop on each row of the lhs (1*LhsProgress x depth)
2054
- for(Index i=peeled_mc_quarter; i<rows; i+=1)
2381
+ }
2382
+ #endif
2383
+
2384
+ for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
2385
+ // loop on each row of the lhs (1*LhsProgress x depth)
2386
+ for (Index i = peeled_mc_quarter; i < rows; i += 1) {
2387
+ const LhsScalar* blA = &blockA[i * strideA + offsetA];
2388
+ prefetch(&blA[0]);
2389
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
2390
+
2391
+ // If LhsProgress is 8 or 16, it assumes that there is a
2392
+ // half or quarter packet, respectively, of the same size as
2393
+ // nr (which is currently 4) for the return type.
2394
+ const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
2395
+ const int SResPacketQuarterSize =
2396
+ unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
2397
+ // The following code assumes we can load SRhsPacket in such a way that
2398
+ // it multiplies blocks of 4 elements in SLhsPacket. This is not the
2399
+ // case for some customized kernels (i.e. NEON fp16). If the assumption
2400
+ // fails, drop down to the scalar path.
2401
+ constexpr bool kCanLoadSRhsQuad =
2402
+ (unpacket_traits<SLhsPacket>::size < 4) ||
2403
+ (unpacket_traits<SRhsPacket>::size % ((std::max<int>)(unpacket_traits<SLhsPacket>::size, 4) / 4)) == 0;
2404
+ if (kCanLoadSRhsQuad && (SwappedTraits::LhsProgress % 4) == 0 && (SwappedTraits::LhsProgress <= 16) &&
2405
+ (SwappedTraits::LhsProgress != 8 || SResPacketHalfSize == nr) &&
2406
+ (SwappedTraits::LhsProgress != 16 || SResPacketQuarterSize == nr)) {
2407
+ SAccPacket C0, C1, C2, C3;
2408
+ straits.initAcc(C0);
2409
+ straits.initAcc(C1);
2410
+ straits.initAcc(C2);
2411
+ straits.initAcc(C3);
2412
+
2413
+ const Index spk = (std::max)(1, SwappedTraits::LhsProgress / 4);
2414
+ const Index endk = (depth / spk) * spk;
2415
+ const Index endk4 = (depth / (spk * 4)) * (spk * 4);
2416
+
2417
+ Index k = 0;
2418
+ for (; k < endk4; k += 4 * spk) {
2419
+ SLhsPacket A0, A1;
2420
+ SRhsPacket B_0, B_1;
2421
+
2422
+ straits.loadLhsUnaligned(blB + 0 * SwappedTraits::LhsProgress, A0);
2423
+ straits.loadLhsUnaligned(blB + 1 * SwappedTraits::LhsProgress, A1);
2424
+
2425
+ straits.loadRhsQuad(blA + 0 * spk, B_0);
2426
+ straits.loadRhsQuad(blA + 1 * spk, B_1);
2427
+ straits.madd(A0, B_0, C0, B_0, fix<0>);
2428
+ straits.madd(A1, B_1, C1, B_1, fix<0>);
2429
+
2430
+ straits.loadLhsUnaligned(blB + 2 * SwappedTraits::LhsProgress, A0);
2431
+ straits.loadLhsUnaligned(blB + 3 * SwappedTraits::LhsProgress, A1);
2432
+ straits.loadRhsQuad(blA + 2 * spk, B_0);
2433
+ straits.loadRhsQuad(blA + 3 * spk, B_1);
2434
+ straits.madd(A0, B_0, C2, B_0, fix<0>);
2435
+ straits.madd(A1, B_1, C3, B_1, fix<0>);
2436
+
2437
+ blB += 4 * SwappedTraits::LhsProgress;
2438
+ blA += 4 * spk;
2439
+ }
2440
+ C0 = padd(padd(C0, C1), padd(C2, C3));
2441
+ for (; k < endk; k += spk) {
2442
+ SLhsPacket A0;
2443
+ SRhsPacket B_0;
2444
+
2445
+ straits.loadLhsUnaligned(blB, A0);
2446
+ straits.loadRhsQuad(blA, B_0);
2447
+ straits.madd(A0, B_0, C0, B_0, fix<0>);
2448
+
2449
+ blB += SwappedTraits::LhsProgress;
2450
+ blA += spk;
2451
+ }
2452
+ if (SwappedTraits::LhsProgress == 8) {
2453
+ // Special case where we have to first reduce the accumulation register C0
2454
+ typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SResPacket>::half,
2455
+ SResPacket>
2456
+ SResPacketHalf;
2457
+ typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SLhsPacket>::half,
2458
+ SLhsPacket>
2459
+ SLhsPacketHalf;
2460
+ typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SRhsPacket>::half,
2461
+ SRhsPacket>
2462
+ SRhsPacketHalf;
2463
+ typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SAccPacket>::half,
2464
+ SAccPacket>
2465
+ SAccPacketHalf;
2466
+
2467
+ SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
2468
+ SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
2469
+
2470
+ if (depth - endk > 0) {
2471
+ // We have to handle the last row of the rhs which corresponds to a half-packet
2472
+ SLhsPacketHalf a0;
2473
+ SRhsPacketHalf b0;
2474
+ straits.loadLhsUnaligned(blB, a0);
2475
+ straits.loadRhs(blA, b0);
2476
+ SAccPacketHalf c0 = predux_half_dowto4(C0);
2477
+ straits.madd(a0, b0, c0, b0, fix<0>);
2478
+ straits.acc(c0, alphav, R);
2479
+ } else {
2480
+ straits.acc(predux_half_dowto4(C0), alphav, R);
2481
+ }
2482
+ res.scatterPacket(i, j2, R);
2483
+ } else if (SwappedTraits::LhsProgress == 16) {
2484
+ // Special case where we have to first reduce the
2485
+ // accumulation register C0. We specialize the block in
2486
+ // template form, so that LhsProgress < 16 paths don't
2487
+ // fail to compile
2488
+ last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
2489
+ p(res, straits, blA, blB, depth, endk, i, j2, alpha, C0);
2490
+ } else {
2491
+ SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
2492
+ SResPacket alphav = pset1<SResPacket>(alpha);
2493
+ straits.acc(C0, alphav, R);
2494
+ res.scatterPacket(i, j2, R);
2495
+ }
2496
+ } else // scalar path
2055
2497
  {
2056
- const LhsScalar* blA = &blockA[i*strideA+offsetA];
2057
- prefetch(&blA[0]);
2058
- // gets a 1 x 1 res block as registers
2059
- ResScalar C0(0);
2060
- const RhsScalar* blB = &blockB[j2*strideB+offsetB];
2061
- for(Index k=0; k<depth; k++)
2062
- {
2063
- LhsScalar A0 = blA[k];
2064
- RhsScalar B_0 = blB[k];
2498
+ // get a 1 x 4 res block as registers
2499
+ ResScalar C0(0), C1(0), C2(0), C3(0);
2500
+
2501
+ for (Index k = 0; k < depth; k++) {
2502
+ LhsScalar A0;
2503
+ RhsScalar B_0, B_1;
2504
+
2505
+ A0 = blA[k];
2506
+
2507
+ B_0 = blB[0];
2508
+ B_1 = blB[1];
2065
2509
  C0 = cj.pmadd(A0, B_0, C0);
2510
+ C1 = cj.pmadd(A0, B_1, C1);
2511
+
2512
+ B_0 = blB[2];
2513
+ B_1 = blB[3];
2514
+ C2 = cj.pmadd(A0, B_0, C2);
2515
+ C3 = cj.pmadd(A0, B_1, C3);
2516
+
2517
+ blB += 4;
2066
2518
  }
2067
- res(i, j2) += alpha * C0;
2519
+ res(i, j2 + 0) += alpha * C0;
2520
+ res(i, j2 + 1) += alpha * C1;
2521
+ res(i, j2 + 2) += alpha * C2;
2522
+ res(i, j2 + 3) += alpha * C3;
2068
2523
  }
2069
2524
  }
2070
2525
  }
2526
+ // remaining columns
2527
+ for (Index j2 = packet_cols4; j2 < cols; j2++) {
2528
+ // loop on each row of the lhs (1*LhsProgress x depth)
2529
+ for (Index i = peeled_mc_quarter; i < rows; i += 1) {
2530
+ const LhsScalar* blA = &blockA[i * strideA + offsetA];
2531
+ prefetch(&blA[0]);
2532
+ // gets a 1 x 1 res block as registers
2533
+ ResScalar C0(0);
2534
+ const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
2535
+ for (Index k = 0; k < depth; k++) {
2536
+ LhsScalar A0 = blA[k];
2537
+ RhsScalar B_0 = blB[k];
2538
+ C0 = cj.pmadd(A0, B_0, C0);
2539
+ }
2540
+ res(i, j2) += alpha * C0;
2541
+ }
2542
+ }
2071
2543
  }
2072
-
2544
+ }
2073
2545
 
2074
2546
  // pack a block of the lhs
2075
2547
  // The traversal is as follow (mr==4):
@@ -2085,131 +2557,129 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
2085
2557
  //
2086
2558
  // 32 33 34 35 ...
2087
2559
  // 36 36 38 39 ...
2088
- template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2089
- struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2090
- {
2560
+ template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
2561
+ bool PanelMode>
2562
+ struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> {
2091
2563
  typedef typename DataMapper::LinearMapper LinearMapper;
2092
- EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2564
+ EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
2565
+ Index offset = 0);
2093
2566
  };
2094
2567
 
2095
- template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2096
- EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2097
- ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2098
- {
2568
+ template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
2569
+ bool PanelMode>
2570
+ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate,
2571
+ PanelMode>::operator()(Scalar* blockA, const DataMapper& lhs, Index depth,
2572
+ Index rows, Index stride, Index offset) {
2099
2573
  typedef typename unpacket_traits<Packet>::half HalfPacket;
2100
2574
  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2101
- enum { PacketSize = unpacket_traits<Packet>::size,
2102
- HalfPacketSize = unpacket_traits<HalfPacket>::size,
2103
- QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2104
- HasHalf = (int)HalfPacketSize < (int)PacketSize,
2105
- HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
2575
+ enum {
2576
+ PacketSize = unpacket_traits<Packet>::size,
2577
+ HalfPacketSize = unpacket_traits<HalfPacket>::size,
2578
+ QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2579
+ HasHalf = (int)HalfPacketSize < (int)PacketSize,
2580
+ HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize
2581
+ };
2106
2582
 
2107
2583
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
2108
2584
  EIGEN_UNUSED_VARIABLE(stride);
2109
2585
  EIGEN_UNUSED_VARIABLE(offset);
2110
- eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2111
- eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
2586
+ eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
2587
+ eigen_assert(((Pack1 % PacketSize) == 0 && Pack1 <= 4 * PacketSize) || (Pack1 <= 4));
2112
2588
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2113
2589
  Index count = 0;
2114
2590
 
2115
- const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
2116
- const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
2117
- const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
2118
- const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
2119
- const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
2591
+ const Index peeled_mc3 = Pack1 >= 3 * PacketSize ? (rows / (3 * PacketSize)) * (3 * PacketSize) : 0;
2592
+ const Index peeled_mc2 =
2593
+ Pack1 >= 2 * PacketSize ? peeled_mc3 + ((rows - peeled_mc3) / (2 * PacketSize)) * (2 * PacketSize) : 0;
2594
+ const Index peeled_mc1 =
2595
+ Pack1 >= 1 * PacketSize ? peeled_mc2 + ((rows - peeled_mc2) / (1 * PacketSize)) * (1 * PacketSize) : 0;
2596
+ const Index peeled_mc_half =
2597
+ Pack1 >= HalfPacketSize ? peeled_mc1 + ((rows - peeled_mc1) / (HalfPacketSize)) * (HalfPacketSize) : 0;
2598
+ const Index peeled_mc_quarter = Pack1 >= QuarterPacketSize ? (rows / (QuarterPacketSize)) * (QuarterPacketSize) : 0;
2120
2599
  const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
2121
- const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
2122
- : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;
2600
+ const Index peeled_mc0 = Pack2 >= PacketSize ? peeled_mc_quarter
2601
+ : Pack2 > 1 && last_lhs_progress ? (rows / last_lhs_progress) * last_lhs_progress
2602
+ : 0;
2123
2603
 
2124
- Index i=0;
2604
+ Index i = 0;
2125
2605
 
2126
2606
  // Pack 3 packets
2127
- if(Pack1>=3*PacketSize)
2128
- {
2129
- for(; i<peeled_mc3; i+=3*PacketSize)
2130
- {
2131
- if(PanelMode) count += (3*PacketSize) * offset;
2607
+ if (Pack1 >= 3 * PacketSize) {
2608
+ for (; i < peeled_mc3; i += 3 * PacketSize) {
2609
+ if (PanelMode) count += (3 * PacketSize) * offset;
2132
2610
 
2133
- for(Index k=0; k<depth; k++)
2134
- {
2611
+ for (Index k = 0; k < depth; k++) {
2135
2612
  Packet A, B, C;
2136
- A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2137
- B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
2138
- C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
2139
- pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
2140
- pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
2141
- pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
2613
+ A = lhs.template loadPacket<Packet>(i + 0 * PacketSize, k);
2614
+ B = lhs.template loadPacket<Packet>(i + 1 * PacketSize, k);
2615
+ C = lhs.template loadPacket<Packet>(i + 2 * PacketSize, k);
2616
+ pstore(blockA + count, cj.pconj(A));
2617
+ count += PacketSize;
2618
+ pstore(blockA + count, cj.pconj(B));
2619
+ count += PacketSize;
2620
+ pstore(blockA + count, cj.pconj(C));
2621
+ count += PacketSize;
2142
2622
  }
2143
- if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
2623
+ if (PanelMode) count += (3 * PacketSize) * (stride - offset - depth);
2144
2624
  }
2145
2625
  }
2146
2626
  // Pack 2 packets
2147
- if(Pack1>=2*PacketSize)
2148
- {
2149
- for(; i<peeled_mc2; i+=2*PacketSize)
2150
- {
2151
- if(PanelMode) count += (2*PacketSize) * offset;
2627
+ if (Pack1 >= 2 * PacketSize) {
2628
+ for (; i < peeled_mc2; i += 2 * PacketSize) {
2629
+ if (PanelMode) count += (2 * PacketSize) * offset;
2152
2630
 
2153
- for(Index k=0; k<depth; k++)
2154
- {
2631
+ for (Index k = 0; k < depth; k++) {
2155
2632
  Packet A, B;
2156
- A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2157
- B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
2158
- pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
2159
- pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
2633
+ A = lhs.template loadPacket<Packet>(i + 0 * PacketSize, k);
2634
+ B = lhs.template loadPacket<Packet>(i + 1 * PacketSize, k);
2635
+ pstore(blockA + count, cj.pconj(A));
2636
+ count += PacketSize;
2637
+ pstore(blockA + count, cj.pconj(B));
2638
+ count += PacketSize;
2160
2639
  }
2161
- if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
2640
+ if (PanelMode) count += (2 * PacketSize) * (stride - offset - depth);
2162
2641
  }
2163
2642
  }
2164
2643
  // Pack 1 packets
2165
- if(Pack1>=1*PacketSize)
2166
- {
2167
- for(; i<peeled_mc1; i+=1*PacketSize)
2168
- {
2169
- if(PanelMode) count += (1*PacketSize) * offset;
2644
+ if (Pack1 >= 1 * PacketSize) {
2645
+ for (; i < peeled_mc1; i += 1 * PacketSize) {
2646
+ if (PanelMode) count += (1 * PacketSize) * offset;
2170
2647
 
2171
- for(Index k=0; k<depth; k++)
2172
- {
2648
+ for (Index k = 0; k < depth; k++) {
2173
2649
  Packet A;
2174
- A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2175
- pstore(blockA+count, cj.pconj(A));
2176
- count+=PacketSize;
2650
+ A = lhs.template loadPacket<Packet>(i + 0 * PacketSize, k);
2651
+ pstore(blockA + count, cj.pconj(A));
2652
+ count += PacketSize;
2177
2653
  }
2178
- if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
2654
+ if (PanelMode) count += (1 * PacketSize) * (stride - offset - depth);
2179
2655
  }
2180
2656
  }
2181
2657
  // Pack half packets
2182
- if(HasHalf && Pack1>=HalfPacketSize)
2183
- {
2184
- for(; i<peeled_mc_half; i+=HalfPacketSize)
2185
- {
2186
- if(PanelMode) count += (HalfPacketSize) * offset;
2658
+ if (HasHalf && Pack1 >= HalfPacketSize) {
2659
+ for (; i < peeled_mc_half; i += HalfPacketSize) {
2660
+ if (PanelMode) count += (HalfPacketSize)*offset;
2187
2661
 
2188
- for(Index k=0; k<depth; k++)
2189
- {
2662
+ for (Index k = 0; k < depth; k++) {
2190
2663
  HalfPacket A;
2191
- A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
2192
- pstoreu(blockA+count, cj.pconj(A));
2193
- count+=HalfPacketSize;
2664
+ A = lhs.template loadPacket<HalfPacket>(i + 0 * (HalfPacketSize), k);
2665
+ pstoreu(blockA + count, cj.pconj(A));
2666
+ count += HalfPacketSize;
2194
2667
  }
2195
- if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
2668
+ if (PanelMode) count += (HalfPacketSize) * (stride - offset - depth);
2196
2669
  }
2197
2670
  }
2198
2671
  // Pack quarter packets
2199
- if(HasQuarter && Pack1>=QuarterPacketSize)
2200
- {
2201
- for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
2202
- {
2203
- if(PanelMode) count += (QuarterPacketSize) * offset;
2672
+ if (HasQuarter && Pack1 >= QuarterPacketSize) {
2673
+ for (; i < peeled_mc_quarter; i += QuarterPacketSize) {
2674
+ if (PanelMode) count += (QuarterPacketSize)*offset;
2204
2675
 
2205
- for(Index k=0; k<depth; k++)
2206
- {
2676
+ for (Index k = 0; k < depth; k++) {
2207
2677
  QuarterPacket A;
2208
- A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
2209
- pstoreu(blockA+count, cj.pconj(A));
2210
- count+=QuarterPacketSize;
2678
+ A = lhs.template loadPacket<QuarterPacket>(i + 0 * (QuarterPacketSize), k);
2679
+ pstoreu(blockA + count, cj.pconj(A));
2680
+ count += QuarterPacketSize;
2211
2681
  }
2212
- if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
2682
+ if (PanelMode) count += (QuarterPacketSize) * (stride - offset - depth);
2213
2683
  }
2214
2684
  }
2215
2685
  // Pack2 may be *smaller* than PacketSize—that happens for
@@ -2218,128 +2688,118 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
2218
2688
  // address both real & imaginary parts on the rhs. This portion will
2219
2689
  // pack those half ones until they match the number expected on the
2220
2690
  // last peeling loop at this point (for the rhs).
2221
- if(Pack2<PacketSize && Pack2>1)
2222
- {
2223
- for(; i<peeled_mc0; i+=last_lhs_progress)
2224
- {
2225
- if(PanelMode) count += last_lhs_progress * offset;
2691
+ if (Pack2 < PacketSize && Pack2 > 1) {
2692
+ for (; i < peeled_mc0; i += last_lhs_progress) {
2693
+ if (PanelMode) count += last_lhs_progress * offset;
2226
2694
 
2227
- for(Index k=0; k<depth; k++)
2228
- for(Index w=0; w<last_lhs_progress; w++)
2229
- blockA[count++] = cj(lhs(i+w, k));
2695
+ for (Index k = 0; k < depth; k++)
2696
+ for (Index w = 0; w < last_lhs_progress; w++) blockA[count++] = cj(lhs(i + w, k));
2230
2697
 
2231
- if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
2698
+ if (PanelMode) count += last_lhs_progress * (stride - offset - depth);
2232
2699
  }
2233
2700
  }
2234
2701
  // Pack scalars
2235
- for(; i<rows; i++)
2236
- {
2237
- if(PanelMode) count += offset;
2238
- for(Index k=0; k<depth; k++)
2239
- blockA[count++] = cj(lhs(i, k));
2240
- if(PanelMode) count += (stride-offset-depth);
2702
+ for (; i < rows; i++) {
2703
+ if (PanelMode) count += offset;
2704
+ for (Index k = 0; k < depth; k++) blockA[count++] = cj(lhs(i, k));
2705
+ if (PanelMode) count += (stride - offset - depth);
2241
2706
  }
2242
2707
  }
2243
2708
 
2244
- template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2245
- struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2246
- {
2709
+ template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
2710
+ bool PanelMode>
2711
+ struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> {
2247
2712
  typedef typename DataMapper::LinearMapper LinearMapper;
2248
- EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2713
+ EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
2714
+ Index offset = 0);
2249
2715
  };
2250
2716
 
2251
- template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2252
- EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2253
- ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2254
- {
2717
+ template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
2718
+ bool PanelMode>
2719
+ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate,
2720
+ PanelMode>::operator()(Scalar* blockA, const DataMapper& lhs, Index depth,
2721
+ Index rows, Index stride, Index offset) {
2255
2722
  typedef typename unpacket_traits<Packet>::half HalfPacket;
2256
2723
  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2257
- enum { PacketSize = unpacket_traits<Packet>::size,
2258
- HalfPacketSize = unpacket_traits<HalfPacket>::size,
2259
- QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2260
- HasHalf = (int)HalfPacketSize < (int)PacketSize,
2261
- HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
2724
+ enum {
2725
+ PacketSize = unpacket_traits<Packet>::size,
2726
+ HalfPacketSize = unpacket_traits<HalfPacket>::size,
2727
+ QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2728
+ HasHalf = (int)HalfPacketSize < (int)PacketSize,
2729
+ HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize
2730
+ };
2262
2731
 
2263
2732
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
2264
2733
  EIGEN_UNUSED_VARIABLE(stride);
2265
2734
  EIGEN_UNUSED_VARIABLE(offset);
2266
- eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2735
+ eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
2267
2736
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2268
2737
  Index count = 0;
2269
2738
  bool gone_half = false, gone_quarter = false, gone_last = false;
2270
2739
 
2271
2740
  Index i = 0;
2272
- int pack = Pack1;
2273
- int psize = PacketSize;
2274
- while(pack>0)
2275
- {
2276
- Index remaining_rows = rows-i;
2277
- Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
2741
+ Index pack = Pack1;
2742
+ Index psize = PacketSize;
2743
+ while (pack > 0) {
2744
+ Index remaining_rows = rows - i;
2745
+ Index peeled_mc = gone_last ? Pack2 > 1 ? (rows / pack) * pack : 0 : i + (remaining_rows / pack) * pack;
2278
2746
  Index starting_pos = i;
2279
- for(; i<peeled_mc; i+=pack)
2280
- {
2281
- if(PanelMode) count += pack * offset;
2282
-
2283
- Index k=0;
2284
- if(pack>=psize && psize >= QuarterPacketSize)
2285
- {
2286
- const Index peeled_k = (depth/psize)*psize;
2287
- for(; k<peeled_k; k+=psize)
2288
- {
2289
- for (Index m = 0; m < pack; m += psize)
2290
- {
2747
+ for (; i < peeled_mc; i += pack) {
2748
+ if (PanelMode) count += pack * offset;
2749
+
2750
+ Index k = 0;
2751
+ if (pack >= psize && psize >= QuarterPacketSize) {
2752
+ const Index peeled_k = (depth / psize) * psize;
2753
+ for (; k < peeled_k; k += psize) {
2754
+ for (Index m = 0; m < pack; m += psize) {
2291
2755
  if (psize == PacketSize) {
2292
2756
  PacketBlock<Packet> kernel;
2293
- for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
2757
+ for (Index p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i + p + m, k);
2294
2758
  ptranspose(kernel);
2295
- for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
2759
+ for (Index p = 0; p < psize; ++p) pstore(blockA + count + m + (pack)*p, cj.pconj(kernel.packet[p]));
2296
2760
  } else if (HasHalf && psize == HalfPacketSize) {
2297
2761
  gone_half = true;
2298
2762
  PacketBlock<HalfPacket> kernel_half;
2299
- for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
2763
+ for (Index p = 0; p < psize; ++p)
2764
+ kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i + p + m, k);
2300
2765
  ptranspose(kernel_half);
2301
- for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
2766
+ for (Index p = 0; p < psize; ++p) pstore(blockA + count + m + (pack)*p, cj.pconj(kernel_half.packet[p]));
2302
2767
  } else if (HasQuarter && psize == QuarterPacketSize) {
2303
2768
  gone_quarter = true;
2304
2769
  PacketBlock<QuarterPacket> kernel_quarter;
2305
- for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
2770
+ for (Index p = 0; p < psize; ++p)
2771
+ kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i + p + m, k);
2306
2772
  ptranspose(kernel_quarter);
2307
- for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
2308
- }
2773
+ for (Index p = 0; p < psize; ++p)
2774
+ pstore(blockA + count + m + (pack)*p, cj.pconj(kernel_quarter.packet[p]));
2775
+ }
2309
2776
  }
2310
- count += psize*pack;
2777
+ count += psize * pack;
2311
2778
  }
2312
2779
  }
2313
2780
 
2314
- for(; k<depth; k++)
2315
- {
2316
- Index w=0;
2317
- for(; w<pack-3; w+=4)
2318
- {
2319
- Scalar a(cj(lhs(i+w+0, k))),
2320
- b(cj(lhs(i+w+1, k))),
2321
- c(cj(lhs(i+w+2, k))),
2322
- d(cj(lhs(i+w+3, k)));
2781
+ for (; k < depth; k++) {
2782
+ Index w = 0;
2783
+ for (; w < pack - 3; w += 4) {
2784
+ Scalar a(cj(lhs(i + w + 0, k))), b(cj(lhs(i + w + 1, k))), c(cj(lhs(i + w + 2, k))), d(cj(lhs(i + w + 3, k)));
2323
2785
  blockA[count++] = a;
2324
2786
  blockA[count++] = b;
2325
2787
  blockA[count++] = c;
2326
2788
  blockA[count++] = d;
2327
2789
  }
2328
- if(pack%4)
2329
- for(;w<pack;++w)
2330
- blockA[count++] = cj(lhs(i+w, k));
2790
+ if (pack % 4)
2791
+ for (; w < pack; ++w) blockA[count++] = cj(lhs(i + w, k));
2331
2792
  }
2332
2793
 
2333
- if(PanelMode) count += pack * (stride-offset-depth);
2794
+ if (PanelMode) count += pack * (stride - offset - depth);
2334
2795
  }
2335
2796
 
2336
2797
  pack -= psize;
2337
2798
  Index left = rows - i;
2338
2799
  if (pack <= 0) {
2339
- if (!gone_last &&
2340
- (starting_pos == i || left >= psize/2 || left >= psize/4) &&
2341
- ((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
2342
- (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
2800
+ if (!gone_last && (starting_pos == i || left >= psize / 2 || left >= psize / 4) &&
2801
+ ((psize / 2 == HalfPacketSize && HasHalf && !gone_half) ||
2802
+ (psize / 2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
2343
2803
  psize /= 2;
2344
2804
  pack = psize;
2345
2805
  continue;
@@ -2357,12 +2817,10 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
2357
2817
  }
2358
2818
  }
2359
2819
 
2360
- for(; i<rows; i++)
2361
- {
2362
- if(PanelMode) count += offset;
2363
- for(Index k=0; k<depth; k++)
2364
- blockA[count++] = cj(lhs(i, k));
2365
- if(PanelMode) count += (stride-offset-depth);
2820
+ for (; i < rows; i++) {
2821
+ if (PanelMode) count += offset;
2822
+ for (Index k = 0; k < depth; k++) blockA[count++] = cj(lhs(i, k));
2823
+ if (PanelMode) count += (stride - offset - depth);
2366
2824
  }
2367
2825
  }
2368
2826
 
@@ -2373,273 +2831,323 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
2373
2831
  // 4 5 6 7 16 17 18 19 25 28
2374
2832
  // 8 9 10 11 20 21 22 23 26 29
2375
2833
  // . . . . . . . . . .
2376
- template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2377
- struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2378
- {
2834
+ template <typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2835
+ struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
2379
2836
  typedef typename packet_traits<Scalar>::type Packet;
2380
2837
  typedef typename DataMapper::LinearMapper LinearMapper;
2381
2838
  enum { PacketSize = packet_traits<Scalar>::size };
2382
- EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2839
+ EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
2840
+ Index offset = 0);
2383
2841
  };
2384
2842
 
2385
- template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2386
- EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2387
- ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2388
- {
2843
+ template <typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2844
+ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::operator()(
2845
+ Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
2389
2846
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
2390
2847
  EIGEN_UNUSED_VARIABLE(stride);
2391
2848
  EIGEN_UNUSED_VARIABLE(offset);
2392
- eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2849
+ eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
2393
2850
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2394
- Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2395
- Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2851
+ Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
2852
+ Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
2396
2853
  Index count = 0;
2397
- const Index peeled_k = (depth/PacketSize)*PacketSize;
2398
- // if(nr>=8)
2399
- // {
2400
- // for(Index j2=0; j2<packet_cols8; j2+=8)
2401
- // {
2402
- // // skip what we have before
2403
- // if(PanelMode) count += 8 * offset;
2404
- // const Scalar* b0 = &rhs[(j2+0)*rhsStride];
2405
- // const Scalar* b1 = &rhs[(j2+1)*rhsStride];
2406
- // const Scalar* b2 = &rhs[(j2+2)*rhsStride];
2407
- // const Scalar* b3 = &rhs[(j2+3)*rhsStride];
2408
- // const Scalar* b4 = &rhs[(j2+4)*rhsStride];
2409
- // const Scalar* b5 = &rhs[(j2+5)*rhsStride];
2410
- // const Scalar* b6 = &rhs[(j2+6)*rhsStride];
2411
- // const Scalar* b7 = &rhs[(j2+7)*rhsStride];
2412
- // Index k=0;
2413
- // if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
2414
- // {
2415
- // for(; k<peeled_k; k+=PacketSize) {
2416
- // PacketBlock<Packet> kernel;
2417
- // for (int p = 0; p < PacketSize; ++p) {
2418
- // kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
2419
- // }
2420
- // ptranspose(kernel);
2421
- // for (int p = 0; p < PacketSize; ++p) {
2422
- // pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
2423
- // count+=PacketSize;
2424
- // }
2425
- // }
2426
- // }
2427
- // for(; k<depth; k++)
2428
- // {
2429
- // blockB[count+0] = cj(b0[k]);
2430
- // blockB[count+1] = cj(b1[k]);
2431
- // blockB[count+2] = cj(b2[k]);
2432
- // blockB[count+3] = cj(b3[k]);
2433
- // blockB[count+4] = cj(b4[k]);
2434
- // blockB[count+5] = cj(b5[k]);
2435
- // blockB[count+6] = cj(b6[k]);
2436
- // blockB[count+7] = cj(b7[k]);
2437
- // count += 8;
2438
- // }
2439
- // // skip what we have after
2440
- // if(PanelMode) count += 8 * (stride-offset-depth);
2441
- // }
2442
- // }
2443
-
2444
- if(nr>=4)
2445
- {
2446
- for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2447
- {
2854
+ const Index peeled_k = (depth / PacketSize) * PacketSize;
2855
+
2856
+ #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
2857
+ EIGEN_IF_CONSTEXPR(nr >= 8) {
2858
+ for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
2859
+ // skip what we have before
2860
+ if (PanelMode) count += 8 * offset;
2861
+ const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
2862
+ const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
2863
+ const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
2864
+ const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
2865
+ const LinearMapper dm4 = rhs.getLinearMapper(0, j2 + 4);
2866
+ const LinearMapper dm5 = rhs.getLinearMapper(0, j2 + 5);
2867
+ const LinearMapper dm6 = rhs.getLinearMapper(0, j2 + 6);
2868
+ const LinearMapper dm7 = rhs.getLinearMapper(0, j2 + 7);
2869
+ Index k = 0;
2870
+ if (PacketSize % 2 == 0 && PacketSize <= 8) // 2 4 8
2871
+ {
2872
+ for (; k < peeled_k; k += PacketSize) {
2873
+ if (PacketSize == 2) {
2874
+ PacketBlock<Packet, PacketSize == 2 ? 2 : PacketSize> kernel0, kernel1, kernel2, kernel3;
2875
+ kernel0.packet[0 % PacketSize] = dm0.template loadPacket<Packet>(k);
2876
+ kernel0.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
2877
+ kernel1.packet[0 % PacketSize] = dm2.template loadPacket<Packet>(k);
2878
+ kernel1.packet[1 % PacketSize] = dm3.template loadPacket<Packet>(k);
2879
+ kernel2.packet[0 % PacketSize] = dm4.template loadPacket<Packet>(k);
2880
+ kernel2.packet[1 % PacketSize] = dm5.template loadPacket<Packet>(k);
2881
+ kernel3.packet[0 % PacketSize] = dm6.template loadPacket<Packet>(k);
2882
+ kernel3.packet[1 % PacketSize] = dm7.template loadPacket<Packet>(k);
2883
+ ptranspose(kernel0);
2884
+ ptranspose(kernel1);
2885
+ ptranspose(kernel2);
2886
+ ptranspose(kernel3);
2887
+
2888
+ pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel0.packet[0 % PacketSize]));
2889
+ pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel1.packet[0 % PacketSize]));
2890
+ pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel2.packet[0 % PacketSize]));
2891
+ pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel3.packet[0 % PacketSize]));
2892
+
2893
+ pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel0.packet[1 % PacketSize]));
2894
+ pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel1.packet[1 % PacketSize]));
2895
+ pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel2.packet[1 % PacketSize]));
2896
+ pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel3.packet[1 % PacketSize]));
2897
+ count += 8 * PacketSize;
2898
+ } else if (PacketSize == 4) {
2899
+ PacketBlock<Packet, PacketSize == 4 ? 4 : PacketSize> kernel0, kernel1;
2900
+
2901
+ kernel0.packet[0 % PacketSize] = dm0.template loadPacket<Packet>(k);
2902
+ kernel0.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
2903
+ kernel0.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
2904
+ kernel0.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
2905
+ kernel1.packet[0 % PacketSize] = dm4.template loadPacket<Packet>(k);
2906
+ kernel1.packet[1 % PacketSize] = dm5.template loadPacket<Packet>(k);
2907
+ kernel1.packet[2 % PacketSize] = dm6.template loadPacket<Packet>(k);
2908
+ kernel1.packet[3 % PacketSize] = dm7.template loadPacket<Packet>(k);
2909
+ ptranspose(kernel0);
2910
+ ptranspose(kernel1);
2911
+
2912
+ pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel0.packet[0 % PacketSize]));
2913
+ pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel1.packet[0 % PacketSize]));
2914
+ pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel0.packet[1 % PacketSize]));
2915
+ pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel1.packet[1 % PacketSize]));
2916
+ pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel0.packet[2 % PacketSize]));
2917
+ pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel1.packet[2 % PacketSize]));
2918
+ pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel0.packet[3 % PacketSize]));
2919
+ pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel1.packet[3 % PacketSize]));
2920
+ count += 8 * PacketSize;
2921
+ } else if (PacketSize == 8) {
2922
+ PacketBlock<Packet, PacketSize == 8 ? 8 : PacketSize> kernel0;
2923
+
2924
+ kernel0.packet[0 % PacketSize] = dm0.template loadPacket<Packet>(k);
2925
+ kernel0.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
2926
+ kernel0.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
2927
+ kernel0.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
2928
+ kernel0.packet[4 % PacketSize] = dm4.template loadPacket<Packet>(k);
2929
+ kernel0.packet[5 % PacketSize] = dm5.template loadPacket<Packet>(k);
2930
+ kernel0.packet[6 % PacketSize] = dm6.template loadPacket<Packet>(k);
2931
+ kernel0.packet[7 % PacketSize] = dm7.template loadPacket<Packet>(k);
2932
+ ptranspose(kernel0);
2933
+
2934
+ pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel0.packet[0 % PacketSize]));
2935
+ pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel0.packet[1 % PacketSize]));
2936
+ pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel0.packet[2 % PacketSize]));
2937
+ pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel0.packet[3 % PacketSize]));
2938
+ pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel0.packet[4 % PacketSize]));
2939
+ pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel0.packet[5 % PacketSize]));
2940
+ pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel0.packet[6 % PacketSize]));
2941
+ pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel0.packet[7 % PacketSize]));
2942
+ count += 8 * PacketSize;
2943
+ }
2944
+ }
2945
+ }
2946
+
2947
+ for (; k < depth; k++) {
2948
+ blockB[count + 0] = cj(dm0(k));
2949
+ blockB[count + 1] = cj(dm1(k));
2950
+ blockB[count + 2] = cj(dm2(k));
2951
+ blockB[count + 3] = cj(dm3(k));
2952
+ blockB[count + 4] = cj(dm4(k));
2953
+ blockB[count + 5] = cj(dm5(k));
2954
+ blockB[count + 6] = cj(dm6(k));
2955
+ blockB[count + 7] = cj(dm7(k));
2956
+ count += 8;
2957
+ }
2958
+ // skip what we have after
2959
+ if (PanelMode) count += 8 * (stride - offset - depth);
2960
+ }
2961
+ }
2962
+ #endif
2963
+
2964
+ EIGEN_IF_CONSTEXPR(nr >= 4) {
2965
+ for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
2448
2966
  // skip what we have before
2449
- if(PanelMode) count += 4 * offset;
2967
+ if (PanelMode) count += 4 * offset;
2450
2968
  const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
2451
2969
  const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
2452
2970
  const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
2453
2971
  const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
2454
2972
 
2455
- Index k=0;
2456
- if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
2973
+ Index k = 0;
2974
+ if ((PacketSize % 4) == 0) // TODO enable vectorized transposition for PacketSize==2 ??
2457
2975
  {
2458
- for(; k<peeled_k; k+=PacketSize) {
2459
- PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
2460
- kernel.packet[0 ] = dm0.template loadPacket<Packet>(k);
2461
- kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
2462
- kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
2463
- kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
2976
+ for (; k < peeled_k; k += PacketSize) {
2977
+ PacketBlock<Packet, (PacketSize % 4) == 0 ? 4 : PacketSize> kernel;
2978
+ kernel.packet[0] = dm0.template loadPacket<Packet>(k);
2979
+ kernel.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
2980
+ kernel.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
2981
+ kernel.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
2464
2982
  ptranspose(kernel);
2465
- pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
2466
- pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
2467
- pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
2468
- pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
2469
- count+=4*PacketSize;
2983
+ pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel.packet[0]));
2984
+ pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel.packet[1 % PacketSize]));
2985
+ pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel.packet[2 % PacketSize]));
2986
+ pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel.packet[3 % PacketSize]));
2987
+ count += 4 * PacketSize;
2470
2988
  }
2471
2989
  }
2472
- for(; k<depth; k++)
2473
- {
2474
- blockB[count+0] = cj(dm0(k));
2475
- blockB[count+1] = cj(dm1(k));
2476
- blockB[count+2] = cj(dm2(k));
2477
- blockB[count+3] = cj(dm3(k));
2990
+ for (; k < depth; k++) {
2991
+ blockB[count + 0] = cj(dm0(k));
2992
+ blockB[count + 1] = cj(dm1(k));
2993
+ blockB[count + 2] = cj(dm2(k));
2994
+ blockB[count + 3] = cj(dm3(k));
2478
2995
  count += 4;
2479
2996
  }
2480
2997
  // skip what we have after
2481
- if(PanelMode) count += 4 * (stride-offset-depth);
2998
+ if (PanelMode) count += 4 * (stride - offset - depth);
2482
2999
  }
2483
3000
  }
2484
3001
 
2485
3002
  // copy the remaining columns one at a time (nr==1)
2486
- for(Index j2=packet_cols4; j2<cols; ++j2)
2487
- {
2488
- if(PanelMode) count += offset;
3003
+ for (Index j2 = packet_cols4; j2 < cols; ++j2) {
3004
+ if (PanelMode) count += offset;
2489
3005
  const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
2490
- for(Index k=0; k<depth; k++)
2491
- {
3006
+ for (Index k = 0; k < depth; k++) {
2492
3007
  blockB[count] = cj(dm0(k));
2493
3008
  count += 1;
2494
3009
  }
2495
- if(PanelMode) count += (stride-offset-depth);
3010
+ if (PanelMode) count += (stride - offset - depth);
2496
3011
  }
2497
3012
  }
2498
3013
 
2499
3014
  // this version is optimized for row major matrices
2500
- template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2501
- struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2502
- {
3015
+ template <typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3016
+ struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> {
2503
3017
  typedef typename packet_traits<Scalar>::type Packet;
2504
3018
  typedef typename unpacket_traits<Packet>::half HalfPacket;
2505
3019
  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2506
3020
  typedef typename DataMapper::LinearMapper LinearMapper;
2507
- enum { PacketSize = packet_traits<Scalar>::size,
2508
- HalfPacketSize = unpacket_traits<HalfPacket>::size,
2509
- QuarterPacketSize = unpacket_traits<QuarterPacket>::size};
2510
- EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0)
2511
- {
3021
+ enum {
3022
+ PacketSize = packet_traits<Scalar>::size,
3023
+ HalfPacketSize = unpacket_traits<HalfPacket>::size,
3024
+ QuarterPacketSize = unpacket_traits<QuarterPacket>::size
3025
+ };
3026
+ EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
3027
+ Index offset = 0) {
2512
3028
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
2513
3029
  EIGEN_UNUSED_VARIABLE(stride);
2514
3030
  EIGEN_UNUSED_VARIABLE(offset);
2515
- eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
3031
+ eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
2516
3032
  const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
2517
3033
  const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
2518
3034
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2519
- Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2520
- Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
3035
+ Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
3036
+ Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
2521
3037
  Index count = 0;
2522
3038
 
2523
- // if(nr>=8)
2524
- // {
2525
- // for(Index j2=0; j2<packet_cols8; j2+=8)
2526
- // {
2527
- // // skip what we have before
2528
- // if(PanelMode) count += 8 * offset;
2529
- // for(Index k=0; k<depth; k++)
2530
- // {
2531
- // if (PacketSize==8) {
2532
- // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2533
- // pstoreu(blockB+count, cj.pconj(A));
2534
- // } else if (PacketSize==4) {
2535
- // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2536
- // Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
2537
- // pstoreu(blockB+count, cj.pconj(A));
2538
- // pstoreu(blockB+count+PacketSize, cj.pconj(B));
2539
- // } else {
2540
- // const Scalar* b0 = &rhs[k*rhsStride + j2];
2541
- // blockB[count+0] = cj(b0[0]);
2542
- // blockB[count+1] = cj(b0[1]);
2543
- // blockB[count+2] = cj(b0[2]);
2544
- // blockB[count+3] = cj(b0[3]);
2545
- // blockB[count+4] = cj(b0[4]);
2546
- // blockB[count+5] = cj(b0[5]);
2547
- // blockB[count+6] = cj(b0[6]);
2548
- // blockB[count+7] = cj(b0[7]);
2549
- // }
2550
- // count += 8;
2551
- // }
2552
- // // skip what we have after
2553
- // if(PanelMode) count += 8 * (stride-offset-depth);
2554
- // }
2555
- // }
2556
- if(nr>=4)
2557
- {
2558
- for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2559
- {
3039
+ #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
3040
+ EIGEN_IF_CONSTEXPR(nr >= 8) {
3041
+ for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
2560
3042
  // skip what we have before
2561
- if(PanelMode) count += 4 * offset;
2562
- for(Index k=0; k<depth; k++)
2563
- {
2564
- if (PacketSize==4) {
3043
+ if (PanelMode) count += 8 * offset;
3044
+ for (Index k = 0; k < depth; k++) {
3045
+ if (PacketSize == 8) {
2565
3046
  Packet A = rhs.template loadPacket<Packet>(k, j2);
2566
- pstoreu(blockB+count, cj.pconj(A));
3047
+ pstoreu(blockB + count, cj.pconj(A));
2567
3048
  count += PacketSize;
2568
- } else if (HasHalf && HalfPacketSize==4) {
3049
+ } else if (PacketSize == 4) {
3050
+ Packet A = rhs.template loadPacket<Packet>(k, j2);
3051
+ Packet B = rhs.template loadPacket<Packet>(k, j2 + 4);
3052
+ pstoreu(blockB + count, cj.pconj(A));
3053
+ pstoreu(blockB + count + PacketSize, cj.pconj(B));
3054
+ count += 2 * PacketSize;
3055
+ } else {
3056
+ const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
3057
+ blockB[count + 0] = cj(dm0(0));
3058
+ blockB[count + 1] = cj(dm0(1));
3059
+ blockB[count + 2] = cj(dm0(2));
3060
+ blockB[count + 3] = cj(dm0(3));
3061
+ blockB[count + 4] = cj(dm0(4));
3062
+ blockB[count + 5] = cj(dm0(5));
3063
+ blockB[count + 6] = cj(dm0(6));
3064
+ blockB[count + 7] = cj(dm0(7));
3065
+ count += 8;
3066
+ }
3067
+ }
3068
+ // skip what we have after
3069
+ if (PanelMode) count += 8 * (stride - offset - depth);
3070
+ }
3071
+ }
3072
+ #endif
3073
+
3074
+ if (nr >= 4) {
3075
+ for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
3076
+ // skip what we have before
3077
+ if (PanelMode) count += 4 * offset;
3078
+ for (Index k = 0; k < depth; k++) {
3079
+ if (PacketSize == 4) {
3080
+ Packet A = rhs.template loadPacket<Packet>(k, j2);
3081
+ pstoreu(blockB + count, cj.pconj(A));
3082
+ count += PacketSize;
3083
+ } else if (HasHalf && HalfPacketSize == 4) {
2569
3084
  HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
2570
- pstoreu(blockB+count, cj.pconj(A));
3085
+ pstoreu(blockB + count, cj.pconj(A));
2571
3086
  count += HalfPacketSize;
2572
- } else if (HasQuarter && QuarterPacketSize==4) {
3087
+ } else if (HasQuarter && QuarterPacketSize == 4) {
2573
3088
  QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
2574
- pstoreu(blockB+count, cj.pconj(A));
3089
+ pstoreu(blockB + count, cj.pconj(A));
2575
3090
  count += QuarterPacketSize;
2576
3091
  } else {
2577
3092
  const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
2578
- blockB[count+0] = cj(dm0(0));
2579
- blockB[count+1] = cj(dm0(1));
2580
- blockB[count+2] = cj(dm0(2));
2581
- blockB[count+3] = cj(dm0(3));
3093
+ blockB[count + 0] = cj(dm0(0));
3094
+ blockB[count + 1] = cj(dm0(1));
3095
+ blockB[count + 2] = cj(dm0(2));
3096
+ blockB[count + 3] = cj(dm0(3));
2582
3097
  count += 4;
2583
3098
  }
2584
3099
  }
2585
3100
  // skip what we have after
2586
- if(PanelMode) count += 4 * (stride-offset-depth);
3101
+ if (PanelMode) count += 4 * (stride - offset - depth);
2587
3102
  }
2588
3103
  }
2589
3104
  // copy the remaining columns one at a time (nr==1)
2590
- for(Index j2=packet_cols4; j2<cols; ++j2)
2591
- {
2592
- if(PanelMode) count += offset;
2593
- for(Index k=0; k<depth; k++)
2594
- {
3105
+ for (Index j2 = packet_cols4; j2 < cols; ++j2) {
3106
+ if (PanelMode) count += offset;
3107
+ for (Index k = 0; k < depth; k++) {
2595
3108
  blockB[count] = cj(rhs(k, j2));
2596
3109
  count += 1;
2597
3110
  }
2598
- if(PanelMode) count += stride-offset-depth;
3111
+ if (PanelMode) count += stride - offset - depth;
2599
3112
  }
2600
3113
  }
2601
3114
  };
2602
3115
 
2603
- } // end namespace internal
3116
+ } // end namespace internal
2604
3117
 
2605
3118
  /** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
2606
- * \sa setCpuCacheSize */
2607
- inline std::ptrdiff_t l1CacheSize()
2608
- {
3119
+ * \sa setCpuCacheSize */
3120
+ inline std::ptrdiff_t l1CacheSize() {
2609
3121
  std::ptrdiff_t l1, l2, l3;
2610
3122
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2611
3123
  return l1;
2612
3124
  }
2613
3125
 
2614
3126
  /** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
2615
- * \sa setCpuCacheSize */
2616
- inline std::ptrdiff_t l2CacheSize()
2617
- {
3127
+ * \sa setCpuCacheSize */
3128
+ inline std::ptrdiff_t l2CacheSize() {
2618
3129
  std::ptrdiff_t l1, l2, l3;
2619
3130
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2620
3131
  return l2;
2621
3132
  }
2622
3133
 
2623
- /** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size paramete\
2624
- rs.
2625
- * \sa setCpuCacheSize */
2626
- inline std::ptrdiff_t l3CacheSize()
2627
- {
3134
+ /** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
3135
+ * \sa setCpuCacheSize */
3136
+ inline std::ptrdiff_t l3CacheSize() {
2628
3137
  std::ptrdiff_t l1, l2, l3;
2629
3138
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2630
3139
  return l3;
2631
3140
  }
2632
3141
 
2633
3142
  /** Set the cpu L1 and L2 cache sizes (in bytes).
2634
- * These values are use to adjust the size of the blocks
2635
- * for the algorithms working per blocks.
2636
- *
2637
- * \sa computeProductBlockingSizes */
2638
- inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
2639
- {
3143
+ * These values are use to adjust the size of the blocks
3144
+ * for the algorithms working per blocks.
3145
+ *
3146
+ * \sa computeProductBlockingSizes */
3147
+ inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3) {
2640
3148
  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
2641
3149
  }
2642
3150
 
2643
- } // end namespace Eigen
3151
+ } // end namespace Eigen
2644
3152
 
2645
- #endif // EIGEN_GENERAL_BLOCK_PANEL_H
3153
+ #endif // EIGEN_GENERAL_BLOCK_PANEL_H