@smake/eigen 1.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (435)
  1. package/README.md +1 -1
  2. package/eigen/Eigen/AccelerateSupport +52 -0
  3. package/eigen/Eigen/Cholesky +18 -21
  4. package/eigen/Eigen/CholmodSupport +28 -28
  5. package/eigen/Eigen/Core +235 -326
  6. package/eigen/Eigen/Eigenvalues +16 -14
  7. package/eigen/Eigen/Geometry +21 -24
  8. package/eigen/Eigen/Householder +9 -8
  9. package/eigen/Eigen/IterativeLinearSolvers +8 -4
  10. package/eigen/Eigen/Jacobi +14 -14
  11. package/eigen/Eigen/KLUSupport +43 -0
  12. package/eigen/Eigen/LU +16 -20
  13. package/eigen/Eigen/MetisSupport +12 -12
  14. package/eigen/Eigen/OrderingMethods +54 -54
  15. package/eigen/Eigen/PaStiXSupport +23 -20
  16. package/eigen/Eigen/PardisoSupport +17 -14
  17. package/eigen/Eigen/QR +18 -21
  18. package/eigen/Eigen/QtAlignedMalloc +5 -13
  19. package/eigen/Eigen/SPQRSupport +21 -14
  20. package/eigen/Eigen/SVD +23 -18
  21. package/eigen/Eigen/Sparse +1 -4
  22. package/eigen/Eigen/SparseCholesky +18 -23
  23. package/eigen/Eigen/SparseCore +18 -17
  24. package/eigen/Eigen/SparseLU +12 -8
  25. package/eigen/Eigen/SparseQR +16 -14
  26. package/eigen/Eigen/StdDeque +5 -2
  27. package/eigen/Eigen/StdList +5 -2
  28. package/eigen/Eigen/StdVector +5 -2
  29. package/eigen/Eigen/SuperLUSupport +30 -24
  30. package/eigen/Eigen/ThreadPool +80 -0
  31. package/eigen/Eigen/UmfPackSupport +19 -17
  32. package/eigen/Eigen/Version +14 -0
  33. package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
  34. package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
  35. package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
  36. package/eigen/Eigen/src/Cholesky/LDLT.h +377 -401
  37. package/eigen/Eigen/src/Cholesky/LLT.h +332 -360
  38. package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
  39. package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +620 -521
  40. package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
  41. package/eigen/Eigen/src/Core/ArithmeticSequence.h +239 -0
  42. package/eigen/Eigen/src/Core/Array.h +341 -294
  43. package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
  44. package/eigen/Eigen/src/Core/ArrayWrapper.h +127 -171
  45. package/eigen/Eigen/src/Core/Assign.h +30 -40
  46. package/eigen/Eigen/src/Core/AssignEvaluator.h +711 -589
  47. package/eigen/Eigen/src/Core/Assign_MKL.h +130 -125
  48. package/eigen/Eigen/src/Core/BandMatrix.h +268 -283
  49. package/eigen/Eigen/src/Core/Block.h +375 -398
  50. package/eigen/Eigen/src/Core/CommaInitializer.h +86 -97
  51. package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
  52. package/eigen/Eigen/src/Core/CoreEvaluators.h +1356 -1026
  53. package/eigen/Eigen/src/Core/CoreIterators.h +73 -59
  54. package/eigen/Eigen/src/Core/CwiseBinaryOp.h +114 -132
  55. package/eigen/Eigen/src/Core/CwiseNullaryOp.h +726 -617
  56. package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
  57. package/eigen/Eigen/src/Core/CwiseUnaryOp.h +56 -68
  58. package/eigen/Eigen/src/Core/CwiseUnaryView.h +132 -95
  59. package/eigen/Eigen/src/Core/DenseBase.h +632 -571
  60. package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -624
  61. package/eigen/Eigen/src/Core/DenseStorage.h +512 -509
  62. package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
  63. package/eigen/Eigen/src/Core/Diagonal.h +169 -210
  64. package/eigen/Eigen/src/Core/DiagonalMatrix.h +351 -274
  65. package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
  66. package/eigen/Eigen/src/Core/Dot.h +172 -222
  67. package/eigen/Eigen/src/Core/EigenBase.h +75 -85
  68. package/eigen/Eigen/src/Core/Fill.h +138 -0
  69. package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
  70. package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -109
  71. package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
  72. package/eigen/Eigen/src/Core/GeneralProduct.h +327 -263
  73. package/eigen/Eigen/src/Core/GenericPacketMath.h +1472 -360
  74. package/eigen/Eigen/src/Core/GlobalFunctions.h +194 -151
  75. package/eigen/Eigen/src/Core/IO.h +147 -139
  76. package/eigen/Eigen/src/Core/IndexedView.h +321 -0
  77. package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
  78. package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
  79. package/eigen/Eigen/src/Core/Inverse.h +56 -66
  80. package/eigen/Eigen/src/Core/Map.h +124 -142
  81. package/eigen/Eigen/src/Core/MapBase.h +256 -281
  82. package/eigen/Eigen/src/Core/MathFunctions.h +1620 -938
  83. package/eigen/Eigen/src/Core/MathFunctionsImpl.h +233 -71
  84. package/eigen/Eigen/src/Core/Matrix.h +491 -416
  85. package/eigen/Eigen/src/Core/MatrixBase.h +468 -453
  86. package/eigen/Eigen/src/Core/NestByValue.h +66 -85
  87. package/eigen/Eigen/src/Core/NoAlias.h +79 -85
  88. package/eigen/Eigen/src/Core/NumTraits.h +235 -148
  89. package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +253 -0
  90. package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
  91. package/eigen/Eigen/src/Core/PlainObjectBase.h +871 -894
  92. package/eigen/Eigen/src/Core/Product.h +260 -139
  93. package/eigen/Eigen/src/Core/ProductEvaluators.h +863 -714
  94. package/eigen/Eigen/src/Core/Random.h +161 -136
  95. package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
  96. package/eigen/Eigen/src/Core/RealView.h +250 -0
  97. package/eigen/Eigen/src/Core/Redux.h +366 -336
  98. package/eigen/Eigen/src/Core/Ref.h +308 -209
  99. package/eigen/Eigen/src/Core/Replicate.h +94 -106
  100. package/eigen/Eigen/src/Core/Reshaped.h +398 -0
  101. package/eigen/Eigen/src/Core/ReturnByValue.h +49 -55
  102. package/eigen/Eigen/src/Core/Reverse.h +136 -145
  103. package/eigen/Eigen/src/Core/Select.h +70 -140
  104. package/eigen/Eigen/src/Core/SelfAdjointView.h +262 -285
  105. package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
  106. package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
  107. package/eigen/Eigen/src/Core/Solve.h +97 -111
  108. package/eigen/Eigen/src/Core/SolveTriangular.h +131 -129
  109. package/eigen/Eigen/src/Core/SolverBase.h +138 -101
  110. package/eigen/Eigen/src/Core/StableNorm.h +156 -160
  111. package/eigen/Eigen/src/Core/StlIterators.h +619 -0
  112. package/eigen/Eigen/src/Core/Stride.h +91 -88
  113. package/eigen/Eigen/src/Core/Swap.h +70 -38
  114. package/eigen/Eigen/src/Core/Transpose.h +295 -273
  115. package/eigen/Eigen/src/Core/Transpositions.h +272 -317
  116. package/eigen/Eigen/src/Core/TriangularMatrix.h +670 -755
  117. package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
  118. package/eigen/Eigen/src/Core/VectorwiseOp.h +668 -630
  119. package/eigen/Eigen/src/Core/Visitor.h +480 -216
  120. package/eigen/Eigen/src/Core/arch/AVX/Complex.h +407 -293
  121. package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +79 -388
  122. package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2935 -491
  123. package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
  124. package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +279 -22
  125. package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +472 -0
  126. package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
  127. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +85 -333
  128. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
  129. package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +2490 -649
  130. package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
  131. package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
  132. package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
  133. package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
  134. package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +277 -0
  135. package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
  136. package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +521 -298
  137. package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +39 -280
  138. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +3686 -0
  139. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +205 -0
  140. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +901 -0
  141. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
  142. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
  143. package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +3391 -723
  144. package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
  145. package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +866 -0
  146. package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +113 -14
  147. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +2634 -0
  148. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +227 -0
  149. package/eigen/Eigen/src/Core/arch/Default/Half.h +1091 -0
  150. package/eigen/Eigen/src/Core/arch/Default/Settings.h +11 -13
  151. package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
  152. package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +104 -0
  153. package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1712 -0
  154. package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
  155. package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +77 -0
  156. package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  157. package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
  158. package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
  159. package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
  160. package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
  161. package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
  162. package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
  163. package/eigen/Eigen/src/Core/arch/MSA/Complex.h +620 -0
  164. package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +379 -0
  165. package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1237 -0
  166. package/eigen/Eigen/src/Core/arch/NEON/Complex.h +531 -289
  167. package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +243 -0
  168. package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +50 -73
  169. package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +5915 -579
  170. package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1642 -0
  171. package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
  172. package/eigen/Eigen/src/Core/arch/SSE/Complex.h +366 -334
  173. package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +40 -514
  174. package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +2164 -675
  175. package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
  176. package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +188 -35
  177. package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +48 -0
  178. package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +674 -0
  179. package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +52 -0
  180. package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +227 -0
  181. package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +303 -0
  182. package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +576 -0
  183. package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +83 -0
  184. package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +434 -261
  185. package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +160 -53
  186. package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +1073 -605
  187. package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +123 -117
  188. package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +594 -322
  189. package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +204 -118
  190. package/eigen/Eigen/src/Core/functors/StlFunctors.h +110 -97
  191. package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
  192. package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1158 -530
  193. package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2329 -1333
  194. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +328 -364
  195. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +191 -178
  196. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +85 -82
  197. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
  198. package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +396 -542
  199. package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
  200. package/eigen/Eigen/src/Core/products/Parallelizer.h +208 -92
  201. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +331 -375
  202. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
  203. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +139 -146
  204. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
  205. package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
  206. package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -46
  207. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
  208. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
  209. package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
  210. package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
  211. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -275
  212. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
  213. package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +70 -93
  214. package/eigen/Eigen/src/Core/util/Assert.h +158 -0
  215. package/eigen/Eigen/src/Core/util/BlasUtil.h +413 -290
  216. package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +543 -0
  217. package/eigen/Eigen/src/Core/util/Constants.h +314 -263
  218. package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -78
  219. package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
  220. package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +450 -224
  221. package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
  222. package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
  223. package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +487 -0
  224. package/eigen/Eigen/src/Core/util/IntegralConstant.h +279 -0
  225. package/eigen/Eigen/src/Core/util/MKL_support.h +39 -30
  226. package/eigen/Eigen/src/Core/util/Macros.h +939 -646
  227. package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
  228. package/eigen/Eigen/src/Core/util/Memory.h +1042 -650
  229. package/eigen/Eigen/src/Core/util/Meta.h +618 -426
  230. package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
  231. package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
  232. package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  233. package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
  234. package/eigen/Eigen/src/Core/util/StaticAssert.h +51 -164
  235. package/eigen/Eigen/src/Core/util/SymbolicIndex.h +445 -0
  236. package/eigen/Eigen/src/Core/util/XprHelper.h +793 -538
  237. package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
  238. package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
  239. package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
  240. package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
  241. package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
  242. package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
  243. package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
  244. package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
  245. package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +91 -107
  246. package/eigen/Eigen/src/Eigenvalues/RealQZ.h +539 -606
  247. package/eigen/Eigen/src/Eigenvalues/RealSchur.h +348 -382
  248. package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
  249. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +579 -600
  250. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
  251. package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +434 -461
  252. package/eigen/Eigen/src/Geometry/AlignedBox.h +307 -214
  253. package/eigen/Eigen/src/Geometry/AngleAxis.h +135 -137
  254. package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
  255. package/eigen/Eigen/src/Geometry/Homogeneous.h +289 -333
  256. package/eigen/Eigen/src/Geometry/Hyperplane.h +152 -161
  257. package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
  258. package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -145
  259. package/eigen/Eigen/src/Geometry/ParametrizedLine.h +141 -104
  260. package/eigen/Eigen/src/Geometry/Quaternion.h +595 -497
  261. package/eigen/Eigen/src/Geometry/Rotation2D.h +110 -108
  262. package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
  263. package/eigen/Eigen/src/Geometry/Scaling.h +115 -90
  264. package/eigen/Eigen/src/Geometry/Transform.h +896 -953
  265. package/eigen/Eigen/src/Geometry/Translation.h +100 -98
  266. package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
  267. package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +154 -0
  268. package/eigen/Eigen/src/Householder/BlockHouseholder.h +54 -42
  269. package/eigen/Eigen/src/Householder/Householder.h +104 -122
  270. package/eigen/Eigen/src/Householder/HouseholderSequence.h +416 -382
  271. package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
  272. package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +153 -166
  273. package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +127 -138
  274. package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +95 -124
  275. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +269 -267
  276. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +246 -259
  277. package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
  278. package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +218 -217
  279. package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +80 -103
  280. package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +59 -63
  281. package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
  282. package/eigen/Eigen/src/Jacobi/Jacobi.h +256 -291
  283. package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
  284. package/eigen/Eigen/src/KLUSupport/KLUSupport.h +339 -0
  285. package/eigen/Eigen/src/LU/Determinant.h +60 -63
  286. package/eigen/Eigen/src/LU/FullPivLU.h +561 -626
  287. package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
  288. package/eigen/Eigen/src/LU/InverseImpl.h +213 -275
  289. package/eigen/Eigen/src/LU/PartialPivLU.h +407 -435
  290. package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
  291. package/eigen/Eigen/src/LU/arch/InverseSize4.h +353 -0
  292. package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
  293. package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
  294. package/eigen/Eigen/src/OrderingMethods/Amd.h +250 -282
  295. package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +950 -1103
  296. package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
  297. package/eigen/Eigen/src/OrderingMethods/Ordering.h +111 -122
  298. package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
  299. package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
  300. package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
  301. package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -429
  302. package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +494 -473
  303. package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
  304. package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +223 -137
  305. package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +517 -460
  306. package/eigen/Eigen/src/QR/HouseholderQR.h +412 -278
  307. package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
  308. package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
  309. package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
  310. package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +263 -261
  311. package/eigen/Eigen/src/SVD/BDCSVD.h +872 -679
  312. package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
  313. package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
  314. package/eigen/Eigen/src/SVD/JacobiSVD.h +585 -543
  315. package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
  316. package/eigen/Eigen/src/SVD/SVDBase.h +281 -160
  317. package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +202 -237
  318. package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
  319. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +769 -590
  320. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +318 -129
  321. package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
  322. package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -236
  323. package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +140 -184
  324. package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
  325. package/eigen/Eigen/src/SparseCore/SparseAssign.h +174 -111
  326. package/eigen/Eigen/src/SparseCore/SparseBlock.h +408 -477
  327. package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
  328. package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +531 -280
  329. package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +559 -347
  330. package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
  331. package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +185 -191
  332. package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
  333. package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
  334. package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
  335. package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
  336. package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1614 -1142
  337. package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -357
  338. package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
  339. package/eigen/Eigen/src/SparseCore/SparseProduct.h +100 -91
  340. package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
  341. package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
  342. package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +371 -414
  343. package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
  344. package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
  345. package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
  346. package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
  347. package/eigen/Eigen/src/SparseCore/SparseUtil.h +146 -115
  348. package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
  349. package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
  350. package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
  351. package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
  352. package/eigen/Eigen/src/SparseLU/SparseLU.h +814 -618
  353. package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
  354. package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
  355. package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
  356. package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +273 -255
  357. package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
  358. package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
  359. package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +90 -101
  360. package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
  361. package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
  362. package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
  363. package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +125 -133
  364. package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
  365. package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
  366. package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
  367. package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
  368. package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
  369. package/eigen/Eigen/src/SparseQR/SparseQR.h +451 -490
  370. package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -105
  371. package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
  372. package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
  373. package/eigen/Eigen/src/StlSupport/details.h +48 -50
  374. package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
  375. package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -732
  376. package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
  377. package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
  378. package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
  379. package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
  380. package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
  381. package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
  382. package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
  383. package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
  384. package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
  385. package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
  386. package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
  387. package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
  388. package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
  389. package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +480 -380
  390. package/eigen/Eigen/src/misc/Image.h +41 -43
  391. package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
  392. package/eigen/Eigen/src/misc/Kernel.h +39 -41
  393. package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
  394. package/eigen/Eigen/src/misc/blas.h +83 -426
  395. package/eigen/Eigen/src/misc/lapacke.h +9976 -16182
  396. package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
  397. package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
  398. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
  399. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
  400. package/eigen/Eigen/src/plugins/BlockMethods.inc +1370 -0
  401. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
  402. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.inc +167 -0
  403. package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
  404. package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
  405. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
  406. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
  407. package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
  408. package/lib/LibEigen.d.ts +4 -0
  409. package/lib/LibEigen.js +14 -0
  410. package/lib/index.d.ts +1 -1
  411. package/lib/index.js +7 -3
  412. package/package.json +2 -10
  413. package/eigen/Eigen/CMakeLists.txt +0 -19
  414. package/eigen/Eigen/src/Core/BooleanRedux.h +0 -164
  415. package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -103
  416. package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
  417. package/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +0 -91
  418. package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
  419. package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
  420. package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
  421. package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
  422. package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
  423. package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
  424. package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
  425. package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
  426. package/eigen/Eigen/src/misc/lapack.h +0 -152
  427. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -332
  428. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -552
  429. package/eigen/Eigen/src/plugins/BlockMethods.h +0 -1058
  430. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
  431. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -163
  432. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
  433. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -85
  434. package/lib/eigen.d.ts +0 -2
  435. package/lib/eigen.js +0 -15
@@ -1,7 +1,7 @@
1
1
  // This file is part of Eigen, a lightweight C++ template library
2
2
  // for linear algebra.
3
3
  //
4
- // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
4
+ // Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
5
5
  //
6
6
  // This Source Code Form is subject to the terms of the Mozilla
7
7
  // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,15 +10,61 @@
10
10
  #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
11
11
  #define EIGEN_GENERAL_MATRIX_VECTOR_H
12
12
 
13
+ // IWYU pragma: private
14
+ #include "../InternalHeaderCheck.h"
15
+
13
16
  namespace Eigen {
14
17
 
15
18
  namespace internal {
16
19
 
20
+ enum GEMVPacketSizeType { GEMVPacketFull = 0, GEMVPacketHalf, GEMVPacketQuarter };
21
+
22
+ template <int N, typename T1, typename T2, typename T3>
23
+ struct gemv_packet_cond {
24
+ typedef T3 type;
25
+ };
26
+
27
+ template <typename T1, typename T2, typename T3>
28
+ struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> {
29
+ typedef T1 type;
30
+ };
31
+
32
+ template <typename T1, typename T2, typename T3>
33
+ struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> {
34
+ typedef T2 type;
35
+ };
36
+
37
+ template <typename LhsScalar, typename RhsScalar, int PacketSize_ = GEMVPacketFull>
38
+ class gemv_traits {
39
+ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
40
+
41
+ #define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size) \
42
+ typedef typename gemv_packet_cond< \
43
+ packet_size, typename packet_traits<name##Scalar>::type, typename packet_traits<name##Scalar>::half, \
44
+ typename unpacket_traits<typename packet_traits<name##Scalar>::half>::half>::type name##Packet##postfix
45
+
46
+ PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
47
+ PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
48
+ PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
49
+ #undef PACKET_DECL_COND_POSTFIX
50
+
51
+ public:
52
+ enum {
53
+ Vectorizable = unpacket_traits<LhsPacket_>::vectorizable && unpacket_traits<RhsPacket_>::vectorizable &&
54
+ int(unpacket_traits<LhsPacket_>::size) == int(unpacket_traits<RhsPacket_>::size),
55
+ LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
56
+ RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
57
+ ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1
58
+ };
59
+
60
+ typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
61
+ typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
62
+ typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
63
+ };
64
+
17
65
  /* Optimized col-major matrix * vector product:
18
- * This algorithm processes 4 columns at onces that allows to both reduce
19
- * the number of load/stores of the result by a factor 4 and to reduce
20
- * the instruction dependency. Moreover, we know that all bands have the
21
- * same alignment pattern.
66
+ * This algorithm processes the matrix per vertical panels,
67
+ * which are then processed horizontally per chunk of 8*PacketSize x 1 vertical segments.
22
68
  *
23
69
  * Mixing type logic: C += alpha * A * B
24
70
  * | A | B |alpha| comments
@@ -27,302 +73,193 @@ namespace internal {
27
73
  * |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
28
74
  * |cplx |real |real | optimal case, vectorization possible via real-cplx mul
29
75
  *
30
- * Accesses to the matrix coefficients follow the following logic:
31
- *
32
- * - if all columns have the same alignment then
33
- * - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
34
- * - otherwise perform unaligned loads only (-> NoneAligned case)
35
- * - otherwise
36
- * - if even columns have the same alignment then
37
- * // odd columns are guaranteed to have the same alignment too
38
- * - if even or odd columns have the same alignment as the result, then
39
- * // for a register size of 2 scalars, this is guarantee to be the case (e.g., SSE with double)
40
- * - perform half aligned and half unaligned loads (-> EvenAligned case)
41
- * - otherwise perform unaligned loads only (-> NoneAligned case)
42
- * - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
43
- * - one over 4 consecutive columns is guaranteed to be aligned with the result vector,
44
- * perform simple aligned loads for this column and aligned loads plus re-alignment for the other. (-> FirstAligned case)
45
- * // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
46
- * - otherwise,
47
- * // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
48
- * // we currently fall back to the NoneAligned case
49
- *
50
76
  * The same reasoning applies to the transposed case.
51
- *
52
- * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
53
- * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
54
- * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow
55
- * compared to unaligned loads on a 4 byte boundary.
56
- *
57
77
  */
58
- template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
59
- struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
60
- {
78
+ template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
79
+ typename RhsMapper, bool ConjugateRhs, int Version>
80
+ struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper,
81
+ ConjugateRhs, Version> {
82
+ typedef gemv_traits<LhsScalar, RhsScalar> Traits;
83
+ typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketHalf> HalfTraits;
84
+ typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketQuarter> QuarterTraits;
85
+
61
86
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
62
87
 
63
- enum {
64
- Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
65
- && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
66
- LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
67
- RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
68
- ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
69
- };
88
+ typedef typename Traits::LhsPacket LhsPacket;
89
+ typedef typename Traits::RhsPacket RhsPacket;
90
+ typedef typename Traits::ResPacket ResPacket;
70
91
 
71
- typedef typename packet_traits<LhsScalar>::type _LhsPacket;
72
- typedef typename packet_traits<RhsScalar>::type _RhsPacket;
73
- typedef typename packet_traits<ResScalar>::type _ResPacket;
92
+ typedef typename HalfTraits::LhsPacket LhsPacketHalf;
93
+ typedef typename HalfTraits::RhsPacket RhsPacketHalf;
94
+ typedef typename HalfTraits::ResPacket ResPacketHalf;
74
95
 
75
- typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
76
- typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
77
- typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
96
+ typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
97
+ typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
98
+ typedef typename QuarterTraits::ResPacket ResPacketQuarter;
78
99
 
79
- EIGEN_DONT_INLINE static void run(
80
- Index rows, Index cols,
81
- const LhsMapper& lhs,
82
- const RhsMapper& rhs,
83
- ResScalar* res, Index resIncr,
84
- RhsScalar alpha);
100
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,
101
+ const RhsMapper& rhs, ResScalar* res, Index resIncr,
102
+ RhsScalar alpha);
85
103
  };
86
104
 
87
- template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
88
- EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
89
- Index rows, Index cols,
90
- const LhsMapper& lhs,
91
- const RhsMapper& rhs,
92
- ResScalar* res, Index resIncr,
93
- RhsScalar alpha)
94
- {
105
+ template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
106
+ typename RhsMapper, bool ConjugateRhs, int Version>
107
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void
108
+ general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs,
109
+ Version>::run(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
110
+ ResScalar* res, Index resIncr, RhsScalar alpha) {
95
111
  EIGEN_UNUSED_VARIABLE(resIncr);
96
- eigen_internal_assert(resIncr==1);
97
- #ifdef _EIGEN_ACCUMULATE_PACKETS
98
- #error _EIGEN_ACCUMULATE_PACKETS has already been defined
99
- #endif
100
- #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
101
- pstore(&res[j], \
102
- padd(pload<ResPacket>(&res[j]), \
103
- padd( \
104
- padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j), ptmp0), \
105
- pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j), ptmp1)), \
106
- padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j), ptmp2), \
107
- pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j), ptmp3)) )))
108
-
109
- typedef typename LhsMapper::VectorMapper LhsScalars;
110
-
111
- conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
112
- conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
113
- if(ConjugateRhs)
114
- alpha = numext::conj(alpha);
115
-
116
- enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
117
- const Index columnsAtOnce = 4;
118
- const Index peels = 2;
119
- const Index LhsPacketAlignedMask = LhsPacketSize-1;
120
- const Index ResPacketAlignedMask = ResPacketSize-1;
121
- // const Index PeelAlignedMask = ResPacketSize*peels-1;
122
- const Index size = rows;
112
+ eigen_internal_assert(resIncr == 1);
123
113
 
124
- const Index lhsStride = lhs.stride();
114
+ // The following copy tells the compiler that lhs's attributes are not modified outside this function.
115
+ // This helps GCC to generate proper code.
116
+ LhsMapper lhs(alhs);
125
117
 
126
- // How many coeffs of the result do we have to skip to be aligned.
127
- // Here we assume data are at least aligned on the base scalar type.
128
- Index alignedStart = internal::first_default_aligned(res,size);
129
- Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
130
- const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
131
-
132
- const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
133
- Index alignmentPattern = alignmentStep==0 ? AllAligned
134
- : alignmentStep==(LhsPacketSize/2) ? EvenAligned
135
- : FirstAligned;
136
-
137
- // we cannot assume the first element is aligned because of sub-matrices
138
- const Index lhsAlignmentOffset = lhs.firstAligned(size);
139
-
140
- // find how many columns do we have to skip to be aligned with the result (if possible)
141
- Index skipColumns = 0;
142
- // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
143
- if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) )
144
- {
145
- alignedSize = 0;
146
- alignedStart = 0;
147
- alignmentPattern = NoneAligned;
148
- }
149
- else if(LhsPacketSize > 4)
150
- {
151
- // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
152
- // Currently, it seems to be better to perform unaligned loads anyway
153
- alignmentPattern = NoneAligned;
154
- }
155
- else if (LhsPacketSize>1)
156
- {
157
- // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
158
-
159
- while (skipColumns<LhsPacketSize &&
160
- alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
161
- ++skipColumns;
162
- if (skipColumns==LhsPacketSize)
163
- {
164
- // nothing can be aligned, no need to skip any column
165
- alignmentPattern = NoneAligned;
166
- skipColumns = 0;
118
+ conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
119
+ conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
120
+ conj_helper<LhsPacketHalf, RhsPacketHalf, ConjugateLhs, ConjugateRhs> pcj_half;
121
+ conj_helper<LhsPacketQuarter, RhsPacketQuarter, ConjugateLhs, ConjugateRhs> pcj_quarter;
122
+
123
+ const Index lhsStride = lhs.stride();
124
+ // TODO: for padded aligned inputs, we could enable aligned reads
125
+ enum {
126
+ LhsAlignment = Unaligned,
127
+ ResPacketSize = Traits::ResPacketSize,
128
+ ResPacketSizeHalf = HalfTraits::ResPacketSize,
129
+ ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
130
+ LhsPacketSize = Traits::LhsPacketSize,
131
+ HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
132
+ HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
133
+ };
134
+
135
+ const Index n8 = rows - 8 * ResPacketSize + 1;
136
+ const Index n4 = rows - 4 * ResPacketSize + 1;
137
+ const Index n3 = rows - 3 * ResPacketSize + 1;
138
+ const Index n2 = rows - 2 * ResPacketSize + 1;
139
+ const Index n1 = rows - 1 * ResPacketSize + 1;
140
+ const Index n_half = rows - 1 * ResPacketSizeHalf + 1;
141
+ const Index n_quarter = rows - 1 * ResPacketSizeQuarter + 1;
142
+
143
+ // TODO: improve the following heuristic:
144
+ const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 32000 ? 16 : 4);
145
+ ResPacket palpha = pset1<ResPacket>(alpha);
146
+ ResPacketHalf palpha_half = pset1<ResPacketHalf>(alpha);
147
+ ResPacketQuarter palpha_quarter = pset1<ResPacketQuarter>(alpha);
148
+
149
+ for (Index j2 = 0; j2 < cols; j2 += block_cols) {
150
+ Index jend = numext::mini(j2 + block_cols, cols);
151
+ Index i = 0;
152
+ for (; i < n8; i += ResPacketSize * 8) {
153
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
154
+ c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0)),
155
+ c4 = pset1<ResPacket>(ResScalar(0)), c5 = pset1<ResPacket>(ResScalar(0)),
156
+ c6 = pset1<ResPacket>(ResScalar(0)), c7 = pset1<ResPacket>(ResScalar(0));
157
+
158
+ for (Index j = j2; j < jend; j += 1) {
159
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
160
+ c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
161
+ c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
162
+ c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 2, j), b0, c2);
163
+ c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 3, j), b0, c3);
164
+ c4 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 4, j), b0, c4);
165
+ c5 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 5, j), b0, c5);
166
+ c6 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 6, j), b0, c6);
167
+ c7 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 7, j), b0, c7);
168
+ }
169
+ pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
170
+ pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
171
+ pstoreu(res + i + ResPacketSize * 2, pmadd(c2, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 2)));
172
+ pstoreu(res + i + ResPacketSize * 3, pmadd(c3, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 3)));
173
+ pstoreu(res + i + ResPacketSize * 4, pmadd(c4, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 4)));
174
+ pstoreu(res + i + ResPacketSize * 5, pmadd(c5, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 5)));
175
+ pstoreu(res + i + ResPacketSize * 6, pmadd(c6, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 6)));
176
+ pstoreu(res + i + ResPacketSize * 7, pmadd(c7, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 7)));
167
177
  }
168
- else
169
- {
170
- skipColumns = (std::min)(skipColumns,cols);
171
- // note that the skiped columns are processed later.
178
+ if (i < n4) {
179
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
180
+ c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0));
181
+
182
+ for (Index j = j2; j < jend; j += 1) {
183
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
184
+ c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
185
+ c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
186
+ c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 2, j), b0, c2);
187
+ c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 3, j), b0, c3);
188
+ }
189
+ pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
190
+ pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
191
+ pstoreu(res + i + ResPacketSize * 2, pmadd(c2, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 2)));
192
+ pstoreu(res + i + ResPacketSize * 3, pmadd(c3, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 3)));
193
+
194
+ i += ResPacketSize * 4;
172
195
  }
196
+ if (i < n3) {
197
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
198
+ c2 = pset1<ResPacket>(ResScalar(0));
199
+
200
+ for (Index j = j2; j < jend; j += 1) {
201
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
202
+ c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
203
+ c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
204
+ c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 2, j), b0, c2);
205
+ }
206
+ pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
207
+ pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
208
+ pstoreu(res + i + ResPacketSize * 2, pmadd(c2, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 2)));
173
209
 
174
- /* eigen_internal_assert( (alignmentPattern==NoneAligned)
175
- || (skipColumns + columnsAtOnce >= cols)
176
- || LhsPacketSize > size
177
- || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/
178
- }
179
- else if(Vectorizable)
180
- {
181
- alignedStart = 0;
182
- alignedSize = size;
183
- alignmentPattern = AllAligned;
184
- }
210
+ i += ResPacketSize * 3;
211
+ }
212
+ if (i < n2) {
213
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0));
185
214
 
186
- const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
187
- const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;
188
-
189
- Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
190
- for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
191
- {
192
- RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
193
- ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
194
- ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
195
- ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));
196
-
197
- // this helps a lot generating better binary code
198
- const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1),
199
- lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3);
200
-
201
- if (Vectorizable)
202
- {
203
- /* explicit vectorization */
204
- // process initial unaligned coeffs
205
- for (Index j=0; j<alignedStart; ++j)
206
- {
207
- res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
208
- res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
209
- res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
210
- res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
215
+ for (Index j = j2; j < jend; j += 1) {
216
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
217
+ c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
218
+ c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
211
219
  }
212
-
213
- if (alignedSize>alignedStart)
214
- {
215
- switch(alignmentPattern)
216
- {
217
- case AllAligned:
218
- for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
219
- _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
220
- break;
221
- case EvenAligned:
222
- for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
223
- _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
224
- break;
225
- case FirstAligned:
226
- {
227
- Index j = alignedStart;
228
- if(peels>1)
229
- {
230
- LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
231
- ResPacket T0, T1;
232
-
233
- A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
234
- A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
235
- A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
236
-
237
- for (; j<peeledSize; j+=peels*ResPacketSize)
238
- {
239
- A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
240
- A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
241
- A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
242
-
243
- A00 = lhs0.template load<LhsPacket, Aligned>(j);
244
- A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
245
- T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
246
- T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
247
-
248
- T0 = pcj.pmadd(A01, ptmp1, T0);
249
- A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
250
- T0 = pcj.pmadd(A02, ptmp2, T0);
251
- A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
252
- T0 = pcj.pmadd(A03, ptmp3, T0);
253
- pstore(&res[j],T0);
254
- A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
255
- T1 = pcj.pmadd(A11, ptmp1, T1);
256
- T1 = pcj.pmadd(A12, ptmp2, T1);
257
- T1 = pcj.pmadd(A13, ptmp3, T1);
258
- pstore(&res[j+ResPacketSize],T1);
259
- }
260
- }
261
- for (; j<alignedSize; j+=ResPacketSize)
262
- _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
263
- break;
264
- }
265
- default:
266
- for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
267
- _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
268
- break;
269
- }
220
+ pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
221
+ pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
222
+ i += ResPacketSize * 2;
223
+ }
224
+ if (i < n1) {
225
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0));
226
+ for (Index j = j2; j < jend; j += 1) {
227
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
228
+ c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
270
229
  }
271
- } // end explicit vectorization
272
-
273
- /* process remaining coeffs (or all if there is no explicit vectorization) */
274
- for (Index j=alignedSize; j<size; ++j)
275
- {
276
- res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
277
- res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
278
- res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
279
- res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
230
+ pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
231
+ i += ResPacketSize;
280
232
  }
281
- }
282
-
283
- // process remaining first and last columns (at most columnsAtOnce-1)
284
- Index end = cols;
285
- Index start = columnBound;
286
- do
287
- {
288
- for (Index k=start; k<end; ++k)
289
- {
290
- RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
291
- const LhsScalars lhs0 = lhs.getVectorMapper(0, k);
292
-
293
- if (Vectorizable)
294
- {
295
- /* explicit vectorization */
296
- // process first unaligned result's coeffs
297
- for (Index j=0; j<alignedStart; ++j)
298
- res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
299
- // process aligned result's coeffs
300
- if (lhs0.template aligned<LhsPacket>(alignedStart))
301
- for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
302
- pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
303
- else
304
- for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
305
- pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
233
+ if (HasHalf && i < n_half) {
234
+ ResPacketHalf c0 = pset1<ResPacketHalf>(ResScalar(0));
235
+ for (Index j = j2; j < jend; j += 1) {
236
+ RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j, 0));
237
+ c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf, LhsAlignment>(i + 0, j), b0, c0);
306
238
  }
307
-
308
- // process remaining scalars (or all if no explicit vectorization)
309
- for (Index i=alignedSize; i<size; ++i)
310
- res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
239
+ pstoreu(res + i + ResPacketSizeHalf * 0,
240
+ pmadd(c0, palpha_half, ploadu<ResPacketHalf>(res + i + ResPacketSizeHalf * 0)));
241
+ i += ResPacketSizeHalf;
242
+ }
243
+ if (HasQuarter && i < n_quarter) {
244
+ ResPacketQuarter c0 = pset1<ResPacketQuarter>(ResScalar(0));
245
+ for (Index j = j2; j < jend; j += 1) {
246
+ RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j, 0));
247
+ c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter, LhsAlignment>(i + 0, j), b0, c0);
248
+ }
249
+ pstoreu(res + i + ResPacketSizeQuarter * 0,
250
+ pmadd(c0, palpha_quarter, ploadu<ResPacketQuarter>(res + i + ResPacketSizeQuarter * 0)));
251
+ i += ResPacketSizeQuarter;
311
252
  }
312
- if (skipColumns)
313
- {
314
- start = 0;
315
- end = skipColumns;
316
- skipColumns = 0;
253
+ for (; i < rows; ++i) {
254
+ ResScalar c0(0);
255
+ for (Index j = j2; j < jend; j += 1) c0 += cj.pmul(lhs(i, j), rhs(j, 0));
256
+ res[i] += alpha * c0;
317
257
  }
318
- else
319
- break;
320
- } while(Vectorizable);
321
- #undef _EIGEN_ACCUMULATE_PACKETS
258
+ }
322
259
  }
323
260
 
324
261
  /* Optimized row-major matrix * vector product:
325
- * This algorithm processes 4 rows at onces that allows to both reduce
262
+ * This algorithm processes 4 rows at once, which allows it to both reduce
326
263
  * the number of load/stores of the result by a factor 4 and to reduce
327
264
  * the instruction dependency. Moreover, we know that all bands have the
328
265
  * same alignment pattern.
@@ -331,289 +268,206 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C
331
268
  * - alpha is always a complex (or converted to a complex)
332
269
  * - no vectorization
333
270
  */
334
- template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
335
- struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
336
- {
337
- typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
338
-
339
- enum {
340
- Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
341
- && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
342
- LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
343
- RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
344
- ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
345
- };
271
+ template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
272
+ typename RhsMapper, bool ConjugateRhs, int Version>
273
+ struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper,
274
+ ConjugateRhs, Version> {
275
+ typedef gemv_traits<LhsScalar, RhsScalar> Traits;
276
+ typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketHalf> HalfTraits;
277
+ typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketQuarter> QuarterTraits;
346
278
 
347
- typedef typename packet_traits<LhsScalar>::type _LhsPacket;
348
- typedef typename packet_traits<RhsScalar>::type _RhsPacket;
349
- typedef typename packet_traits<ResScalar>::type _ResPacket;
279
+ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
350
280
 
351
- typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
352
- typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
353
- typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
281
+ typedef typename Traits::LhsPacket LhsPacket;
282
+ typedef typename Traits::RhsPacket RhsPacket;
283
+ typedef typename Traits::ResPacket ResPacket;
354
284
 
355
- EIGEN_DONT_INLINE static void run(
356
- Index rows, Index cols,
357
- const LhsMapper& lhs,
358
- const RhsMapper& rhs,
359
- ResScalar* res, Index resIncr,
360
- ResScalar alpha);
361
- };
285
+ typedef typename HalfTraits::LhsPacket LhsPacketHalf;
286
+ typedef typename HalfTraits::RhsPacket RhsPacketHalf;
287
+ typedef typename HalfTraits::ResPacket ResPacketHalf;
362
288
 
363
- template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
364
- EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
365
- Index rows, Index cols,
366
- const LhsMapper& lhs,
367
- const RhsMapper& rhs,
368
- ResScalar* res, Index resIncr,
369
- ResScalar alpha)
370
- {
371
- eigen_internal_assert(rhs.stride()==1);
372
-
373
- #ifdef _EIGEN_ACCUMULATE_PACKETS
374
- #error _EIGEN_ACCUMULATE_PACKETS has already been defined
375
- #endif
376
-
377
- #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
378
- RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); \
379
- ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
380
- ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
381
- ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
382
- ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }
383
-
384
- conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
385
- conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
386
-
387
- typedef typename LhsMapper::VectorMapper LhsScalars;
388
-
389
- enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
390
- const Index rowsAtOnce = 4;
391
- const Index peels = 2;
392
- const Index RhsPacketAlignedMask = RhsPacketSize-1;
393
- const Index LhsPacketAlignedMask = LhsPacketSize-1;
394
- const Index depth = cols;
395
- const Index lhsStride = lhs.stride();
289
+ typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
290
+ typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
291
+ typedef typename QuarterTraits::ResPacket ResPacketQuarter;
396
292
 
397
- // How many coeffs of the result do we have to skip to be aligned.
398
- // Here we assume data are at least aligned on the base scalar type
399
- // if that's not the case then vectorization is discarded, see below.
400
- Index alignedStart = rhs.firstAligned(depth);
401
- Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
402
- const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
403
-
404
- const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
405
- Index alignmentPattern = alignmentStep==0 ? AllAligned
406
- : alignmentStep==(LhsPacketSize/2) ? EvenAligned
407
- : FirstAligned;
408
-
409
- // we cannot assume the first element is aligned because of sub-matrices
410
- const Index lhsAlignmentOffset = lhs.firstAligned(depth);
411
- const Index rhsAlignmentOffset = rhs.firstAligned(rows);
412
-
413
- // find how many rows do we have to skip to be aligned with rhs (if possible)
414
- Index skipRows = 0;
415
- // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
416
- if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) ||
417
- (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) ||
418
- (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) )
419
- {
420
- alignedSize = 0;
421
- alignedStart = 0;
422
- alignmentPattern = NoneAligned;
423
- }
424
- else if(LhsPacketSize > 4)
425
- {
426
- // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
427
- alignmentPattern = NoneAligned;
428
- }
429
- else if (LhsPacketSize>1)
430
- {
431
- // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
432
-
433
- while (skipRows<LhsPacketSize &&
434
- alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
435
- ++skipRows;
436
- if (skipRows==LhsPacketSize)
437
- {
438
- // nothing can be aligned, no need to skip any column
439
- alignmentPattern = NoneAligned;
440
- skipRows = 0;
293
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,
294
+ const RhsMapper& rhs, ResScalar* res, Index resIncr,
295
+ ResScalar alpha);
296
+ };
297
+
298
+ template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
299
+ typename RhsMapper, bool ConjugateRhs, int Version>
300
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void
301
+ general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs,
302
+ Version>::run(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
303
+ ResScalar* res, Index resIncr, ResScalar alpha) {
304
+ // The following copy tells the compiler that lhs's attributes are not modified outside this function.
305
+ // This helps GCC to generate proper code.
306
+ LhsMapper lhs(alhs);
307
+
308
+ eigen_internal_assert(rhs.stride() == 1);
309
+ conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
310
+ conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
311
+ conj_helper<LhsPacketHalf, RhsPacketHalf, ConjugateLhs, ConjugateRhs> pcj_half;
312
+ conj_helper<LhsPacketQuarter, RhsPacketQuarter, ConjugateLhs, ConjugateRhs> pcj_quarter;
313
+
314
+ // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
315
+ // processing 8 rows at once might be counterproductive with respect to the cache.
316
+ const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? 0 : rows - 7;
317
+ const Index n4 = rows - 3;
318
+ const Index n2 = rows - 1;
319
+
320
+ // TODO: for padded aligned inputs, we could enable aligned reads
321
+ enum {
322
+ LhsAlignment = Unaligned,
323
+ ResPacketSize = Traits::ResPacketSize,
324
+ ResPacketSizeHalf = HalfTraits::ResPacketSize,
325
+ ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
326
+ LhsPacketSize = Traits::LhsPacketSize,
327
+ LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
328
+ LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
329
+ HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
330
+ HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
331
+ };
332
+
333
+ using UnsignedIndex = typename make_unsigned<Index>::type;
334
+ const Index fullColBlockEnd = LhsPacketSize * (UnsignedIndex(cols) / LhsPacketSize);
335
+ const Index halfColBlockEnd = LhsPacketSizeHalf * (UnsignedIndex(cols) / LhsPacketSizeHalf);
336
+ const Index quarterColBlockEnd = LhsPacketSizeQuarter * (UnsignedIndex(cols) / LhsPacketSizeQuarter);
337
+
338
+ Index i = 0;
339
+ for (; i < n8; i += 8) {
340
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
341
+ c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0)),
342
+ c4 = pset1<ResPacket>(ResScalar(0)), c5 = pset1<ResPacket>(ResScalar(0)),
343
+ c6 = pset1<ResPacket>(ResScalar(0)), c7 = pset1<ResPacket>(ResScalar(0));
344
+
345
+ for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
346
+ RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
347
+
348
+ c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
349
+ c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 1, j), b0, c1);
350
+ c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 2, j), b0, c2);
351
+ c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 3, j), b0, c3);
352
+ c4 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 4, j), b0, c4);
353
+ c5 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 5, j), b0, c5);
354
+ c6 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 6, j), b0, c6);
355
+ c7 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 7, j), b0, c7);
441
356
  }
442
- else
443
- {
444
- skipRows = (std::min)(skipRows,Index(rows));
445
- // note that the skiped columns are processed later.
357
+ ResScalar cc0 = predux(c0);
358
+ ResScalar cc1 = predux(c1);
359
+ ResScalar cc2 = predux(c2);
360
+ ResScalar cc3 = predux(c3);
361
+ ResScalar cc4 = predux(c4);
362
+ ResScalar cc5 = predux(c5);
363
+ ResScalar cc6 = predux(c6);
364
+ ResScalar cc7 = predux(c7);
365
+
366
+ for (Index j = fullColBlockEnd; j < cols; ++j) {
367
+ RhsScalar b0 = rhs(j, 0);
368
+
369
+ cc0 += cj.pmul(lhs(i + 0, j), b0);
370
+ cc1 += cj.pmul(lhs(i + 1, j), b0);
371
+ cc2 += cj.pmul(lhs(i + 2, j), b0);
372
+ cc3 += cj.pmul(lhs(i + 3, j), b0);
373
+ cc4 += cj.pmul(lhs(i + 4, j), b0);
374
+ cc5 += cj.pmul(lhs(i + 5, j), b0);
375
+ cc6 += cj.pmul(lhs(i + 6, j), b0);
376
+ cc7 += cj.pmul(lhs(i + 7, j), b0);
446
377
  }
447
- /* eigen_internal_assert( alignmentPattern==NoneAligned
448
- || LhsPacketSize==1
449
- || (skipRows + rowsAtOnce >= rows)
450
- || LhsPacketSize > depth
451
- || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/
452
- }
453
- else if(Vectorizable)
454
- {
455
- alignedStart = 0;
456
- alignedSize = depth;
457
- alignmentPattern = AllAligned;
378
+ res[(i + 0) * resIncr] += alpha * cc0;
379
+ res[(i + 1) * resIncr] += alpha * cc1;
380
+ res[(i + 2) * resIncr] += alpha * cc2;
381
+ res[(i + 3) * resIncr] += alpha * cc3;
382
+ res[(i + 4) * resIncr] += alpha * cc4;
383
+ res[(i + 5) * resIncr] += alpha * cc5;
384
+ res[(i + 6) * resIncr] += alpha * cc6;
385
+ res[(i + 7) * resIncr] += alpha * cc7;
458
386
  }
387
+ for (; i < n4; i += 4) {
388
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
389
+ c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0));
459
390
 
460
- const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
461
- const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;
462
-
463
- Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
464
- for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
465
- {
466
- // FIXME: what is the purpose of this EIGEN_ALIGN_DEFAULT ??
467
- EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
468
- ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
469
-
470
- // this helps the compiler generating good binary code
471
- const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0), lhs1 = lhs.getVectorMapper(i+offset1, 0),
472
- lhs2 = lhs.getVectorMapper(i+2, 0), lhs3 = lhs.getVectorMapper(i+offset3, 0);
473
-
474
- if (Vectorizable)
475
- {
476
- /* explicit vectorization */
477
- ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
478
- ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));
479
-
480
- // process initial unaligned coeffs
481
- // FIXME this loop get vectorized by the compiler !
482
- for (Index j=0; j<alignedStart; ++j)
483
- {
484
- RhsScalar b = rhs(j, 0);
485
- tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
486
- tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
487
- }
391
+ for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
392
+ RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
488
393
 
489
- if (alignedSize>alignedStart)
490
- {
491
- switch(alignmentPattern)
492
- {
493
- case AllAligned:
494
- for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
495
- _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
496
- break;
497
- case EvenAligned:
498
- for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
499
- _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
500
- break;
501
- case FirstAligned:
502
- {
503
- Index j = alignedStart;
504
- if (peels>1)
505
- {
506
- /* Here we proccess 4 rows with with two peeled iterations to hide
507
- * the overhead of unaligned loads. Moreover unaligned loads are handled
508
- * using special shift/move operations between the two aligned packets
509
- * overlaping the desired unaligned packet. This is *much* more efficient
510
- * than basic unaligned loads.
511
- */
512
- LhsPacket A01, A02, A03, A11, A12, A13;
513
- A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
514
- A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
515
- A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
516
-
517
- for (; j<peeledSize; j+=peels*RhsPacketSize)
518
- {
519
- RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);
520
- A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
521
- A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
522
- A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
523
-
524
- ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0);
525
- ptmp1 = pcj.pmadd(A01, b, ptmp1);
526
- A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
527
- ptmp2 = pcj.pmadd(A02, b, ptmp2);
528
- A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
529
- ptmp3 = pcj.pmadd(A03, b, ptmp3);
530
- A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
531
-
532
- b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0);
533
- ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0);
534
- ptmp1 = pcj.pmadd(A11, b, ptmp1);
535
- ptmp2 = pcj.pmadd(A12, b, ptmp2);
536
- ptmp3 = pcj.pmadd(A13, b, ptmp3);
537
- }
538
- }
539
- for (; j<alignedSize; j+=RhsPacketSize)
540
- _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
541
- break;
542
- }
543
- default:
544
- for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
545
- _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
546
- break;
547
- }
548
- tmp0 += predux(ptmp0);
549
- tmp1 += predux(ptmp1);
550
- tmp2 += predux(ptmp2);
551
- tmp3 += predux(ptmp3);
552
- }
553
- } // end explicit vectorization
554
-
555
- // process remaining coeffs (or all if no explicit vectorization)
556
- // FIXME this loop get vectorized by the compiler !
557
- for (Index j=alignedSize; j<depth; ++j)
558
- {
559
- RhsScalar b = rhs(j, 0);
560
- tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
561
- tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
394
+ c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
395
+ c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 1, j), b0, c1);
396
+ c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 2, j), b0, c2);
397
+ c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 3, j), b0, c3);
398
+ }
399
+ ResScalar cc0 = predux(c0);
400
+ ResScalar cc1 = predux(c1);
401
+ ResScalar cc2 = predux(c2);
402
+ ResScalar cc3 = predux(c3);
403
+
404
+ for (Index j = fullColBlockEnd; j < cols; ++j) {
405
+ RhsScalar b0 = rhs(j, 0);
406
+
407
+ cc0 += cj.pmul(lhs(i + 0, j), b0);
408
+ cc1 += cj.pmul(lhs(i + 1, j), b0);
409
+ cc2 += cj.pmul(lhs(i + 2, j), b0);
410
+ cc3 += cj.pmul(lhs(i + 3, j), b0);
562
411
  }
563
- res[i*resIncr] += alpha*tmp0;
564
- res[(i+offset1)*resIncr] += alpha*tmp1;
565
- res[(i+2)*resIncr] += alpha*tmp2;
566
- res[(i+offset3)*resIncr] += alpha*tmp3;
412
+ res[(i + 0) * resIncr] += alpha * cc0;
413
+ res[(i + 1) * resIncr] += alpha * cc1;
414
+ res[(i + 2) * resIncr] += alpha * cc2;
415
+ res[(i + 3) * resIncr] += alpha * cc3;
567
416
  }
417
+ for (; i < n2; i += 2) {
418
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0));
568
419
 
569
- // process remaining first and last rows (at most columnsAtOnce-1)
570
- Index end = rows;
571
- Index start = rowBound;
572
- do
573
- {
574
- for (Index i=start; i<end; ++i)
575
- {
576
- EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
577
- ResPacket ptmp0 = pset1<ResPacket>(tmp0);
578
- const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
579
- // process first unaligned result's coeffs
580
- // FIXME this loop get vectorized by the compiler !
581
- for (Index j=0; j<alignedStart; ++j)
582
- tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
583
-
584
- if (alignedSize>alignedStart)
585
- {
586
- // process aligned rhs coeffs
587
- if (lhs0.template aligned<LhsPacket>(alignedStart))
588
- for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
589
- ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
590
- else
591
- for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
592
- ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
593
- tmp0 += predux(ptmp0);
594
- }
420
+ for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
421
+ RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
595
422
 
596
- // process remaining scalars
597
- // FIXME this loop get vectorized by the compiler !
598
- for (Index j=alignedSize; j<depth; ++j)
599
- tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
600
- res[i*resIncr] += alpha*tmp0;
601
- }
602
- if (skipRows)
603
- {
604
- start = 0;
605
- end = skipRows;
606
- skipRows = 0;
423
+ c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
424
+ c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 1, j), b0, c1);
607
425
  }
608
- else
609
- break;
610
- } while(Vectorizable);
426
+ ResScalar cc0 = predux(c0);
427
+ ResScalar cc1 = predux(c1);
611
428
 
612
- #undef _EIGEN_ACCUMULATE_PACKETS
429
+ for (Index j = fullColBlockEnd; j < cols; ++j) {
430
+ RhsScalar b0 = rhs(j, 0);
431
+
432
+ cc0 += cj.pmul(lhs(i + 0, j), b0);
433
+ cc1 += cj.pmul(lhs(i + 1, j), b0);
434
+ }
435
+ res[(i + 0) * resIncr] += alpha * cc0;
436
+ res[(i + 1) * resIncr] += alpha * cc1;
437
+ }
438
+ for (; i < rows; ++i) {
439
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0));
440
+ ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
441
+ ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
442
+
443
+ for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
444
+ RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
445
+ c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i, j), b0, c0);
446
+ }
447
+ ResScalar cc0 = predux(c0);
448
+ if (HasHalf) {
449
+ for (Index j = fullColBlockEnd; j < halfColBlockEnd; j += LhsPacketSizeHalf) {
450
+ RhsPacketHalf b0 = rhs.template load<RhsPacketHalf, Unaligned>(j, 0);
451
+ c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf, LhsAlignment>(i, j), b0, c0_h);
452
+ }
453
+ cc0 += predux(c0_h);
454
+ }
455
+ if (HasQuarter) {
456
+ for (Index j = halfColBlockEnd; j < quarterColBlockEnd; j += LhsPacketSizeQuarter) {
457
+ RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter, Unaligned>(j, 0);
458
+ c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter, LhsAlignment>(i, j), b0, c0_q);
459
+ }
460
+ cc0 += predux(c0_q);
461
+ }
462
+ for (Index j = quarterColBlockEnd; j < cols; ++j) {
463
+ cc0 += cj.pmul(lhs(i, j), rhs(j, 0));
464
+ }
465
+ res[i * resIncr] += alpha * cc0;
466
+ }
613
467
  }
614
468
 
615
- } // end namespace internal
469
+ } // end namespace internal
616
470
 
617
- } // end namespace Eigen
471
+ } // end namespace Eigen
618
472
 
619
- #endif // EIGEN_GENERAL_MATRIX_VECTOR_H
473
+ #endif // EIGEN_GENERAL_MATRIX_VECTOR_H