@smake/eigen 1.1.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431) hide show
  1. package/README.md +1 -1
  2. package/eigen/Eigen/AccelerateSupport +52 -0
  3. package/eigen/Eigen/Cholesky +18 -20
  4. package/eigen/Eigen/CholmodSupport +28 -28
  5. package/eigen/Eigen/Core +187 -120
  6. package/eigen/Eigen/Eigenvalues +16 -13
  7. package/eigen/Eigen/Geometry +18 -18
  8. package/eigen/Eigen/Householder +9 -7
  9. package/eigen/Eigen/IterativeLinearSolvers +8 -4
  10. package/eigen/Eigen/Jacobi +14 -13
  11. package/eigen/Eigen/KLUSupport +23 -21
  12. package/eigen/Eigen/LU +15 -16
  13. package/eigen/Eigen/MetisSupport +12 -12
  14. package/eigen/Eigen/OrderingMethods +54 -51
  15. package/eigen/Eigen/PaStiXSupport +23 -21
  16. package/eigen/Eigen/PardisoSupport +17 -14
  17. package/eigen/Eigen/QR +18 -20
  18. package/eigen/Eigen/QtAlignedMalloc +5 -12
  19. package/eigen/Eigen/SPQRSupport +21 -14
  20. package/eigen/Eigen/SVD +23 -17
  21. package/eigen/Eigen/Sparse +1 -2
  22. package/eigen/Eigen/SparseCholesky +18 -15
  23. package/eigen/Eigen/SparseCore +18 -17
  24. package/eigen/Eigen/SparseLU +9 -9
  25. package/eigen/Eigen/SparseQR +16 -14
  26. package/eigen/Eigen/StdDeque +5 -2
  27. package/eigen/Eigen/StdList +5 -2
  28. package/eigen/Eigen/StdVector +5 -2
  29. package/eigen/Eigen/SuperLUSupport +30 -24
  30. package/eigen/Eigen/ThreadPool +80 -0
  31. package/eigen/Eigen/UmfPackSupport +19 -17
  32. package/eigen/Eigen/Version +14 -0
  33. package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
  34. package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
  35. package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
  36. package/eigen/Eigen/src/Cholesky/LDLT.h +366 -405
  37. package/eigen/Eigen/src/Cholesky/LLT.h +323 -367
  38. package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
  39. package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +585 -529
  40. package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
  41. package/eigen/Eigen/src/Core/ArithmeticSequence.h +143 -317
  42. package/eigen/Eigen/src/Core/Array.h +329 -370
  43. package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
  44. package/eigen/Eigen/src/Core/ArrayWrapper.h +126 -170
  45. package/eigen/Eigen/src/Core/Assign.h +30 -40
  46. package/eigen/Eigen/src/Core/AssignEvaluator.h +651 -604
  47. package/eigen/Eigen/src/Core/Assign_MKL.h +125 -120
  48. package/eigen/Eigen/src/Core/BandMatrix.h +267 -282
  49. package/eigen/Eigen/src/Core/Block.h +371 -390
  50. package/eigen/Eigen/src/Core/CommaInitializer.h +85 -100
  51. package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
  52. package/eigen/Eigen/src/Core/CoreEvaluators.h +1214 -937
  53. package/eigen/Eigen/src/Core/CoreIterators.h +72 -63
  54. package/eigen/Eigen/src/Core/CwiseBinaryOp.h +112 -129
  55. package/eigen/Eigen/src/Core/CwiseNullaryOp.h +676 -702
  56. package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
  57. package/eigen/Eigen/src/Core/CwiseUnaryOp.h +55 -67
  58. package/eigen/Eigen/src/Core/CwiseUnaryView.h +127 -92
  59. package/eigen/Eigen/src/Core/DenseBase.h +630 -658
  60. package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -628
  61. package/eigen/Eigen/src/Core/DenseStorage.h +511 -590
  62. package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
  63. package/eigen/Eigen/src/Core/Diagonal.h +168 -207
  64. package/eigen/Eigen/src/Core/DiagonalMatrix.h +346 -317
  65. package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
  66. package/eigen/Eigen/src/Core/Dot.h +167 -217
  67. package/eigen/Eigen/src/Core/EigenBase.h +74 -85
  68. package/eigen/Eigen/src/Core/Fill.h +138 -0
  69. package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
  70. package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -113
  71. package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
  72. package/eigen/Eigen/src/Core/GeneralProduct.h +315 -261
  73. package/eigen/Eigen/src/Core/GenericPacketMath.h +1182 -520
  74. package/eigen/Eigen/src/Core/GlobalFunctions.h +193 -157
  75. package/eigen/Eigen/src/Core/IO.h +131 -156
  76. package/eigen/Eigen/src/Core/IndexedView.h +209 -125
  77. package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
  78. package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
  79. package/eigen/Eigen/src/Core/Inverse.h +50 -59
  80. package/eigen/Eigen/src/Core/Map.h +123 -141
  81. package/eigen/Eigen/src/Core/MapBase.h +255 -282
  82. package/eigen/Eigen/src/Core/MathFunctions.h +1247 -1201
  83. package/eigen/Eigen/src/Core/MathFunctionsImpl.h +162 -99
  84. package/eigen/Eigen/src/Core/Matrix.h +463 -494
  85. package/eigen/Eigen/src/Core/MatrixBase.h +468 -470
  86. package/eigen/Eigen/src/Core/NestByValue.h +58 -52
  87. package/eigen/Eigen/src/Core/NoAlias.h +79 -86
  88. package/eigen/Eigen/src/Core/NumTraits.h +206 -206
  89. package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +163 -142
  90. package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
  91. package/eigen/Eigen/src/Core/PlainObjectBase.h +858 -972
  92. package/eigen/Eigen/src/Core/Product.h +246 -130
  93. package/eigen/Eigen/src/Core/ProductEvaluators.h +779 -671
  94. package/eigen/Eigen/src/Core/Random.h +153 -164
  95. package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
  96. package/eigen/Eigen/src/Core/RealView.h +250 -0
  97. package/eigen/Eigen/src/Core/Redux.h +334 -314
  98. package/eigen/Eigen/src/Core/Ref.h +259 -257
  99. package/eigen/Eigen/src/Core/Replicate.h +92 -104
  100. package/eigen/Eigen/src/Core/Reshaped.h +215 -271
  101. package/eigen/Eigen/src/Core/ReturnByValue.h +47 -55
  102. package/eigen/Eigen/src/Core/Reverse.h +133 -148
  103. package/eigen/Eigen/src/Core/Select.h +68 -140
  104. package/eigen/Eigen/src/Core/SelfAdjointView.h +254 -290
  105. package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
  106. package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
  107. package/eigen/Eigen/src/Core/Solve.h +88 -102
  108. package/eigen/Eigen/src/Core/SolveTriangular.h +126 -124
  109. package/eigen/Eigen/src/Core/SolverBase.h +132 -133
  110. package/eigen/Eigen/src/Core/StableNorm.h +113 -147
  111. package/eigen/Eigen/src/Core/StlIterators.h +404 -248
  112. package/eigen/Eigen/src/Core/Stride.h +90 -92
  113. package/eigen/Eigen/src/Core/Swap.h +70 -39
  114. package/eigen/Eigen/src/Core/Transpose.h +258 -295
  115. package/eigen/Eigen/src/Core/Transpositions.h +270 -333
  116. package/eigen/Eigen/src/Core/TriangularMatrix.h +642 -743
  117. package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
  118. package/eigen/Eigen/src/Core/VectorwiseOp.h +653 -704
  119. package/eigen/Eigen/src/Core/Visitor.h +464 -308
  120. package/eigen/Eigen/src/Core/arch/AVX/Complex.h +380 -187
  121. package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +65 -163
  122. package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2145 -638
  123. package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
  124. package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +253 -60
  125. package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +278 -228
  126. package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
  127. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +48 -269
  128. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
  129. package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1597 -754
  130. package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
  131. package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
  132. package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
  133. package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
  134. package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +229 -41
  135. package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
  136. package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +420 -184
  137. package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +40 -49
  138. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2962 -2213
  139. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +196 -212
  140. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +713 -441
  141. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
  142. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
  143. package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2380 -1362
  144. package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
  145. package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +390 -224
  146. package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +78 -67
  147. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1784 -799
  148. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +167 -50
  149. package/eigen/Eigen/src/Core/arch/Default/Half.h +528 -379
  150. package/eigen/Eigen/src/Core/arch/Default/Settings.h +10 -12
  151. package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
  152. package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +41 -40
  153. package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +550 -523
  154. package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
  155. package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +27 -30
  156. package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +8 -8
  157. package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
  158. package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
  159. package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
  160. package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
  161. package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
  162. package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
  163. package/eigen/Eigen/src/Core/arch/MSA/Complex.h +54 -82
  164. package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +84 -92
  165. package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +51 -47
  166. package/eigen/Eigen/src/Core/arch/NEON/Complex.h +454 -306
  167. package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +175 -115
  168. package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +23 -30
  169. package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4366 -2857
  170. package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +616 -393
  171. package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
  172. package/eigen/Eigen/src/Core/arch/SSE/Complex.h +350 -198
  173. package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +38 -149
  174. package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +1791 -912
  175. package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
  176. package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +128 -40
  177. package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +10 -6
  178. package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +156 -234
  179. package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +6 -3
  180. package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +27 -32
  181. package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +119 -117
  182. package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +325 -419
  183. package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +15 -17
  184. package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +325 -181
  185. package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +94 -83
  186. package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +811 -458
  187. package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +121 -124
  188. package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +576 -370
  189. package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +194 -109
  190. package/eigen/Eigen/src/Core/functors/StlFunctors.h +95 -112
  191. package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
  192. package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1038 -749
  193. package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1883 -1375
  194. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +312 -370
  195. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +189 -176
  196. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +84 -81
  197. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
  198. package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +292 -337
  199. package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
  200. package/eigen/Eigen/src/Core/products/Parallelizer.h +207 -105
  201. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +327 -388
  202. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
  203. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +138 -147
  204. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
  205. package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
  206. package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -47
  207. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
  208. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
  209. package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
  210. package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
  211. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -277
  212. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
  213. package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +68 -94
  214. package/eigen/Eigen/src/Core/util/Assert.h +158 -0
  215. package/eigen/Eigen/src/Core/util/BlasUtil.h +342 -303
  216. package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +348 -317
  217. package/eigen/Eigen/src/Core/util/Constants.h +297 -262
  218. package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -90
  219. package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
  220. package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +449 -247
  221. package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
  222. package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
  223. package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +417 -116
  224. package/eigen/Eigen/src/Core/util/IntegralConstant.h +211 -204
  225. package/eigen/Eigen/src/Core/util/MKL_support.h +39 -37
  226. package/eigen/Eigen/src/Core/util/Macros.h +655 -773
  227. package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
  228. package/eigen/Eigen/src/Core/util/Memory.h +970 -748
  229. package/eigen/Eigen/src/Core/util/Meta.h +581 -633
  230. package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
  231. package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
  232. package/eigen/Eigen/src/Core/util/ReshapedHelper.h +17 -17
  233. package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
  234. package/eigen/Eigen/src/Core/util/StaticAssert.h +50 -166
  235. package/eigen/Eigen/src/Core/util/SymbolicIndex.h +377 -225
  236. package/eigen/Eigen/src/Core/util/XprHelper.h +784 -547
  237. package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
  238. package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
  239. package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
  240. package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
  241. package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
  242. package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
  243. package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
  244. package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
  245. package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +89 -105
  246. package/eigen/Eigen/src/Eigenvalues/RealQZ.h +537 -607
  247. package/eigen/Eigen/src/Eigenvalues/RealSchur.h +342 -381
  248. package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
  249. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +541 -595
  250. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
  251. package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +430 -462
  252. package/eigen/Eigen/src/Geometry/AlignedBox.h +226 -227
  253. package/eigen/Eigen/src/Geometry/AngleAxis.h +131 -133
  254. package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
  255. package/eigen/Eigen/src/Geometry/Homogeneous.h +285 -333
  256. package/eigen/Eigen/src/Geometry/Hyperplane.h +151 -160
  257. package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
  258. package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -146
  259. package/eigen/Eigen/src/Geometry/ParametrizedLine.h +127 -127
  260. package/eigen/Eigen/src/Geometry/Quaternion.h +566 -506
  261. package/eigen/Eigen/src/Geometry/Rotation2D.h +107 -105
  262. package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
  263. package/eigen/Eigen/src/Geometry/Scaling.h +113 -106
  264. package/eigen/Eigen/src/Geometry/Transform.h +858 -936
  265. package/eigen/Eigen/src/Geometry/Translation.h +94 -92
  266. package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
  267. package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +90 -104
  268. package/eigen/Eigen/src/Householder/BlockHouseholder.h +51 -46
  269. package/eigen/Eigen/src/Householder/Householder.h +102 -124
  270. package/eigen/Eigen/src/Householder/HouseholderSequence.h +412 -453
  271. package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
  272. package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +149 -162
  273. package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +124 -119
  274. package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +92 -104
  275. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +251 -243
  276. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +224 -228
  277. package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
  278. package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +178 -227
  279. package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +79 -84
  280. package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +54 -60
  281. package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
  282. package/eigen/Eigen/src/Jacobi/Jacobi.h +252 -308
  283. package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
  284. package/eigen/Eigen/src/KLUSupport/KLUSupport.h +208 -227
  285. package/eigen/Eigen/src/LU/Determinant.h +50 -69
  286. package/eigen/Eigen/src/LU/FullPivLU.h +545 -596
  287. package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
  288. package/eigen/Eigen/src/LU/InverseImpl.h +206 -285
  289. package/eigen/Eigen/src/LU/PartialPivLU.h +390 -428
  290. package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
  291. package/eigen/Eigen/src/LU/arch/InverseSize4.h +72 -70
  292. package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
  293. package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
  294. package/eigen/Eigen/src/OrderingMethods/Amd.h +243 -265
  295. package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +831 -1004
  296. package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
  297. package/eigen/Eigen/src/OrderingMethods/Ordering.h +112 -119
  298. package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
  299. package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
  300. package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
  301. package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -430
  302. package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +479 -479
  303. package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
  304. package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +166 -153
  305. package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +495 -475
  306. package/eigen/Eigen/src/QR/HouseholderQR.h +394 -285
  307. package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
  308. package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
  309. package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
  310. package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +244 -264
  311. package/eigen/Eigen/src/SVD/BDCSVD.h +817 -713
  312. package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
  313. package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
  314. package/eigen/Eigen/src/SVD/JacobiSVD.h +577 -543
  315. package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
  316. package/eigen/Eigen/src/SVD/SVDBase.h +242 -182
  317. package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +200 -235
  318. package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
  319. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +765 -594
  320. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +308 -94
  321. package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
  322. package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -252
  323. package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +134 -178
  324. package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
  325. package/eigen/Eigen/src/SparseCore/SparseAssign.h +149 -140
  326. package/eigen/Eigen/src/SparseCore/SparseBlock.h +403 -440
  327. package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
  328. package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +525 -303
  329. package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +555 -339
  330. package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
  331. package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +169 -197
  332. package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
  333. package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
  334. package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
  335. package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
  336. package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1603 -1245
  337. package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -350
  338. package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
  339. package/eigen/Eigen/src/SparseCore/SparseProduct.h +94 -97
  340. package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
  341. package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
  342. package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +370 -416
  343. package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
  344. package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
  345. package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
  346. package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
  347. package/eigen/Eigen/src/SparseCore/SparseUtil.h +138 -115
  348. package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
  349. package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
  350. package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
  351. package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
  352. package/eigen/Eigen/src/SparseLU/SparseLU.h +756 -710
  353. package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
  354. package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
  355. package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
  356. package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +245 -301
  357. package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
  358. package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
  359. package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +89 -100
  360. package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
  361. package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
  362. package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
  363. package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +124 -132
  364. package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
  365. package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
  366. package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
  367. package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
  368. package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
  369. package/eigen/Eigen/src/SparseQR/SparseQR.h +450 -502
  370. package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -93
  371. package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
  372. package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
  373. package/eigen/Eigen/src/StlSupport/details.h +48 -50
  374. package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
  375. package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -730
  376. package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
  377. package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
  378. package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
  379. package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
  380. package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
  381. package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
  382. package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
  383. package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
  384. package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
  385. package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
  386. package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
  387. package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
  388. package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
  389. package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +428 -464
  390. package/eigen/Eigen/src/misc/Image.h +41 -43
  391. package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
  392. package/eigen/Eigen/src/misc/Kernel.h +39 -41
  393. package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
  394. package/eigen/Eigen/src/misc/blas.h +83 -426
  395. package/eigen/Eigen/src/misc/lapacke.h +9972 -16179
  396. package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
  397. package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
  398. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
  399. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
  400. package/eigen/Eigen/src/plugins/{BlockMethods.h → BlockMethods.inc} +434 -506
  401. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
  402. package/eigen/Eigen/src/plugins/{CommonCwiseUnaryOps.h → CommonCwiseUnaryOps.inc} +58 -68
  403. package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
  404. package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
  405. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
  406. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
  407. package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
  408. package/package.json +1 -1
  409. package/eigen/COPYING.APACHE +0 -203
  410. package/eigen/COPYING.BSD +0 -26
  411. package/eigen/COPYING.GPL +0 -674
  412. package/eigen/COPYING.LGPL +0 -502
  413. package/eigen/COPYING.MINPACK +0 -51
  414. package/eigen/COPYING.MPL2 +0 -373
  415. package/eigen/COPYING.README +0 -18
  416. package/eigen/Eigen/src/Core/BooleanRedux.h +0 -162
  417. package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -258
  418. package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +0 -120
  419. package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +0 -694
  420. package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
  421. package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
  422. package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
  423. package/eigen/Eigen/src/misc/lapack.h +0 -152
  424. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -358
  425. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -696
  426. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
  427. package/eigen/Eigen/src/plugins/IndexedViewMethods.h +0 -262
  428. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
  429. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -95
  430. package/eigen/Eigen/src/plugins/ReshapedMethods.h +0 -149
  431. package/eigen/README.md +0 -5
@@ -10,61 +10,61 @@
10
10
  #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
11
11
  #define EIGEN_GENERAL_MATRIX_VECTOR_H
12
12
 
13
+ // IWYU pragma: private
14
+ #include "../InternalHeaderCheck.h"
15
+
13
16
  namespace Eigen {
14
17
 
15
18
  namespace internal {
16
19
 
17
- enum GEMVPacketSizeType {
18
- GEMVPacketFull = 0,
19
- GEMVPacketHalf,
20
- GEMVPacketQuarter
21
- };
20
+ enum GEMVPacketSizeType { GEMVPacketFull = 0, GEMVPacketHalf, GEMVPacketQuarter };
22
21
 
23
22
  template <int N, typename T1, typename T2, typename T3>
24
- struct gemv_packet_cond { typedef T3 type; };
23
+ struct gemv_packet_cond {
24
+ typedef T3 type;
25
+ };
25
26
 
26
27
  template <typename T1, typename T2, typename T3>
27
- struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> { typedef T1 type; };
28
+ struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> {
29
+ typedef T1 type;
30
+ };
28
31
 
29
32
  template <typename T1, typename T2, typename T3>
30
- struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> { typedef T2 type; };
33
+ struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> {
34
+ typedef T2 type;
35
+ };
31
36
 
32
- template<typename LhsScalar, typename RhsScalar, int _PacketSize=GEMVPacketFull>
33
- class gemv_traits
34
- {
37
+ template <typename LhsScalar, typename RhsScalar, int PacketSize_ = GEMVPacketFull>
38
+ class gemv_traits {
35
39
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
36
40
 
37
- #define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \
38
- typedef typename gemv_packet_cond<packet_size, \
39
- typename packet_traits<name ## Scalar>::type, \
40
- typename packet_traits<name ## Scalar>::half, \
41
- typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
42
- prefix ## name ## Packet
41
+ #define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size) \
42
+ typedef typename gemv_packet_cond< \
43
+ packet_size, typename packet_traits<name##Scalar>::type, typename packet_traits<name##Scalar>::half, \
44
+ typename unpacket_traits<typename packet_traits<name##Scalar>::half>::half>::type name##Packet##postfix
43
45
 
44
- PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
45
- PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
46
- PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
47
- #undef PACKET_DECL_COND_PREFIX
46
+ PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
47
+ PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
48
+ PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
49
+ #undef PACKET_DECL_COND_POSTFIX
48
50
 
49
- public:
51
+ public:
50
52
  enum {
51
- Vectorizable = unpacket_traits<_LhsPacket>::vectorizable &&
52
- unpacket_traits<_RhsPacket>::vectorizable &&
53
- int(unpacket_traits<_LhsPacket>::size)==int(unpacket_traits<_RhsPacket>::size),
54
- LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
55
- RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
56
- ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1
53
+ Vectorizable = unpacket_traits<LhsPacket_>::vectorizable && unpacket_traits<RhsPacket_>::vectorizable &&
54
+ int(unpacket_traits<LhsPacket_>::size) == int(unpacket_traits<RhsPacket_>::size),
55
+ LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
56
+ RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
57
+ ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1
57
58
  };
58
59
 
59
- typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
60
- typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
61
- typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
60
+ typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
61
+ typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
62
+ typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
62
63
  };
63
64
 
64
-
65
65
  /* Optimized col-major matrix * vector product:
66
66
  * This algorithm processes the matrix per vertical panels,
67
- * which are then processed horizontaly per chunck of 8*PacketSize x 1 vertical segments.
67
+ * which are then processed horizontally per chunk of 8*PacketSize x 1 vertical segments.
68
68
  *
69
69
  * Mixing type logic: C += alpha * A * B
70
70
  * | A | B |alpha| comments
@@ -75,12 +75,13 @@ public:
75
75
  *
76
76
  * The same reasoning apply for the transposed case.
77
77
  */
78
- template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
79
- struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
80
- {
81
- typedef gemv_traits<LhsScalar,RhsScalar> Traits;
82
- typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
83
- typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;
78
+ template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
79
+ typename RhsMapper, bool ConjugateRhs, int Version>
80
+ struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper,
81
+ ConjugateRhs, Version> {
82
+ typedef gemv_traits<LhsScalar, RhsScalar> Traits;
83
+ typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketHalf> HalfTraits;
84
+ typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketQuarter> QuarterTraits;
84
85
 
85
86
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
86
87
 
@@ -96,190 +97,163 @@ struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugat
96
97
  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
97
98
  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
98
99
 
99
- EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
100
- Index rows, Index cols,
101
- const LhsMapper& lhs,
102
- const RhsMapper& rhs,
103
- ResScalar* res, Index resIncr,
104
- RhsScalar alpha);
100
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,
101
+ const RhsMapper& rhs, ResScalar* res, Index resIncr,
102
+ RhsScalar alpha);
105
103
  };
106
104
 
107
- template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
108
- EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
109
- Index rows, Index cols,
110
- const LhsMapper& alhs,
111
- const RhsMapper& rhs,
112
- ResScalar* res, Index resIncr,
113
- RhsScalar alpha)
114
- {
105
+ template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
106
+ typename RhsMapper, bool ConjugateRhs, int Version>
107
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void
108
+ general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs,
109
+ Version>::run(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
110
+ ResScalar* res, Index resIncr, RhsScalar alpha) {
115
111
  EIGEN_UNUSED_VARIABLE(resIncr);
116
- eigen_internal_assert(resIncr==1);
112
+ eigen_internal_assert(resIncr == 1);
117
113
 
118
114
  // The following copy tells the compiler that lhs's attributes are not modified outside this function
119
- // This helps GCC to generate propoer code.
115
+ // This helps GCC to generate proper code.
120
116
  LhsMapper lhs(alhs);
121
117
 
122
- conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
123
- conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
124
- conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
125
- conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
118
+ conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
119
+ conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
120
+ conj_helper<LhsPacketHalf, RhsPacketHalf, ConjugateLhs, ConjugateRhs> pcj_half;
121
+ conj_helper<LhsPacketQuarter, RhsPacketQuarter, ConjugateLhs, ConjugateRhs> pcj_quarter;
126
122
 
127
123
  const Index lhsStride = lhs.stride();
128
124
  // TODO: for padded aligned inputs, we could enable aligned reads
129
- enum { LhsAlignment = Unaligned,
130
- ResPacketSize = Traits::ResPacketSize,
131
- ResPacketSizeHalf = HalfTraits::ResPacketSize,
132
- ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
133
- LhsPacketSize = Traits::LhsPacketSize,
134
- HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
135
- HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
125
+ enum {
126
+ LhsAlignment = Unaligned,
127
+ ResPacketSize = Traits::ResPacketSize,
128
+ ResPacketSizeHalf = HalfTraits::ResPacketSize,
129
+ ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
130
+ LhsPacketSize = Traits::LhsPacketSize,
131
+ HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
132
+ HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
136
133
  };
137
134
 
138
- const Index n8 = rows-8*ResPacketSize+1;
139
- const Index n4 = rows-4*ResPacketSize+1;
140
- const Index n3 = rows-3*ResPacketSize+1;
141
- const Index n2 = rows-2*ResPacketSize+1;
142
- const Index n1 = rows-1*ResPacketSize+1;
143
- const Index n_half = rows-1*ResPacketSizeHalf+1;
144
- const Index n_quarter = rows-1*ResPacketSizeQuarter+1;
135
+ const Index n8 = rows - 8 * ResPacketSize + 1;
136
+ const Index n4 = rows - 4 * ResPacketSize + 1;
137
+ const Index n3 = rows - 3 * ResPacketSize + 1;
138
+ const Index n2 = rows - 2 * ResPacketSize + 1;
139
+ const Index n1 = rows - 1 * ResPacketSize + 1;
140
+ const Index n_half = rows - 1 * ResPacketSizeHalf + 1;
141
+ const Index n_quarter = rows - 1 * ResPacketSizeQuarter + 1;
145
142
 
146
143
  // TODO: improve the following heuristic:
147
- const Index block_cols = cols<128 ? cols : (lhsStride*sizeof(LhsScalar)<32000?16:4);
144
+ const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 32000 ? 16 : 4);
148
145
  ResPacket palpha = pset1<ResPacket>(alpha);
149
146
  ResPacketHalf palpha_half = pset1<ResPacketHalf>(alpha);
150
147
  ResPacketQuarter palpha_quarter = pset1<ResPacketQuarter>(alpha);
151
148
 
152
- for(Index j2=0; j2<cols; j2+=block_cols)
153
- {
154
- Index jend = numext::mini(j2+block_cols,cols);
155
- Index i=0;
156
- for(; i<n8; i+=ResPacketSize*8)
157
- {
158
- ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
159
- c1 = pset1<ResPacket>(ResScalar(0)),
160
- c2 = pset1<ResPacket>(ResScalar(0)),
161
- c3 = pset1<ResPacket>(ResScalar(0)),
162
- c4 = pset1<ResPacket>(ResScalar(0)),
163
- c5 = pset1<ResPacket>(ResScalar(0)),
164
- c6 = pset1<ResPacket>(ResScalar(0)),
165
- c7 = pset1<ResPacket>(ResScalar(0));
166
-
167
- for(Index j=j2; j<jend; j+=1)
168
- {
169
- RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
170
- c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
171
- c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
172
- c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
173
- c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
174
- c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*4,j),b0,c4);
175
- c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*5,j),b0,c5);
176
- c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*6,j),b0,c6);
177
- c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*7,j),b0,c7);
149
+ for (Index j2 = 0; j2 < cols; j2 += block_cols) {
150
+ Index jend = numext::mini(j2 + block_cols, cols);
151
+ Index i = 0;
152
+ for (; i < n8; i += ResPacketSize * 8) {
153
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
154
+ c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0)),
155
+ c4 = pset1<ResPacket>(ResScalar(0)), c5 = pset1<ResPacket>(ResScalar(0)),
156
+ c6 = pset1<ResPacket>(ResScalar(0)), c7 = pset1<ResPacket>(ResScalar(0));
157
+
158
+ for (Index j = j2; j < jend; j += 1) {
159
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
160
+ c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
161
+ c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
162
+ c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 2, j), b0, c2);
163
+ c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 3, j), b0, c3);
164
+ c4 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 4, j), b0, c4);
165
+ c5 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 5, j), b0, c5);
166
+ c6 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 6, j), b0, c6);
167
+ c7 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 7, j), b0, c7);
178
168
  }
179
- pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
180
- pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
181
- pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
182
- pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
183
- pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu<ResPacket>(res+i+ResPacketSize*4)));
184
- pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu<ResPacket>(res+i+ResPacketSize*5)));
185
- pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu<ResPacket>(res+i+ResPacketSize*6)));
186
- pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu<ResPacket>(res+i+ResPacketSize*7)));
169
+ pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
170
+ pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
171
+ pstoreu(res + i + ResPacketSize * 2, pmadd(c2, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 2)));
172
+ pstoreu(res + i + ResPacketSize * 3, pmadd(c3, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 3)));
173
+ pstoreu(res + i + ResPacketSize * 4, pmadd(c4, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 4)));
174
+ pstoreu(res + i + ResPacketSize * 5, pmadd(c5, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 5)));
175
+ pstoreu(res + i + ResPacketSize * 6, pmadd(c6, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 6)));
176
+ pstoreu(res + i + ResPacketSize * 7, pmadd(c7, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 7)));
187
177
  }
188
- if(i<n4)
189
- {
190
- ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
191
- c1 = pset1<ResPacket>(ResScalar(0)),
192
- c2 = pset1<ResPacket>(ResScalar(0)),
193
- c3 = pset1<ResPacket>(ResScalar(0));
194
-
195
- for(Index j=j2; j<jend; j+=1)
196
- {
197
- RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
198
- c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
199
- c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
200
- c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
201
- c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
178
+ if (i < n4) {
179
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
180
+ c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0));
181
+
182
+ for (Index j = j2; j < jend; j += 1) {
183
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
184
+ c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
185
+ c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
186
+ c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 2, j), b0, c2);
187
+ c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 3, j), b0, c3);
202
188
  }
203
- pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
204
- pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
205
- pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
206
- pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
189
+ pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
190
+ pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
191
+ pstoreu(res + i + ResPacketSize * 2, pmadd(c2, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 2)));
192
+ pstoreu(res + i + ResPacketSize * 3, pmadd(c3, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 3)));
207
193
 
208
- i+=ResPacketSize*4;
194
+ i += ResPacketSize * 4;
209
195
  }
210
- if(i<n3)
211
- {
212
- ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
213
- c1 = pset1<ResPacket>(ResScalar(0)),
196
+ if (i < n3) {
197
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
214
198
  c2 = pset1<ResPacket>(ResScalar(0));
215
199
 
216
- for(Index j=j2; j<jend; j+=1)
217
- {
218
- RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
219
- c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
220
- c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
221
- c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
200
+ for (Index j = j2; j < jend; j += 1) {
201
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
202
+ c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
203
+ c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
204
+ c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 2, j), b0, c2);
222
205
  }
223
- pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
224
- pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
225
- pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
206
+ pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
207
+ pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
208
+ pstoreu(res + i + ResPacketSize * 2, pmadd(c2, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 2)));
226
209
 
227
- i+=ResPacketSize*3;
210
+ i += ResPacketSize * 3;
228
211
  }
229
- if(i<n2)
230
- {
231
- ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
232
- c1 = pset1<ResPacket>(ResScalar(0));
233
-
234
- for(Index j=j2; j<jend; j+=1)
235
- {
236
- RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
237
- c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
238
- c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
212
+ if (i < n2) {
213
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0));
214
+
215
+ for (Index j = j2; j < jend; j += 1) {
216
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
217
+ c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
218
+ c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
239
219
  }
240
- pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
241
- pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
242
- i+=ResPacketSize*2;
220
+ pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
221
+ pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
222
+ i += ResPacketSize * 2;
243
223
  }
244
- if(i<n1)
245
- {
224
+ if (i < n1) {
246
225
  ResPacket c0 = pset1<ResPacket>(ResScalar(0));
247
- for(Index j=j2; j<jend; j+=1)
248
- {
249
- RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
250
- c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
226
+ for (Index j = j2; j < jend; j += 1) {
227
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
228
+ c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
251
229
  }
252
- pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
253
- i+=ResPacketSize;
230
+ pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
231
+ i += ResPacketSize;
254
232
  }
255
- if(HasHalf && i<n_half)
256
- {
233
+ if (HasHalf && i < n_half) {
257
234
  ResPacketHalf c0 = pset1<ResPacketHalf>(ResScalar(0));
258
- for(Index j=j2; j<jend; j+=1)
259
- {
260
- RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j,0));
261
- c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i+0,j),b0,c0);
235
+ for (Index j = j2; j < jend; j += 1) {
236
+ RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j, 0));
237
+ c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf, LhsAlignment>(i + 0, j), b0, c0);
262
238
  }
263
- pstoreu(res+i+ResPacketSizeHalf*0, pmadd(c0,palpha_half,ploadu<ResPacketHalf>(res+i+ResPacketSizeHalf*0)));
264
- i+=ResPacketSizeHalf;
239
+ pstoreu(res + i + ResPacketSizeHalf * 0,
240
+ pmadd(c0, palpha_half, ploadu<ResPacketHalf>(res + i + ResPacketSizeHalf * 0)));
241
+ i += ResPacketSizeHalf;
265
242
  }
266
- if(HasQuarter && i<n_quarter)
267
- {
243
+ if (HasQuarter && i < n_quarter) {
268
244
  ResPacketQuarter c0 = pset1<ResPacketQuarter>(ResScalar(0));
269
- for(Index j=j2; j<jend; j+=1)
270
- {
271
- RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j,0));
272
- c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i+0,j),b0,c0);
245
+ for (Index j = j2; j < jend; j += 1) {
246
+ RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j, 0));
247
+ c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter, LhsAlignment>(i + 0, j), b0, c0);
273
248
  }
274
- pstoreu(res+i+ResPacketSizeQuarter*0, pmadd(c0,palpha_quarter,ploadu<ResPacketQuarter>(res+i+ResPacketSizeQuarter*0)));
275
- i+=ResPacketSizeQuarter;
249
+ pstoreu(res + i + ResPacketSizeQuarter * 0,
250
+ pmadd(c0, palpha_quarter, ploadu<ResPacketQuarter>(res + i + ResPacketSizeQuarter * 0)));
251
+ i += ResPacketSizeQuarter;
276
252
  }
277
- for(;i<rows;++i)
278
- {
253
+ for (; i < rows; ++i) {
279
254
  ResScalar c0(0);
280
- for(Index j=j2; j<jend; j+=1)
281
- c0 += cj.pmul(lhs(i,j), rhs(j,0));
282
- res[i] += alpha*c0;
255
+ for (Index j = j2; j < jend; j += 1) c0 += cj.pmul(lhs(i, j), rhs(j, 0));
256
+ res[i] += alpha * c0;
283
257
  }
284
258
  }
285
259
  }
@@ -294,12 +268,13 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
294
268
  * - alpha is always a complex (or converted to a complex)
295
269
  * - no vectorization
296
270
  */
297
- template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
298
- struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
299
- {
300
- typedef gemv_traits<LhsScalar,RhsScalar> Traits;
301
- typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
302
- typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;
271
+ template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
272
+ typename RhsMapper, bool ConjugateRhs, int Version>
273
+ struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper,
274
+ ConjugateRhs, Version> {
275
+ typedef gemv_traits<LhsScalar, RhsScalar> Traits;
276
+ typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketHalf> HalfTraits;
277
+ typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketQuarter> QuarterTraits;
303
278
 
304
279
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
305
280
 
@@ -315,75 +290,69 @@ struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,Conjugat
315
290
  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
316
291
  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
317
292
 
318
- EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
319
- Index rows, Index cols,
320
- const LhsMapper& lhs,
321
- const RhsMapper& rhs,
322
- ResScalar* res, Index resIncr,
323
- ResScalar alpha);
293
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,
294
+ const RhsMapper& rhs, ResScalar* res, Index resIncr,
295
+ ResScalar alpha);
324
296
  };
325
297
 
326
- template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
327
- EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
328
- Index rows, Index cols,
329
- const LhsMapper& alhs,
330
- const RhsMapper& rhs,
331
- ResScalar* res, Index resIncr,
332
- ResScalar alpha)
333
- {
298
+ template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
299
+ typename RhsMapper, bool ConjugateRhs, int Version>
300
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void
301
+ general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs,
302
+ Version>::run(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
303
+ ResScalar* res, Index resIncr, ResScalar alpha) {
334
304
  // The following copy tells the compiler that lhs's attributes are not modified outside this function
335
- // This helps GCC to generate propoer code.
305
+ // This helps GCC to generate proper code.
336
306
  LhsMapper lhs(alhs);
337
307
 
338
- eigen_internal_assert(rhs.stride()==1);
339
- conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
340
- conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
341
- conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
342
- conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
308
+ eigen_internal_assert(rhs.stride() == 1);
309
+ conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
310
+ conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
311
+ conj_helper<LhsPacketHalf, RhsPacketHalf, ConjugateLhs, ConjugateRhs> pcj_half;
312
+ conj_helper<LhsPacketQuarter, RhsPacketQuarter, ConjugateLhs, ConjugateRhs> pcj_quarter;
343
313
 
344
314
  // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
345
315
  // processing 8 rows at once might be counter productive wrt cache.
346
- const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7;
347
- const Index n4 = rows-3;
348
- const Index n2 = rows-1;
316
+ const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? 0 : rows - 7;
317
+ const Index n4 = rows - 3;
318
+ const Index n2 = rows - 1;
349
319
 
350
320
  // TODO: for padded aligned inputs, we could enable aligned reads
351
- enum { LhsAlignment = Unaligned,
352
- ResPacketSize = Traits::ResPacketSize,
353
- ResPacketSizeHalf = HalfTraits::ResPacketSize,
354
- ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
355
- LhsPacketSize = Traits::LhsPacketSize,
356
- LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
357
- LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
358
- HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
359
- HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
321
+ enum {
322
+ LhsAlignment = Unaligned,
323
+ ResPacketSize = Traits::ResPacketSize,
324
+ ResPacketSizeHalf = HalfTraits::ResPacketSize,
325
+ ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
326
+ LhsPacketSize = Traits::LhsPacketSize,
327
+ LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
328
+ LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
329
+ HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
330
+ HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
360
331
  };
361
332
 
362
- Index i=0;
363
- for(; i<n8; i+=8)
364
- {
365
- ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
366
- c1 = pset1<ResPacket>(ResScalar(0)),
367
- c2 = pset1<ResPacket>(ResScalar(0)),
368
- c3 = pset1<ResPacket>(ResScalar(0)),
369
- c4 = pset1<ResPacket>(ResScalar(0)),
370
- c5 = pset1<ResPacket>(ResScalar(0)),
371
- c6 = pset1<ResPacket>(ResScalar(0)),
372
- c7 = pset1<ResPacket>(ResScalar(0));
373
-
374
- Index j=0;
375
- for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
376
- {
377
- RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
378
-
379
- c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
380
- c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
381
- c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
382
- c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
383
- c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+4,j),b0,c4);
384
- c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+5,j),b0,c5);
385
- c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+6,j),b0,c6);
386
- c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+7,j),b0,c7);
333
+ using UnsignedIndex = typename make_unsigned<Index>::type;
334
+ const Index fullColBlockEnd = LhsPacketSize * (UnsignedIndex(cols) / LhsPacketSize);
335
+ const Index halfColBlockEnd = LhsPacketSizeHalf * (UnsignedIndex(cols) / LhsPacketSizeHalf);
336
+ const Index quarterColBlockEnd = LhsPacketSizeQuarter * (UnsignedIndex(cols) / LhsPacketSizeQuarter);
337
+
338
+ Index i = 0;
339
+ for (; i < n8; i += 8) {
340
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
341
+ c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0)),
342
+ c4 = pset1<ResPacket>(ResScalar(0)), c5 = pset1<ResPacket>(ResScalar(0)),
343
+ c6 = pset1<ResPacket>(ResScalar(0)), c7 = pset1<ResPacket>(ResScalar(0));
344
+
345
+ for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
346
+ RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
347
+
348
+ c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
349
+ c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 1, j), b0, c1);
350
+ c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 2, j), b0, c2);
351
+ c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 3, j), b0, c3);
352
+ c4 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 4, j), b0, c4);
353
+ c5 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 5, j), b0, c5);
354
+ c6 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 6, j), b0, c6);
355
+ c7 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 7, j), b0, c7);
387
356
  }
388
357
  ResScalar cc0 = predux(c0);
389
358
  ResScalar cc1 = predux(c1);
@@ -393,126 +362,112 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
393
362
  ResScalar cc5 = predux(c5);
394
363
  ResScalar cc6 = predux(c6);
395
364
  ResScalar cc7 = predux(c7);
396
- for(; j<cols; ++j)
397
- {
398
- RhsScalar b0 = rhs(j,0);
399
-
400
- cc0 += cj.pmul(lhs(i+0,j), b0);
401
- cc1 += cj.pmul(lhs(i+1,j), b0);
402
- cc2 += cj.pmul(lhs(i+2,j), b0);
403
- cc3 += cj.pmul(lhs(i+3,j), b0);
404
- cc4 += cj.pmul(lhs(i+4,j), b0);
405
- cc5 += cj.pmul(lhs(i+5,j), b0);
406
- cc6 += cj.pmul(lhs(i+6,j), b0);
407
- cc7 += cj.pmul(lhs(i+7,j), b0);
365
+
366
+ for (Index j = fullColBlockEnd; j < cols; ++j) {
367
+ RhsScalar b0 = rhs(j, 0);
368
+
369
+ cc0 += cj.pmul(lhs(i + 0, j), b0);
370
+ cc1 += cj.pmul(lhs(i + 1, j), b0);
371
+ cc2 += cj.pmul(lhs(i + 2, j), b0);
372
+ cc3 += cj.pmul(lhs(i + 3, j), b0);
373
+ cc4 += cj.pmul(lhs(i + 4, j), b0);
374
+ cc5 += cj.pmul(lhs(i + 5, j), b0);
375
+ cc6 += cj.pmul(lhs(i + 6, j), b0);
376
+ cc7 += cj.pmul(lhs(i + 7, j), b0);
408
377
  }
409
- res[(i+0)*resIncr] += alpha*cc0;
410
- res[(i+1)*resIncr] += alpha*cc1;
411
- res[(i+2)*resIncr] += alpha*cc2;
412
- res[(i+3)*resIncr] += alpha*cc3;
413
- res[(i+4)*resIncr] += alpha*cc4;
414
- res[(i+5)*resIncr] += alpha*cc5;
415
- res[(i+6)*resIncr] += alpha*cc6;
416
- res[(i+7)*resIncr] += alpha*cc7;
378
+ res[(i + 0) * resIncr] += alpha * cc0;
379
+ res[(i + 1) * resIncr] += alpha * cc1;
380
+ res[(i + 2) * resIncr] += alpha * cc2;
381
+ res[(i + 3) * resIncr] += alpha * cc3;
382
+ res[(i + 4) * resIncr] += alpha * cc4;
383
+ res[(i + 5) * resIncr] += alpha * cc5;
384
+ res[(i + 6) * resIncr] += alpha * cc6;
385
+ res[(i + 7) * resIncr] += alpha * cc7;
417
386
  }
418
- for(; i<n4; i+=4)
419
- {
420
- ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
421
- c1 = pset1<ResPacket>(ResScalar(0)),
422
- c2 = pset1<ResPacket>(ResScalar(0)),
423
- c3 = pset1<ResPacket>(ResScalar(0));
424
-
425
- Index j=0;
426
- for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
427
- {
428
- RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
429
-
430
- c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
431
- c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
432
- c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
433
- c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
387
+ for (; i < n4; i += 4) {
388
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
389
+ c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0));
390
+
391
+ for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
392
+ RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
393
+
394
+ c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
395
+ c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 1, j), b0, c1);
396
+ c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 2, j), b0, c2);
397
+ c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 3, j), b0, c3);
434
398
  }
435
399
  ResScalar cc0 = predux(c0);
436
400
  ResScalar cc1 = predux(c1);
437
401
  ResScalar cc2 = predux(c2);
438
402
  ResScalar cc3 = predux(c3);
439
- for(; j<cols; ++j)
440
- {
441
- RhsScalar b0 = rhs(j,0);
442
-
443
- cc0 += cj.pmul(lhs(i+0,j), b0);
444
- cc1 += cj.pmul(lhs(i+1,j), b0);
445
- cc2 += cj.pmul(lhs(i+2,j), b0);
446
- cc3 += cj.pmul(lhs(i+3,j), b0);
403
+
404
+ for (Index j = fullColBlockEnd; j < cols; ++j) {
405
+ RhsScalar b0 = rhs(j, 0);
406
+
407
+ cc0 += cj.pmul(lhs(i + 0, j), b0);
408
+ cc1 += cj.pmul(lhs(i + 1, j), b0);
409
+ cc2 += cj.pmul(lhs(i + 2, j), b0);
410
+ cc3 += cj.pmul(lhs(i + 3, j), b0);
447
411
  }
448
- res[(i+0)*resIncr] += alpha*cc0;
449
- res[(i+1)*resIncr] += alpha*cc1;
450
- res[(i+2)*resIncr] += alpha*cc2;
451
- res[(i+3)*resIncr] += alpha*cc3;
412
+ res[(i + 0) * resIncr] += alpha * cc0;
413
+ res[(i + 1) * resIncr] += alpha * cc1;
414
+ res[(i + 2) * resIncr] += alpha * cc2;
415
+ res[(i + 3) * resIncr] += alpha * cc3;
452
416
  }
453
- for(; i<n2; i+=2)
454
- {
455
- ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
456
- c1 = pset1<ResPacket>(ResScalar(0));
457
-
458
- Index j=0;
459
- for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
460
- {
461
- RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
462
-
463
- c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
464
- c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
417
+ for (; i < n2; i += 2) {
418
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0));
419
+
420
+ for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
421
+ RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
422
+
423
+ c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
424
+ c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 1, j), b0, c1);
465
425
  }
466
426
  ResScalar cc0 = predux(c0);
467
427
  ResScalar cc1 = predux(c1);
468
- for(; j<cols; ++j)
469
- {
470
- RhsScalar b0 = rhs(j,0);
471
428
 
472
- cc0 += cj.pmul(lhs(i+0,j), b0);
473
- cc1 += cj.pmul(lhs(i+1,j), b0);
429
+ for (Index j = fullColBlockEnd; j < cols; ++j) {
430
+ RhsScalar b0 = rhs(j, 0);
431
+
432
+ cc0 += cj.pmul(lhs(i + 0, j), b0);
433
+ cc1 += cj.pmul(lhs(i + 1, j), b0);
474
434
  }
475
- res[(i+0)*resIncr] += alpha*cc0;
476
- res[(i+1)*resIncr] += alpha*cc1;
435
+ res[(i + 0) * resIncr] += alpha * cc0;
436
+ res[(i + 1) * resIncr] += alpha * cc1;
477
437
  }
478
- for(; i<rows; ++i)
479
- {
438
+ for (; i < rows; ++i) {
480
439
  ResPacket c0 = pset1<ResPacket>(ResScalar(0));
481
440
  ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
482
441
  ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
483
- Index j=0;
484
- for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
485
- {
486
- RhsPacket b0 = rhs.template load<RhsPacket,Unaligned>(j,0);
487
- c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i,j),b0,c0);
442
+
443
+ for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
444
+ RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
445
+ c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i, j), b0, c0);
488
446
  }
489
447
  ResScalar cc0 = predux(c0);
490
448
  if (HasHalf) {
491
- for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf)
492
- {
493
- RhsPacketHalf b0 = rhs.template load<RhsPacketHalf,Unaligned>(j,0);
494
- c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i,j),b0,c0_h);
495
- }
449
+ for (Index j = fullColBlockEnd; j < halfColBlockEnd; j += LhsPacketSizeHalf) {
450
+ RhsPacketHalf b0 = rhs.template load<RhsPacketHalf, Unaligned>(j, 0);
451
+ c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf, LhsAlignment>(i, j), b0, c0_h);
452
+ }
496
453
  cc0 += predux(c0_h);
497
454
  }
498
455
  if (HasQuarter) {
499
- for(; j+LhsPacketSizeQuarter<=cols; j+=LhsPacketSizeQuarter)
500
- {
501
- RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter,Unaligned>(j,0);
502
- c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i,j),b0,c0_q);
503
- }
456
+ for (Index j = halfColBlockEnd; j < quarterColBlockEnd; j += LhsPacketSizeQuarter) {
457
+ RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter, Unaligned>(j, 0);
458
+ c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter, LhsAlignment>(i, j), b0, c0_q);
459
+ }
504
460
  cc0 += predux(c0_q);
505
461
  }
506
- for(; j<cols; ++j)
507
- {
508
- cc0 += cj.pmul(lhs(i,j), rhs(j,0));
462
+ for (Index j = quarterColBlockEnd; j < cols; ++j) {
463
+ cc0 += cj.pmul(lhs(i, j), rhs(j, 0));
509
464
  }
510
- res[i*resIncr] += alpha*cc0;
465
+ res[i * resIncr] += alpha * cc0;
511
466
  }
512
467
  }
513
468
 
514
- } // end namespace internal
469
+ } // end namespace internal
515
470
 
516
- } // end namespace Eigen
471
+ } // end namespace Eigen
517
472
 
518
- #endif // EIGEN_GENERAL_MATRIX_VECTOR_H
473
+ #endif // EIGEN_GENERAL_MATRIX_VECTOR_H