@smake/eigen 1.0.2 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (287) hide show
  1. package/README.md +1 -1
  2. package/eigen/COPYING.APACHE +203 -0
  3. package/eigen/COPYING.BSD +26 -0
  4. package/eigen/COPYING.GPL +674 -0
  5. package/eigen/COPYING.LGPL +502 -0
  6. package/eigen/COPYING.MINPACK +51 -0
  7. package/eigen/COPYING.MPL2 +373 -0
  8. package/eigen/COPYING.README +18 -0
  9. package/eigen/Eigen/Cholesky +0 -1
  10. package/eigen/Eigen/Core +108 -266
  11. package/eigen/Eigen/Eigenvalues +0 -1
  12. package/eigen/Eigen/Geometry +3 -6
  13. package/eigen/Eigen/Householder +0 -1
  14. package/eigen/Eigen/Jacobi +0 -1
  15. package/eigen/Eigen/KLUSupport +41 -0
  16. package/eigen/Eigen/LU +2 -5
  17. package/eigen/Eigen/OrderingMethods +0 -3
  18. package/eigen/Eigen/PaStiXSupport +1 -0
  19. package/eigen/Eigen/PardisoSupport +0 -0
  20. package/eigen/Eigen/QR +0 -1
  21. package/eigen/Eigen/QtAlignedMalloc +0 -1
  22. package/eigen/Eigen/SVD +0 -1
  23. package/eigen/Eigen/Sparse +0 -2
  24. package/eigen/Eigen/SparseCholesky +0 -8
  25. package/eigen/Eigen/SparseLU +4 -0
  26. package/eigen/Eigen/src/Cholesky/LDLT.h +42 -27
  27. package/eigen/Eigen/src/Cholesky/LLT.h +39 -23
  28. package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +90 -47
  29. package/eigen/Eigen/src/Core/ArithmeticSequence.h +413 -0
  30. package/eigen/Eigen/src/Core/Array.h +99 -11
  31. package/eigen/Eigen/src/Core/ArrayBase.h +1 -1
  32. package/eigen/Eigen/src/Core/ArrayWrapper.h +21 -21
  33. package/eigen/Eigen/src/Core/Assign.h +1 -1
  34. package/eigen/Eigen/src/Core/AssignEvaluator.h +125 -50
  35. package/eigen/Eigen/src/Core/Assign_MKL.h +10 -10
  36. package/eigen/Eigen/src/Core/BandMatrix.h +16 -16
  37. package/eigen/Eigen/src/Core/Block.h +56 -60
  38. package/eigen/Eigen/src/Core/BooleanRedux.h +29 -31
  39. package/eigen/Eigen/src/Core/CommaInitializer.h +7 -3
  40. package/eigen/Eigen/src/Core/CoreEvaluators.h +325 -272
  41. package/eigen/Eigen/src/Core/CoreIterators.h +5 -0
  42. package/eigen/Eigen/src/Core/CwiseBinaryOp.h +21 -22
  43. package/eigen/Eigen/src/Core/CwiseNullaryOp.h +153 -18
  44. package/eigen/Eigen/src/Core/CwiseUnaryOp.h +6 -6
  45. package/eigen/Eigen/src/Core/CwiseUnaryView.h +12 -10
  46. package/eigen/Eigen/src/Core/DenseBase.h +128 -39
  47. package/eigen/Eigen/src/Core/DenseCoeffsBase.h +25 -21
  48. package/eigen/Eigen/src/Core/DenseStorage.h +150 -68
  49. package/eigen/Eigen/src/Core/Diagonal.h +21 -23
  50. package/eigen/Eigen/src/Core/DiagonalMatrix.h +50 -2
  51. package/eigen/Eigen/src/Core/DiagonalProduct.h +1 -1
  52. package/eigen/Eigen/src/Core/Dot.h +10 -10
  53. package/eigen/Eigen/src/Core/EigenBase.h +10 -9
  54. package/eigen/Eigen/src/Core/ForceAlignedAccess.h +8 -4
  55. package/eigen/Eigen/src/Core/Fuzzy.h +3 -3
  56. package/eigen/Eigen/src/Core/GeneralProduct.h +20 -10
  57. package/eigen/Eigen/src/Core/GenericPacketMath.h +597 -147
  58. package/eigen/Eigen/src/Core/GlobalFunctions.h +40 -33
  59. package/eigen/Eigen/src/Core/IO.h +40 -7
  60. package/eigen/Eigen/src/Core/IndexedView.h +237 -0
  61. package/eigen/Eigen/src/Core/Inverse.h +9 -10
  62. package/eigen/Eigen/src/Core/Map.h +7 -7
  63. package/eigen/Eigen/src/Core/MapBase.h +5 -3
  64. package/eigen/Eigen/src/Core/MathFunctions.h +756 -120
  65. package/eigen/Eigen/src/Core/MathFunctionsImpl.h +118 -19
  66. package/eigen/Eigen/src/Core/Matrix.h +131 -25
  67. package/eigen/Eigen/src/Core/MatrixBase.h +19 -2
  68. package/eigen/Eigen/src/Core/NestByValue.h +25 -50
  69. package/eigen/Eigen/src/Core/NoAlias.h +4 -3
  70. package/eigen/Eigen/src/Core/NumTraits.h +107 -20
  71. package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +232 -0
  72. package/eigen/Eigen/src/Core/PermutationMatrix.h +3 -3
  73. package/eigen/Eigen/src/Core/PlainObjectBase.h +145 -54
  74. package/eigen/Eigen/src/Core/Product.h +30 -25
  75. package/eigen/Eigen/src/Core/ProductEvaluators.h +183 -142
  76. package/eigen/Eigen/src/Core/Random.h +37 -1
  77. package/eigen/Eigen/src/Core/Redux.h +180 -170
  78. package/eigen/Eigen/src/Core/Ref.h +118 -21
  79. package/eigen/Eigen/src/Core/Replicate.h +8 -8
  80. package/eigen/Eigen/src/Core/Reshaped.h +454 -0
  81. package/eigen/Eigen/src/Core/ReturnByValue.h +7 -5
  82. package/eigen/Eigen/src/Core/Reverse.h +18 -12
  83. package/eigen/Eigen/src/Core/Select.h +8 -6
  84. package/eigen/Eigen/src/Core/SelfAdjointView.h +33 -20
  85. package/eigen/Eigen/src/Core/Solve.h +14 -14
  86. package/eigen/Eigen/src/Core/SolveTriangular.h +13 -13
  87. package/eigen/Eigen/src/Core/SolverBase.h +41 -3
  88. package/eigen/Eigen/src/Core/StableNorm.h +100 -70
  89. package/eigen/Eigen/src/Core/StlIterators.h +463 -0
  90. package/eigen/Eigen/src/Core/Stride.h +9 -4
  91. package/eigen/Eigen/src/Core/Swap.h +5 -4
  92. package/eigen/Eigen/src/Core/Transpose.h +86 -27
  93. package/eigen/Eigen/src/Core/Transpositions.h +26 -8
  94. package/eigen/Eigen/src/Core/TriangularMatrix.h +88 -72
  95. package/eigen/Eigen/src/Core/VectorBlock.h +5 -5
  96. package/eigen/Eigen/src/Core/VectorwiseOp.h +159 -70
  97. package/eigen/Eigen/src/Core/Visitor.h +137 -29
  98. package/eigen/Eigen/src/Core/arch/AVX/Complex.h +50 -129
  99. package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +126 -337
  100. package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +1092 -155
  101. package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +65 -1
  102. package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +422 -0
  103. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +186 -213
  104. package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1250 -252
  105. package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +89 -0
  106. package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +152 -165
  107. package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +19 -251
  108. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2937 -0
  109. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +221 -0
  110. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +629 -0
  111. package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2042 -392
  112. package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +235 -80
  113. package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +700 -0
  114. package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +102 -14
  115. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1649 -0
  116. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +110 -0
  117. package/eigen/Eigen/src/Core/arch/Default/Half.h +942 -0
  118. package/eigen/Eigen/src/Core/arch/Default/Settings.h +1 -1
  119. package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +120 -0
  120. package/eigen/Eigen/src/Core/arch/{CUDA → GPU}/MathFunctions.h +16 -4
  121. package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1685 -0
  122. package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +80 -0
  123. package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  124. package/eigen/Eigen/src/Core/arch/MSA/Complex.h +648 -0
  125. package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +387 -0
  126. package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1233 -0
  127. package/eigen/Eigen/src/Core/arch/NEON/Complex.h +313 -219
  128. package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +183 -0
  129. package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +54 -70
  130. package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4376 -549
  131. package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1419 -0
  132. package/eigen/Eigen/src/Core/arch/SSE/Complex.h +59 -179
  133. package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +65 -428
  134. package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +893 -283
  135. package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +65 -0
  136. package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +44 -0
  137. package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +752 -0
  138. package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +49 -0
  139. package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +232 -0
  140. package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +301 -0
  141. package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +670 -0
  142. package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +694 -0
  143. package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +85 -0
  144. package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +212 -183
  145. package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +101 -5
  146. package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +510 -395
  147. package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +11 -2
  148. package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +112 -46
  149. package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +31 -30
  150. package/eigen/Eigen/src/Core/functors/StlFunctors.h +32 -2
  151. package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +354 -15
  152. package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1073 -585
  153. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +29 -7
  154. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +4 -4
  155. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +1 -1
  156. package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +382 -483
  157. package/eigen/Eigen/src/Core/products/Parallelizer.h +23 -9
  158. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +23 -6
  159. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +8 -6
  160. package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +2 -2
  161. package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +5 -4
  162. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +3 -3
  163. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +5 -3
  164. package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +13 -10
  165. package/eigen/Eigen/src/Core/util/BlasUtil.h +208 -124
  166. package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +512 -0
  167. package/eigen/Eigen/src/Core/util/Constants.h +25 -9
  168. package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +14 -2
  169. package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +28 -4
  170. package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +186 -0
  171. package/eigen/Eigen/src/Core/util/IntegralConstant.h +272 -0
  172. package/eigen/Eigen/src/Core/util/MKL_support.h +8 -1
  173. package/eigen/Eigen/src/Core/util/Macros.h +661 -250
  174. package/eigen/Eigen/src/Core/util/Memory.h +222 -52
  175. package/eigen/Eigen/src/Core/util/Meta.h +349 -105
  176. package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  177. package/eigen/Eigen/src/Core/util/StaticAssert.h +8 -5
  178. package/eigen/Eigen/src/Core/util/SymbolicIndex.h +293 -0
  179. package/eigen/Eigen/src/Core/util/XprHelper.h +48 -30
  180. package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +1 -1
  181. package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +1 -1
  182. package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +2 -2
  183. package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +1 -1
  184. package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +2 -2
  185. package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +2 -2
  186. package/eigen/Eigen/src/Eigenvalues/RealQZ.h +9 -6
  187. package/eigen/Eigen/src/Eigenvalues/RealSchur.h +10 -5
  188. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +75 -42
  189. package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +20 -15
  190. package/eigen/Eigen/src/Geometry/AlignedBox.h +99 -5
  191. package/eigen/Eigen/src/Geometry/AngleAxis.h +4 -4
  192. package/eigen/Eigen/src/Geometry/EulerAngles.h +3 -3
  193. package/eigen/Eigen/src/Geometry/Homogeneous.h +15 -11
  194. package/eigen/Eigen/src/Geometry/Hyperplane.h +1 -1
  195. package/eigen/Eigen/src/Geometry/OrthoMethods.h +3 -2
  196. package/eigen/Eigen/src/Geometry/ParametrizedLine.h +39 -2
  197. package/eigen/Eigen/src/Geometry/Quaternion.h +52 -14
  198. package/eigen/Eigen/src/Geometry/Rotation2D.h +3 -3
  199. package/eigen/Eigen/src/Geometry/Scaling.h +22 -4
  200. package/eigen/Eigen/src/Geometry/Transform.h +86 -65
  201. package/eigen/Eigen/src/Geometry/Translation.h +6 -6
  202. package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +168 -0
  203. package/eigen/Eigen/src/Householder/BlockHouseholder.h +9 -2
  204. package/eigen/Eigen/src/Householder/Householder.h +8 -4
  205. package/eigen/Eigen/src/Householder/HouseholderSequence.h +123 -48
  206. package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +15 -15
  207. package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +7 -23
  208. package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +5 -22
  209. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +41 -47
  210. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +51 -60
  211. package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +70 -20
  212. package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +2 -20
  213. package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +11 -9
  214. package/eigen/Eigen/src/Jacobi/Jacobi.h +31 -10
  215. package/eigen/Eigen/src/KLUSupport/KLUSupport.h +358 -0
  216. package/eigen/Eigen/src/LU/Determinant.h +35 -19
  217. package/eigen/Eigen/src/LU/FullPivLU.h +29 -43
  218. package/eigen/Eigen/src/LU/InverseImpl.h +25 -8
  219. package/eigen/Eigen/src/LU/PartialPivLU.h +67 -57
  220. package/eigen/Eigen/src/LU/arch/InverseSize4.h +351 -0
  221. package/eigen/Eigen/src/OrderingMethods/Amd.h +7 -17
  222. package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +297 -277
  223. package/eigen/Eigen/src/OrderingMethods/Ordering.h +6 -10
  224. package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +1 -1
  225. package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +10 -9
  226. package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +41 -20
  227. package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +100 -27
  228. package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +59 -22
  229. package/eigen/Eigen/src/QR/HouseholderQR.h +48 -23
  230. package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +25 -3
  231. package/eigen/Eigen/src/SVD/BDCSVD.h +137 -48
  232. package/eigen/Eigen/src/SVD/JacobiSVD.h +22 -14
  233. package/eigen/Eigen/src/SVD/SVDBase.h +82 -21
  234. package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +3 -3
  235. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +16 -8
  236. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +11 -36
  237. package/eigen/Eigen/src/SparseCore/CompressedStorage.h +16 -0
  238. package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +6 -6
  239. package/eigen/Eigen/src/SparseCore/SparseAssign.h +81 -27
  240. package/eigen/Eigen/src/SparseCore/SparseBlock.h +25 -57
  241. package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +40 -11
  242. package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +11 -15
  243. package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +2 -2
  244. package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +30 -8
  245. package/eigen/Eigen/src/SparseCore/SparseMatrix.h +124 -10
  246. package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +5 -12
  247. package/eigen/Eigen/src/SparseCore/SparseProduct.h +13 -1
  248. package/eigen/Eigen/src/SparseCore/SparseRef.h +7 -7
  249. package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +3 -0
  250. package/eigen/Eigen/src/SparseCore/SparseUtil.h +8 -0
  251. package/eigen/Eigen/src/SparseCore/SparseVector.h +1 -1
  252. package/eigen/Eigen/src/SparseLU/SparseLU.h +160 -10
  253. package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +1 -1
  254. package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +76 -2
  255. package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +2 -2
  256. package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +1 -1
  257. package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +1 -1
  258. package/eigen/Eigen/src/SparseQR/SparseQR.h +19 -6
  259. package/eigen/Eigen/src/StlSupport/StdDeque.h +2 -14
  260. package/eigen/Eigen/src/StlSupport/StdList.h +2 -2
  261. package/eigen/Eigen/src/StlSupport/StdVector.h +2 -2
  262. package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +6 -8
  263. package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +175 -39
  264. package/eigen/Eigen/src/misc/lapacke.h +5 -4
  265. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +27 -1
  266. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +155 -11
  267. package/eigen/Eigen/src/plugins/BlockMethods.h +626 -242
  268. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +14 -0
  269. package/eigen/Eigen/src/plugins/IndexedViewMethods.h +262 -0
  270. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +4 -4
  271. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +10 -0
  272. package/eigen/Eigen/src/plugins/ReshapedMethods.h +149 -0
  273. package/eigen/README.md +5 -0
  274. package/lib/LibEigen.d.ts +4 -0
  275. package/lib/LibEigen.js +14 -0
  276. package/lib/index.d.ts +1 -1
  277. package/lib/index.js +7 -3
  278. package/package.json +2 -10
  279. package/eigen/Eigen/CMakeLists.txt +0 -19
  280. package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
  281. package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
  282. package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
  283. package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
  284. package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
  285. package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
  286. package/lib/eigen.d.ts +0 -2
  287. package/lib/eigen.js +0 -15
@@ -1,7 +1,7 @@
1
1
  // This file is part of Eigen, a lightweight C++ template library
2
2
  // for linear algebra.
3
3
  //
4
- // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
4
+ // Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
5
5
  //
6
6
  // This Source Code Form is subject to the terms of the Mozilla
7
7
  // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -14,11 +14,57 @@ namespace Eigen {
14
14
 
15
15
  namespace internal {
16
16
 
17
+ enum GEMVPacketSizeType {
18
+ GEMVPacketFull = 0,
19
+ GEMVPacketHalf,
20
+ GEMVPacketQuarter
21
+ };
22
+
23
+ template <int N, typename T1, typename T2, typename T3>
24
+ struct gemv_packet_cond { typedef T3 type; };
25
+
26
+ template <typename T1, typename T2, typename T3>
27
+ struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> { typedef T1 type; };
28
+
29
+ template <typename T1, typename T2, typename T3>
30
+ struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> { typedef T2 type; };
31
+
32
+ template<typename LhsScalar, typename RhsScalar, int _PacketSize=GEMVPacketFull>
33
+ class gemv_traits
34
+ {
35
+ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
36
+
37
+ #define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \
38
+ typedef typename gemv_packet_cond<packet_size, \
39
+ typename packet_traits<name ## Scalar>::type, \
40
+ typename packet_traits<name ## Scalar>::half, \
41
+ typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
42
+ prefix ## name ## Packet
43
+
44
+ PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
45
+ PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
46
+ PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
47
+ #undef PACKET_DECL_COND_PREFIX
48
+
49
+ public:
50
+ enum {
51
+ Vectorizable = unpacket_traits<_LhsPacket>::vectorizable &&
52
+ unpacket_traits<_RhsPacket>::vectorizable &&
53
+ int(unpacket_traits<_LhsPacket>::size)==int(unpacket_traits<_RhsPacket>::size),
54
+ LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
55
+ RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
56
+ ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1
57
+ };
58
+
59
+ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
60
+ typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
61
+ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
62
+ };
63
+
64
+
17
65
  /* Optimized col-major matrix * vector product:
18
- * This algorithm processes 4 columns at onces that allows to both reduce
19
- * the number of load/stores of the result by a factor 4 and to reduce
20
- * the instruction dependency. Moreover, we know that all bands have the
21
- * same alignment pattern.
66
+ * This algorithm processes the matrix per vertical panels,
67
+ * which are then processed horizontally per chunk of 8*PacketSize x 1 vertical segments.
22
68
  *
23
69
  * Mixing type logic: C += alpha * A * B
24
70
  * | A | B |alpha| comments
@@ -27,56 +73,30 @@ namespace internal {
27
73
  * |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
28
74
  * |cplx |real |real | optimal case, vectorization possible via real-cplx mul
29
75
  *
30
- * Accesses to the matrix coefficients follow the following logic:
31
- *
32
- * - if all columns have the same alignment then
33
- * - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
34
- * - otherwise perform unaligned loads only (-> NoneAligned case)
35
- * - otherwise
36
- * - if even columns have the same alignment then
37
- * // odd columns are guaranteed to have the same alignment too
38
- * - if even or odd columns have the same alignment as the result, then
39
- * // for a register size of 2 scalars, this is guarantee to be the case (e.g., SSE with double)
40
- * - perform half aligned and half unaligned loads (-> EvenAligned case)
41
- * - otherwise perform unaligned loads only (-> NoneAligned case)
42
- * - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
43
- * - one over 4 consecutive columns is guaranteed to be aligned with the result vector,
44
- * perform simple aligned loads for this column and aligned loads plus re-alignment for the other. (-> FirstAligned case)
45
- * // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
46
- * - otherwise,
47
- * // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
48
- * // we currently fall back to the NoneAligned case
49
- *
50
76
  * The same reasoning applies for the transposed case.
51
- *
52
- * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
53
- * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
54
- * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow
55
- * compared to unaligned loads on a 4 byte boundary.
56
- *
57
77
  */
58
78
  template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
59
79
  struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
60
80
  {
81
+ typedef gemv_traits<LhsScalar,RhsScalar> Traits;
82
+ typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
83
+ typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;
84
+
61
85
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
62
86
 
63
- enum {
64
- Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
65
- && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
66
- LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
67
- RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
68
- ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
69
- };
87
+ typedef typename Traits::LhsPacket LhsPacket;
88
+ typedef typename Traits::RhsPacket RhsPacket;
89
+ typedef typename Traits::ResPacket ResPacket;
70
90
 
71
- typedef typename packet_traits<LhsScalar>::type _LhsPacket;
72
- typedef typename packet_traits<RhsScalar>::type _RhsPacket;
73
- typedef typename packet_traits<ResScalar>::type _ResPacket;
91
+ typedef typename HalfTraits::LhsPacket LhsPacketHalf;
92
+ typedef typename HalfTraits::RhsPacket RhsPacketHalf;
93
+ typedef typename HalfTraits::ResPacket ResPacketHalf;
74
94
 
75
- typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
76
- typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
77
- typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
95
+ typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
96
+ typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
97
+ typedef typename QuarterTraits::ResPacket ResPacketQuarter;
78
98
 
79
- EIGEN_DONT_INLINE static void run(
99
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
80
100
  Index rows, Index cols,
81
101
  const LhsMapper& lhs,
82
102
  const RhsMapper& rhs,
@@ -85,244 +105,187 @@ EIGEN_DONT_INLINE static void run(
85
105
  };
86
106
 
87
107
  template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
88
- EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
108
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
89
109
  Index rows, Index cols,
90
- const LhsMapper& lhs,
110
+ const LhsMapper& alhs,
91
111
  const RhsMapper& rhs,
92
112
  ResScalar* res, Index resIncr,
93
113
  RhsScalar alpha)
94
114
  {
95
115
  EIGEN_UNUSED_VARIABLE(resIncr);
96
116
  eigen_internal_assert(resIncr==1);
97
- #ifdef _EIGEN_ACCUMULATE_PACKETS
98
- #error _EIGEN_ACCUMULATE_PACKETS has already been defined
99
- #endif
100
- #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
101
- pstore(&res[j], \
102
- padd(pload<ResPacket>(&res[j]), \
103
- padd( \
104
- padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j), ptmp0), \
105
- pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j), ptmp1)), \
106
- padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j), ptmp2), \
107
- pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j), ptmp3)) )))
108
-
109
- typedef typename LhsMapper::VectorMapper LhsScalars;
117
+
118
+ // The following copy tells the compiler that lhs's attributes are not modified outside this function
119
+ // This helps GCC to generate proper code.
120
+ LhsMapper lhs(alhs);
110
121
 
111
122
  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
112
123
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
113
- if(ConjugateRhs)
114
- alpha = numext::conj(alpha);
115
-
116
- enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
117
- const Index columnsAtOnce = 4;
118
- const Index peels = 2;
119
- const Index LhsPacketAlignedMask = LhsPacketSize-1;
120
- const Index ResPacketAlignedMask = ResPacketSize-1;
121
- // const Index PeelAlignedMask = ResPacketSize*peels-1;
122
- const Index size = rows;
124
+ conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
125
+ conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
123
126
 
124
127
  const Index lhsStride = lhs.stride();
125
-
126
- // How many coeffs of the result do we have to skip to be aligned.
127
- // Here we assume data are at least aligned on the base scalar type.
128
- Index alignedStart = internal::first_default_aligned(res,size);
129
- Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
130
- const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
131
-
132
- const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
133
- Index alignmentPattern = alignmentStep==0 ? AllAligned
134
- : alignmentStep==(LhsPacketSize/2) ? EvenAligned
135
- : FirstAligned;
136
-
137
- // we cannot assume the first element is aligned because of sub-matrices
138
- const Index lhsAlignmentOffset = lhs.firstAligned(size);
139
-
140
- // find how many columns do we have to skip to be aligned with the result (if possible)
141
- Index skipColumns = 0;
142
- // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
143
- if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) )
144
- {
145
- alignedSize = 0;
146
- alignedStart = 0;
147
- alignmentPattern = NoneAligned;
148
- }
149
- else if(LhsPacketSize > 4)
150
- {
151
- // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
152
- // Currently, it seems to be better to perform unaligned loads anyway
153
- alignmentPattern = NoneAligned;
154
- }
155
- else if (LhsPacketSize>1)
128
+ // TODO: for padded aligned inputs, we could enable aligned reads
129
+ enum { LhsAlignment = Unaligned,
130
+ ResPacketSize = Traits::ResPacketSize,
131
+ ResPacketSizeHalf = HalfTraits::ResPacketSize,
132
+ ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
133
+ LhsPacketSize = Traits::LhsPacketSize,
134
+ HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
135
+ HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
136
+ };
137
+
138
+ const Index n8 = rows-8*ResPacketSize+1;
139
+ const Index n4 = rows-4*ResPacketSize+1;
140
+ const Index n3 = rows-3*ResPacketSize+1;
141
+ const Index n2 = rows-2*ResPacketSize+1;
142
+ const Index n1 = rows-1*ResPacketSize+1;
143
+ const Index n_half = rows-1*ResPacketSizeHalf+1;
144
+ const Index n_quarter = rows-1*ResPacketSizeQuarter+1;
145
+
146
+ // TODO: improve the following heuristic:
147
+ const Index block_cols = cols<128 ? cols : (lhsStride*sizeof(LhsScalar)<32000?16:4);
148
+ ResPacket palpha = pset1<ResPacket>(alpha);
149
+ ResPacketHalf palpha_half = pset1<ResPacketHalf>(alpha);
150
+ ResPacketQuarter palpha_quarter = pset1<ResPacketQuarter>(alpha);
151
+
152
+ for(Index j2=0; j2<cols; j2+=block_cols)
156
153
  {
157
- // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
158
-
159
- while (skipColumns<LhsPacketSize &&
160
- alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
161
- ++skipColumns;
162
- if (skipColumns==LhsPacketSize)
154
+ Index jend = numext::mini(j2+block_cols,cols);
155
+ Index i=0;
156
+ for(; i<n8; i+=ResPacketSize*8)
163
157
  {
164
- // nothing can be aligned, no need to skip any column
165
- alignmentPattern = NoneAligned;
166
- skipColumns = 0;
158
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
159
+ c1 = pset1<ResPacket>(ResScalar(0)),
160
+ c2 = pset1<ResPacket>(ResScalar(0)),
161
+ c3 = pset1<ResPacket>(ResScalar(0)),
162
+ c4 = pset1<ResPacket>(ResScalar(0)),
163
+ c5 = pset1<ResPacket>(ResScalar(0)),
164
+ c6 = pset1<ResPacket>(ResScalar(0)),
165
+ c7 = pset1<ResPacket>(ResScalar(0));
166
+
167
+ for(Index j=j2; j<jend; j+=1)
168
+ {
169
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
170
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
171
+ c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
172
+ c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
173
+ c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
174
+ c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*4,j),b0,c4);
175
+ c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*5,j),b0,c5);
176
+ c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*6,j),b0,c6);
177
+ c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*7,j),b0,c7);
178
+ }
179
+ pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
180
+ pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
181
+ pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
182
+ pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
183
+ pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu<ResPacket>(res+i+ResPacketSize*4)));
184
+ pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu<ResPacket>(res+i+ResPacketSize*5)));
185
+ pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu<ResPacket>(res+i+ResPacketSize*6)));
186
+ pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu<ResPacket>(res+i+ResPacketSize*7)));
167
187
  }
168
- else
188
+ if(i<n4)
169
189
  {
170
- skipColumns = (std::min)(skipColumns,cols);
171
- // note that the skiped columns are processed later.
172
- }
190
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
191
+ c1 = pset1<ResPacket>(ResScalar(0)),
192
+ c2 = pset1<ResPacket>(ResScalar(0)),
193
+ c3 = pset1<ResPacket>(ResScalar(0));
173
194
 
174
- /* eigen_internal_assert( (alignmentPattern==NoneAligned)
175
- || (skipColumns + columnsAtOnce >= cols)
176
- || LhsPacketSize > size
177
- || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/
178
- }
179
- else if(Vectorizable)
180
- {
181
- alignedStart = 0;
182
- alignedSize = size;
183
- alignmentPattern = AllAligned;
184
- }
185
-
186
- const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
187
- const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;
195
+ for(Index j=j2; j<jend; j+=1)
196
+ {
197
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
198
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
199
+ c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
200
+ c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
201
+ c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
202
+ }
203
+ pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
204
+ pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
205
+ pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
206
+ pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
188
207
 
189
- Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
190
- for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
191
- {
192
- RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
193
- ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
194
- ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
195
- ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));
208
+ i+=ResPacketSize*4;
209
+ }
210
+ if(i<n3)
211
+ {
212
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
213
+ c1 = pset1<ResPacket>(ResScalar(0)),
214
+ c2 = pset1<ResPacket>(ResScalar(0));
196
215
 
197
- // this helps a lot generating better binary code
198
- const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1),
199
- lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3);
216
+ for(Index j=j2; j<jend; j+=1)
217
+ {
218
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
219
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
220
+ c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
221
+ c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
222
+ }
223
+ pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
224
+ pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
225
+ pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
200
226
 
201
- if (Vectorizable)
227
+ i+=ResPacketSize*3;
228
+ }
229
+ if(i<n2)
202
230
  {
203
- /* explicit vectorization */
204
- // process initial unaligned coeffs
205
- for (Index j=0; j<alignedStart; ++j)
231
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
232
+ c1 = pset1<ResPacket>(ResScalar(0));
233
+
234
+ for(Index j=j2; j<jend; j+=1)
206
235
  {
207
- res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
208
- res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
209
- res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
210
- res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
236
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
237
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
238
+ c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
211
239
  }
212
-
213
- if (alignedSize>alignedStart)
240
+ pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
241
+ pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
242
+ i+=ResPacketSize*2;
243
+ }
244
+ if(i<n1)
245
+ {
246
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0));
247
+ for(Index j=j2; j<jend; j+=1)
214
248
  {
215
- switch(alignmentPattern)
216
- {
217
- case AllAligned:
218
- for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
219
- _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
220
- break;
221
- case EvenAligned:
222
- for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
223
- _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
224
- break;
225
- case FirstAligned:
226
- {
227
- Index j = alignedStart;
228
- if(peels>1)
229
- {
230
- LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
231
- ResPacket T0, T1;
232
-
233
- A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
234
- A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
235
- A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
236
-
237
- for (; j<peeledSize; j+=peels*ResPacketSize)
238
- {
239
- A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
240
- A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
241
- A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
242
-
243
- A00 = lhs0.template load<LhsPacket, Aligned>(j);
244
- A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
245
- T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
246
- T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
247
-
248
- T0 = pcj.pmadd(A01, ptmp1, T0);
249
- A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
250
- T0 = pcj.pmadd(A02, ptmp2, T0);
251
- A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
252
- T0 = pcj.pmadd(A03, ptmp3, T0);
253
- pstore(&res[j],T0);
254
- A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
255
- T1 = pcj.pmadd(A11, ptmp1, T1);
256
- T1 = pcj.pmadd(A12, ptmp2, T1);
257
- T1 = pcj.pmadd(A13, ptmp3, T1);
258
- pstore(&res[j+ResPacketSize],T1);
259
- }
260
- }
261
- for (; j<alignedSize; j+=ResPacketSize)
262
- _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
263
- break;
264
- }
265
- default:
266
- for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
267
- _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
268
- break;
269
- }
249
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
250
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
270
251
  }
271
- } // end explicit vectorization
272
-
273
- /* process remaining coeffs (or all if there is no explicit vectorization) */
274
- for (Index j=alignedSize; j<size; ++j)
252
+ pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
253
+ i+=ResPacketSize;
254
+ }
255
+ if(HasHalf && i<n_half)
275
256
  {
276
- res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
277
- res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
278
- res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
279
- res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
257
+ ResPacketHalf c0 = pset1<ResPacketHalf>(ResScalar(0));
258
+ for(Index j=j2; j<jend; j+=1)
259
+ {
260
+ RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j,0));
261
+ c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i+0,j),b0,c0);
262
+ }
263
+ pstoreu(res+i+ResPacketSizeHalf*0, pmadd(c0,palpha_half,ploadu<ResPacketHalf>(res+i+ResPacketSizeHalf*0)));
264
+ i+=ResPacketSizeHalf;
280
265
  }
281
- }
282
-
283
- // process remaining first and last columns (at most columnsAtOnce-1)
284
- Index end = cols;
285
- Index start = columnBound;
286
- do
287
- {
288
- for (Index k=start; k<end; ++k)
266
+ if(HasQuarter && i<n_quarter)
289
267
  {
290
- RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
291
- const LhsScalars lhs0 = lhs.getVectorMapper(0, k);
292
-
293
- if (Vectorizable)
268
+ ResPacketQuarter c0 = pset1<ResPacketQuarter>(ResScalar(0));
269
+ for(Index j=j2; j<jend; j+=1)
294
270
  {
295
- /* explicit vectorization */
296
- // process first unaligned result's coeffs
297
- for (Index j=0; j<alignedStart; ++j)
298
- res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
299
- // process aligned result's coeffs
300
- if (lhs0.template aligned<LhsPacket>(alignedStart))
301
- for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
302
- pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
303
- else
304
- for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
305
- pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
271
+ RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j,0));
272
+ c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i+0,j),b0,c0);
306
273
  }
307
-
308
- // process remaining scalars (or all if no explicit vectorization)
309
- for (Index i=alignedSize; i<size; ++i)
310
- res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
274
+ pstoreu(res+i+ResPacketSizeQuarter*0, pmadd(c0,palpha_quarter,ploadu<ResPacketQuarter>(res+i+ResPacketSizeQuarter*0)));
275
+ i+=ResPacketSizeQuarter;
311
276
  }
312
- if (skipColumns)
277
+ for(;i<rows;++i)
313
278
  {
314
- start = 0;
315
- end = skipColumns;
316
- skipColumns = 0;
279
+ ResScalar c0(0);
280
+ for(Index j=j2; j<jend; j+=1)
281
+ c0 += cj.pmul(lhs(i,j), rhs(j,0));
282
+ res[i] += alpha*c0;
317
283
  }
318
- else
319
- break;
320
- } while(Vectorizable);
321
- #undef _EIGEN_ACCUMULATE_PACKETS
284
+ }
322
285
  }
323
286
 
324
287
  /* Optimized row-major matrix * vector product:
325
- * This algorithm processes 4 rows at onces that allows to both reduce
288
+ * This algorithm processes 4 rows at once that allows to both reduce
326
289
  * the number of load/stores of the result by a factor 4 and to reduce
327
290
  * the instruction dependency. Moreover, we know that all bands have the
328
291
  * same alignment pattern.
@@ -334,25 +297,25 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C
334
297
  template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
335
298
  struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
336
299
  {
337
- typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
338
-
339
- enum {
340
- Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
341
- && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
342
- LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
343
- RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
344
- ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
345
- };
300
+ typedef gemv_traits<LhsScalar,RhsScalar> Traits;
301
+ typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
302
+ typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;
303
+
304
+ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
305
+
306
+ typedef typename Traits::LhsPacket LhsPacket;
307
+ typedef typename Traits::RhsPacket RhsPacket;
308
+ typedef typename Traits::ResPacket ResPacket;
346
309
 
347
- typedef typename packet_traits<LhsScalar>::type _LhsPacket;
348
- typedef typename packet_traits<RhsScalar>::type _RhsPacket;
349
- typedef typename packet_traits<ResScalar>::type _ResPacket;
310
+ typedef typename HalfTraits::LhsPacket LhsPacketHalf;
311
+ typedef typename HalfTraits::RhsPacket RhsPacketHalf;
312
+ typedef typename HalfTraits::ResPacket ResPacketHalf;
350
313
 
351
- typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
352
- typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
353
- typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
314
+ typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
315
+ typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
316
+ typedef typename QuarterTraits::ResPacket ResPacketQuarter;
354
317
 
355
- EIGEN_DONT_INLINE static void run(
318
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
356
319
  Index rows, Index cols,
357
320
  const LhsMapper& lhs,
358
321
  const RhsMapper& rhs,
@@ -361,255 +324,191 @@ EIGEN_DONT_INLINE static void run(
361
324
  };
362
325
 
363
326
  template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
364
- EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
327
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
365
328
  Index rows, Index cols,
366
- const LhsMapper& lhs,
329
+ const LhsMapper& alhs,
367
330
  const RhsMapper& rhs,
368
331
  ResScalar* res, Index resIncr,
369
332
  ResScalar alpha)
370
333
  {
371
- eigen_internal_assert(rhs.stride()==1);
372
-
373
- #ifdef _EIGEN_ACCUMULATE_PACKETS
374
- #error _EIGEN_ACCUMULATE_PACKETS has already been defined
375
- #endif
376
-
377
- #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
378
- RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); \
379
- ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
380
- ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
381
- ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
382
- ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }
334
+ // The following copy tells the compiler that lhs's attributes are not modified outside this function
335
+ // This helps GCC to generate proper code.
336
+ LhsMapper lhs(alhs);
383
337
 
338
+ eigen_internal_assert(rhs.stride()==1);
384
339
  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
385
340
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
386
-
387
- typedef typename LhsMapper::VectorMapper LhsScalars;
388
-
389
- enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
390
- const Index rowsAtOnce = 4;
391
- const Index peels = 2;
392
- const Index RhsPacketAlignedMask = RhsPacketSize-1;
393
- const Index LhsPacketAlignedMask = LhsPacketSize-1;
394
- const Index depth = cols;
395
- const Index lhsStride = lhs.stride();
396
-
397
- // How many coeffs of the result do we have to skip to be aligned.
398
- // Here we assume data are at least aligned on the base scalar type
399
- // if that's not the case then vectorization is discarded, see below.
400
- Index alignedStart = rhs.firstAligned(depth);
401
- Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
402
- const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
403
-
404
- const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
405
- Index alignmentPattern = alignmentStep==0 ? AllAligned
406
- : alignmentStep==(LhsPacketSize/2) ? EvenAligned
407
- : FirstAligned;
408
-
409
- // we cannot assume the first element is aligned because of sub-matrices
410
- const Index lhsAlignmentOffset = lhs.firstAligned(depth);
411
- const Index rhsAlignmentOffset = rhs.firstAligned(rows);
412
-
413
- // find how many rows do we have to skip to be aligned with rhs (if possible)
414
- Index skipRows = 0;
415
- // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
416
- if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) ||
417
- (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) ||
418
- (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) )
419
- {
420
- alignedSize = 0;
421
- alignedStart = 0;
422
- alignmentPattern = NoneAligned;
423
- }
424
- else if(LhsPacketSize > 4)
425
- {
426
- // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
427
- alignmentPattern = NoneAligned;
428
- }
429
- else if (LhsPacketSize>1)
341
+ conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
342
+ conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
343
+
344
+ // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
345
+ // processing 8 rows at once might be counter productive wrt cache.
346
+ const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7;
347
+ const Index n4 = rows-3;
348
+ const Index n2 = rows-1;
349
+
350
+ // TODO: for padded aligned inputs, we could enable aligned reads
351
+ enum { LhsAlignment = Unaligned,
352
+ ResPacketSize = Traits::ResPacketSize,
353
+ ResPacketSizeHalf = HalfTraits::ResPacketSize,
354
+ ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
355
+ LhsPacketSize = Traits::LhsPacketSize,
356
+ LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
357
+ LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
358
+ HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
359
+ HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
360
+ };
361
+
362
+ Index i=0;
363
+ for(; i<n8; i+=8)
430
364
  {
431
- // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
432
-
433
- while (skipRows<LhsPacketSize &&
434
- alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
435
- ++skipRows;
436
- if (skipRows==LhsPacketSize)
365
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
366
+ c1 = pset1<ResPacket>(ResScalar(0)),
367
+ c2 = pset1<ResPacket>(ResScalar(0)),
368
+ c3 = pset1<ResPacket>(ResScalar(0)),
369
+ c4 = pset1<ResPacket>(ResScalar(0)),
370
+ c5 = pset1<ResPacket>(ResScalar(0)),
371
+ c6 = pset1<ResPacket>(ResScalar(0)),
372
+ c7 = pset1<ResPacket>(ResScalar(0));
373
+
374
+ Index j=0;
375
+ for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
437
376
  {
438
- // nothing can be aligned, no need to skip any column
439
- alignmentPattern = NoneAligned;
440
- skipRows = 0;
377
+ RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
378
+
379
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
380
+ c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
381
+ c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
382
+ c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
383
+ c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+4,j),b0,c4);
384
+ c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+5,j),b0,c5);
385
+ c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+6,j),b0,c6);
386
+ c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+7,j),b0,c7);
441
387
  }
442
- else
388
+ ResScalar cc0 = predux(c0);
389
+ ResScalar cc1 = predux(c1);
390
+ ResScalar cc2 = predux(c2);
391
+ ResScalar cc3 = predux(c3);
392
+ ResScalar cc4 = predux(c4);
393
+ ResScalar cc5 = predux(c5);
394
+ ResScalar cc6 = predux(c6);
395
+ ResScalar cc7 = predux(c7);
396
+ for(; j<cols; ++j)
443
397
  {
444
- skipRows = (std::min)(skipRows,Index(rows));
445
- // note that the skiped columns are processed later.
398
+ RhsScalar b0 = rhs(j,0);
399
+
400
+ cc0 += cj.pmul(lhs(i+0,j), b0);
401
+ cc1 += cj.pmul(lhs(i+1,j), b0);
402
+ cc2 += cj.pmul(lhs(i+2,j), b0);
403
+ cc3 += cj.pmul(lhs(i+3,j), b0);
404
+ cc4 += cj.pmul(lhs(i+4,j), b0);
405
+ cc5 += cj.pmul(lhs(i+5,j), b0);
406
+ cc6 += cj.pmul(lhs(i+6,j), b0);
407
+ cc7 += cj.pmul(lhs(i+7,j), b0);
446
408
  }
447
- /* eigen_internal_assert( alignmentPattern==NoneAligned
448
- || LhsPacketSize==1
449
- || (skipRows + rowsAtOnce >= rows)
450
- || LhsPacketSize > depth
451
- || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/
409
+ res[(i+0)*resIncr] += alpha*cc0;
410
+ res[(i+1)*resIncr] += alpha*cc1;
411
+ res[(i+2)*resIncr] += alpha*cc2;
412
+ res[(i+3)*resIncr] += alpha*cc3;
413
+ res[(i+4)*resIncr] += alpha*cc4;
414
+ res[(i+5)*resIncr] += alpha*cc5;
415
+ res[(i+6)*resIncr] += alpha*cc6;
416
+ res[(i+7)*resIncr] += alpha*cc7;
452
417
  }
453
- else if(Vectorizable)
418
+ for(; i<n4; i+=4)
454
419
  {
455
- alignedStart = 0;
456
- alignedSize = depth;
457
- alignmentPattern = AllAligned;
458
- }
459
-
460
- const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
461
- const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;
420
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
421
+ c1 = pset1<ResPacket>(ResScalar(0)),
422
+ c2 = pset1<ResPacket>(ResScalar(0)),
423
+ c3 = pset1<ResPacket>(ResScalar(0));
462
424
 
463
- Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
464
- for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
465
- {
466
- // FIXME: what is the purpose of this EIGEN_ALIGN_DEFAULT ??
467
- EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
468
- ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
469
-
470
- // this helps the compiler generating good binary code
471
- const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0), lhs1 = lhs.getVectorMapper(i+offset1, 0),
472
- lhs2 = lhs.getVectorMapper(i+2, 0), lhs3 = lhs.getVectorMapper(i+offset3, 0);
425
+ Index j=0;
426
+ for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
427
+ {
428
+ RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
473
429
 
474
- if (Vectorizable)
430
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
431
+ c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
432
+ c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
433
+ c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
434
+ }
435
+ ResScalar cc0 = predux(c0);
436
+ ResScalar cc1 = predux(c1);
437
+ ResScalar cc2 = predux(c2);
438
+ ResScalar cc3 = predux(c3);
439
+ for(; j<cols; ++j)
475
440
  {
476
- /* explicit vectorization */
477
- ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
478
- ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));
441
+ RhsScalar b0 = rhs(j,0);
479
442
 
480
- // process initial unaligned coeffs
481
- // FIXME this loop get vectorized by the compiler !
482
- for (Index j=0; j<alignedStart; ++j)
483
- {
484
- RhsScalar b = rhs(j, 0);
485
- tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
486
- tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
487
- }
443
+ cc0 += cj.pmul(lhs(i+0,j), b0);
444
+ cc1 += cj.pmul(lhs(i+1,j), b0);
445
+ cc2 += cj.pmul(lhs(i+2,j), b0);
446
+ cc3 += cj.pmul(lhs(i+3,j), b0);
447
+ }
448
+ res[(i+0)*resIncr] += alpha*cc0;
449
+ res[(i+1)*resIncr] += alpha*cc1;
450
+ res[(i+2)*resIncr] += alpha*cc2;
451
+ res[(i+3)*resIncr] += alpha*cc3;
452
+ }
453
+ for(; i<n2; i+=2)
454
+ {
455
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
456
+ c1 = pset1<ResPacket>(ResScalar(0));
488
457
 
489
- if (alignedSize>alignedStart)
490
- {
491
- switch(alignmentPattern)
492
- {
493
- case AllAligned:
494
- for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
495
- _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
496
- break;
497
- case EvenAligned:
498
- for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
499
- _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
500
- break;
501
- case FirstAligned:
502
- {
503
- Index j = alignedStart;
504
- if (peels>1)
505
- {
506
- /* Here we proccess 4 rows with with two peeled iterations to hide
507
- * the overhead of unaligned loads. Moreover unaligned loads are handled
508
- * using special shift/move operations between the two aligned packets
509
- * overlaping the desired unaligned packet. This is *much* more efficient
510
- * than basic unaligned loads.
511
- */
512
- LhsPacket A01, A02, A03, A11, A12, A13;
513
- A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
514
- A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
515
- A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
516
-
517
- for (; j<peeledSize; j+=peels*RhsPacketSize)
518
- {
519
- RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);
520
- A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
521
- A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
522
- A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
523
-
524
- ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0);
525
- ptmp1 = pcj.pmadd(A01, b, ptmp1);
526
- A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
527
- ptmp2 = pcj.pmadd(A02, b, ptmp2);
528
- A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
529
- ptmp3 = pcj.pmadd(A03, b, ptmp3);
530
- A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
531
-
532
- b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0);
533
- ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0);
534
- ptmp1 = pcj.pmadd(A11, b, ptmp1);
535
- ptmp2 = pcj.pmadd(A12, b, ptmp2);
536
- ptmp3 = pcj.pmadd(A13, b, ptmp3);
537
- }
538
- }
539
- for (; j<alignedSize; j+=RhsPacketSize)
540
- _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
541
- break;
542
- }
543
- default:
544
- for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
545
- _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
546
- break;
547
- }
548
- tmp0 += predux(ptmp0);
549
- tmp1 += predux(ptmp1);
550
- tmp2 += predux(ptmp2);
551
- tmp3 += predux(ptmp3);
552
- }
553
- } // end explicit vectorization
458
+ Index j=0;
459
+ for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
460
+ {
461
+ RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
554
462
 
555
- // process remaining coeffs (or all if no explicit vectorization)
556
- // FIXME this loop get vectorized by the compiler !
557
- for (Index j=alignedSize; j<depth; ++j)
463
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
464
+ c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
465
+ }
466
+ ResScalar cc0 = predux(c0);
467
+ ResScalar cc1 = predux(c1);
468
+ for(; j<cols; ++j)
558
469
  {
559
- RhsScalar b = rhs(j, 0);
560
- tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
561
- tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
470
+ RhsScalar b0 = rhs(j,0);
471
+
472
+ cc0 += cj.pmul(lhs(i+0,j), b0);
473
+ cc1 += cj.pmul(lhs(i+1,j), b0);
562
474
  }
563
- res[i*resIncr] += alpha*tmp0;
564
- res[(i+offset1)*resIncr] += alpha*tmp1;
565
- res[(i+2)*resIncr] += alpha*tmp2;
566
- res[(i+offset3)*resIncr] += alpha*tmp3;
475
+ res[(i+0)*resIncr] += alpha*cc0;
476
+ res[(i+1)*resIncr] += alpha*cc1;
567
477
  }
568
-
569
- // process remaining first and last rows (at most columnsAtOnce-1)
570
- Index end = rows;
571
- Index start = rowBound;
572
- do
478
+ for(; i<rows; ++i)
573
479
  {
574
- for (Index i=start; i<end; ++i)
480
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0));
481
+ ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
482
+ ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
483
+ Index j=0;
484
+ for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
575
485
  {
576
- EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
577
- ResPacket ptmp0 = pset1<ResPacket>(tmp0);
578
- const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
579
- // process first unaligned result's coeffs
580
- // FIXME this loop get vectorized by the compiler !
581
- for (Index j=0; j<alignedStart; ++j)
582
- tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
583
-
584
- if (alignedSize>alignedStart)
585
- {
586
- // process aligned rhs coeffs
587
- if (lhs0.template aligned<LhsPacket>(alignedStart))
588
- for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
589
- ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
590
- else
591
- for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
592
- ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
593
- tmp0 += predux(ptmp0);
594
- }
595
-
596
- // process remaining scalars
597
- // FIXME this loop get vectorized by the compiler !
598
- for (Index j=alignedSize; j<depth; ++j)
599
- tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
600
- res[i*resIncr] += alpha*tmp0;
486
+ RhsPacket b0 = rhs.template load<RhsPacket,Unaligned>(j,0);
487
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i,j),b0,c0);
601
488
  }
602
- if (skipRows)
489
+ ResScalar cc0 = predux(c0);
490
+ if (HasHalf) {
491
+ for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf)
492
+ {
493
+ RhsPacketHalf b0 = rhs.template load<RhsPacketHalf,Unaligned>(j,0);
494
+ c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i,j),b0,c0_h);
495
+ }
496
+ cc0 += predux(c0_h);
497
+ }
498
+ if (HasQuarter) {
499
+ for(; j+LhsPacketSizeQuarter<=cols; j+=LhsPacketSizeQuarter)
500
+ {
501
+ RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter,Unaligned>(j,0);
502
+ c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i,j),b0,c0_q);
503
+ }
504
+ cc0 += predux(c0_q);
505
+ }
506
+ for(; j<cols; ++j)
603
507
  {
604
- start = 0;
605
- end = skipRows;
606
- skipRows = 0;
508
+ cc0 += cj.pmul(lhs(i,j), rhs(j,0));
607
509
  }
608
- else
609
- break;
610
- } while(Vectorizable);
611
-
612
- #undef _EIGEN_ACCUMULATE_PACKETS
510
+ res[i*resIncr] += alpha*cc0;
511
+ }
613
512
  }
614
513
 
615
514
  } // end namespace internal