@smake/eigen 1.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (435) hide show
  1. package/README.md +1 -1
  2. package/eigen/Eigen/AccelerateSupport +52 -0
  3. package/eigen/Eigen/Cholesky +18 -21
  4. package/eigen/Eigen/CholmodSupport +28 -28
  5. package/eigen/Eigen/Core +235 -326
  6. package/eigen/Eigen/Eigenvalues +16 -14
  7. package/eigen/Eigen/Geometry +21 -24
  8. package/eigen/Eigen/Householder +9 -8
  9. package/eigen/Eigen/IterativeLinearSolvers +8 -4
  10. package/eigen/Eigen/Jacobi +14 -14
  11. package/eigen/Eigen/KLUSupport +43 -0
  12. package/eigen/Eigen/LU +16 -20
  13. package/eigen/Eigen/MetisSupport +12 -12
  14. package/eigen/Eigen/OrderingMethods +54 -54
  15. package/eigen/Eigen/PaStiXSupport +23 -20
  16. package/eigen/Eigen/PardisoSupport +17 -14
  17. package/eigen/Eigen/QR +18 -21
  18. package/eigen/Eigen/QtAlignedMalloc +5 -13
  19. package/eigen/Eigen/SPQRSupport +21 -14
  20. package/eigen/Eigen/SVD +23 -18
  21. package/eigen/Eigen/Sparse +1 -4
  22. package/eigen/Eigen/SparseCholesky +18 -23
  23. package/eigen/Eigen/SparseCore +18 -17
  24. package/eigen/Eigen/SparseLU +12 -8
  25. package/eigen/Eigen/SparseQR +16 -14
  26. package/eigen/Eigen/StdDeque +5 -2
  27. package/eigen/Eigen/StdList +5 -2
  28. package/eigen/Eigen/StdVector +5 -2
  29. package/eigen/Eigen/SuperLUSupport +30 -24
  30. package/eigen/Eigen/ThreadPool +80 -0
  31. package/eigen/Eigen/UmfPackSupport +19 -17
  32. package/eigen/Eigen/Version +14 -0
  33. package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
  34. package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
  35. package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
  36. package/eigen/Eigen/src/Cholesky/LDLT.h +377 -401
  37. package/eigen/Eigen/src/Cholesky/LLT.h +332 -360
  38. package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
  39. package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +620 -521
  40. package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
  41. package/eigen/Eigen/src/Core/ArithmeticSequence.h +239 -0
  42. package/eigen/Eigen/src/Core/Array.h +341 -294
  43. package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
  44. package/eigen/Eigen/src/Core/ArrayWrapper.h +127 -171
  45. package/eigen/Eigen/src/Core/Assign.h +30 -40
  46. package/eigen/Eigen/src/Core/AssignEvaluator.h +711 -589
  47. package/eigen/Eigen/src/Core/Assign_MKL.h +130 -125
  48. package/eigen/Eigen/src/Core/BandMatrix.h +268 -283
  49. package/eigen/Eigen/src/Core/Block.h +375 -398
  50. package/eigen/Eigen/src/Core/CommaInitializer.h +86 -97
  51. package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
  52. package/eigen/Eigen/src/Core/CoreEvaluators.h +1356 -1026
  53. package/eigen/Eigen/src/Core/CoreIterators.h +73 -59
  54. package/eigen/Eigen/src/Core/CwiseBinaryOp.h +114 -132
  55. package/eigen/Eigen/src/Core/CwiseNullaryOp.h +726 -617
  56. package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
  57. package/eigen/Eigen/src/Core/CwiseUnaryOp.h +56 -68
  58. package/eigen/Eigen/src/Core/CwiseUnaryView.h +132 -95
  59. package/eigen/Eigen/src/Core/DenseBase.h +632 -571
  60. package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -624
  61. package/eigen/Eigen/src/Core/DenseStorage.h +512 -509
  62. package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
  63. package/eigen/Eigen/src/Core/Diagonal.h +169 -210
  64. package/eigen/Eigen/src/Core/DiagonalMatrix.h +351 -274
  65. package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
  66. package/eigen/Eigen/src/Core/Dot.h +172 -222
  67. package/eigen/Eigen/src/Core/EigenBase.h +75 -85
  68. package/eigen/Eigen/src/Core/Fill.h +138 -0
  69. package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
  70. package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -109
  71. package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
  72. package/eigen/Eigen/src/Core/GeneralProduct.h +327 -263
  73. package/eigen/Eigen/src/Core/GenericPacketMath.h +1472 -360
  74. package/eigen/Eigen/src/Core/GlobalFunctions.h +194 -151
  75. package/eigen/Eigen/src/Core/IO.h +147 -139
  76. package/eigen/Eigen/src/Core/IndexedView.h +321 -0
  77. package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
  78. package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
  79. package/eigen/Eigen/src/Core/Inverse.h +56 -66
  80. package/eigen/Eigen/src/Core/Map.h +124 -142
  81. package/eigen/Eigen/src/Core/MapBase.h +256 -281
  82. package/eigen/Eigen/src/Core/MathFunctions.h +1620 -938
  83. package/eigen/Eigen/src/Core/MathFunctionsImpl.h +233 -71
  84. package/eigen/Eigen/src/Core/Matrix.h +491 -416
  85. package/eigen/Eigen/src/Core/MatrixBase.h +468 -453
  86. package/eigen/Eigen/src/Core/NestByValue.h +66 -85
  87. package/eigen/Eigen/src/Core/NoAlias.h +79 -85
  88. package/eigen/Eigen/src/Core/NumTraits.h +235 -148
  89. package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +253 -0
  90. package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
  91. package/eigen/Eigen/src/Core/PlainObjectBase.h +871 -894
  92. package/eigen/Eigen/src/Core/Product.h +260 -139
  93. package/eigen/Eigen/src/Core/ProductEvaluators.h +863 -714
  94. package/eigen/Eigen/src/Core/Random.h +161 -136
  95. package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
  96. package/eigen/Eigen/src/Core/RealView.h +250 -0
  97. package/eigen/Eigen/src/Core/Redux.h +366 -336
  98. package/eigen/Eigen/src/Core/Ref.h +308 -209
  99. package/eigen/Eigen/src/Core/Replicate.h +94 -106
  100. package/eigen/Eigen/src/Core/Reshaped.h +398 -0
  101. package/eigen/Eigen/src/Core/ReturnByValue.h +49 -55
  102. package/eigen/Eigen/src/Core/Reverse.h +136 -145
  103. package/eigen/Eigen/src/Core/Select.h +70 -140
  104. package/eigen/Eigen/src/Core/SelfAdjointView.h +262 -285
  105. package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
  106. package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
  107. package/eigen/Eigen/src/Core/Solve.h +97 -111
  108. package/eigen/Eigen/src/Core/SolveTriangular.h +131 -129
  109. package/eigen/Eigen/src/Core/SolverBase.h +138 -101
  110. package/eigen/Eigen/src/Core/StableNorm.h +156 -160
  111. package/eigen/Eigen/src/Core/StlIterators.h +619 -0
  112. package/eigen/Eigen/src/Core/Stride.h +91 -88
  113. package/eigen/Eigen/src/Core/Swap.h +70 -38
  114. package/eigen/Eigen/src/Core/Transpose.h +295 -273
  115. package/eigen/Eigen/src/Core/Transpositions.h +272 -317
  116. package/eigen/Eigen/src/Core/TriangularMatrix.h +670 -755
  117. package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
  118. package/eigen/Eigen/src/Core/VectorwiseOp.h +668 -630
  119. package/eigen/Eigen/src/Core/Visitor.h +480 -216
  120. package/eigen/Eigen/src/Core/arch/AVX/Complex.h +407 -293
  121. package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +79 -388
  122. package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2935 -491
  123. package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
  124. package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +279 -22
  125. package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +472 -0
  126. package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
  127. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +85 -333
  128. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
  129. package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +2490 -649
  130. package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
  131. package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
  132. package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
  133. package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
  134. package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +277 -0
  135. package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
  136. package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +521 -298
  137. package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +39 -280
  138. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +3686 -0
  139. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +205 -0
  140. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +901 -0
  141. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
  142. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
  143. package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +3391 -723
  144. package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
  145. package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +866 -0
  146. package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +113 -14
  147. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +2634 -0
  148. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +227 -0
  149. package/eigen/Eigen/src/Core/arch/Default/Half.h +1091 -0
  150. package/eigen/Eigen/src/Core/arch/Default/Settings.h +11 -13
  151. package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
  152. package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +104 -0
  153. package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1712 -0
  154. package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
  155. package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +77 -0
  156. package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  157. package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
  158. package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
  159. package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
  160. package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
  161. package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
  162. package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
  163. package/eigen/Eigen/src/Core/arch/MSA/Complex.h +620 -0
  164. package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +379 -0
  165. package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1237 -0
  166. package/eigen/Eigen/src/Core/arch/NEON/Complex.h +531 -289
  167. package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +243 -0
  168. package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +50 -73
  169. package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +5915 -579
  170. package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1642 -0
  171. package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
  172. package/eigen/Eigen/src/Core/arch/SSE/Complex.h +366 -334
  173. package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +40 -514
  174. package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +2164 -675
  175. package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
  176. package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +188 -35
  177. package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +48 -0
  178. package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +674 -0
  179. package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +52 -0
  180. package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +227 -0
  181. package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +303 -0
  182. package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +576 -0
  183. package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +83 -0
  184. package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +434 -261
  185. package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +160 -53
  186. package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +1073 -605
  187. package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +123 -117
  188. package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +594 -322
  189. package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +204 -118
  190. package/eigen/Eigen/src/Core/functors/StlFunctors.h +110 -97
  191. package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
  192. package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1158 -530
  193. package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2329 -1333
  194. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +328 -364
  195. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +191 -178
  196. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +85 -82
  197. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
  198. package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +396 -542
  199. package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
  200. package/eigen/Eigen/src/Core/products/Parallelizer.h +208 -92
  201. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +331 -375
  202. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
  203. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +139 -146
  204. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
  205. package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
  206. package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -46
  207. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
  208. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
  209. package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
  210. package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
  211. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -275
  212. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
  213. package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +70 -93
  214. package/eigen/Eigen/src/Core/util/Assert.h +158 -0
  215. package/eigen/Eigen/src/Core/util/BlasUtil.h +413 -290
  216. package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +543 -0
  217. package/eigen/Eigen/src/Core/util/Constants.h +314 -263
  218. package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -78
  219. package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
  220. package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +450 -224
  221. package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
  222. package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
  223. package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +487 -0
  224. package/eigen/Eigen/src/Core/util/IntegralConstant.h +279 -0
  225. package/eigen/Eigen/src/Core/util/MKL_support.h +39 -30
  226. package/eigen/Eigen/src/Core/util/Macros.h +939 -646
  227. package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
  228. package/eigen/Eigen/src/Core/util/Memory.h +1042 -650
  229. package/eigen/Eigen/src/Core/util/Meta.h +618 -426
  230. package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
  231. package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
  232. package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  233. package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
  234. package/eigen/Eigen/src/Core/util/StaticAssert.h +51 -164
  235. package/eigen/Eigen/src/Core/util/SymbolicIndex.h +445 -0
  236. package/eigen/Eigen/src/Core/util/XprHelper.h +793 -538
  237. package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
  238. package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
  239. package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
  240. package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
  241. package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
  242. package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
  243. package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
  244. package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
  245. package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +91 -107
  246. package/eigen/Eigen/src/Eigenvalues/RealQZ.h +539 -606
  247. package/eigen/Eigen/src/Eigenvalues/RealSchur.h +348 -382
  248. package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
  249. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +579 -600
  250. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
  251. package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +434 -461
  252. package/eigen/Eigen/src/Geometry/AlignedBox.h +307 -214
  253. package/eigen/Eigen/src/Geometry/AngleAxis.h +135 -137
  254. package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
  255. package/eigen/Eigen/src/Geometry/Homogeneous.h +289 -333
  256. package/eigen/Eigen/src/Geometry/Hyperplane.h +152 -161
  257. package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
  258. package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -145
  259. package/eigen/Eigen/src/Geometry/ParametrizedLine.h +141 -104
  260. package/eigen/Eigen/src/Geometry/Quaternion.h +595 -497
  261. package/eigen/Eigen/src/Geometry/Rotation2D.h +110 -108
  262. package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
  263. package/eigen/Eigen/src/Geometry/Scaling.h +115 -90
  264. package/eigen/Eigen/src/Geometry/Transform.h +896 -953
  265. package/eigen/Eigen/src/Geometry/Translation.h +100 -98
  266. package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
  267. package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +154 -0
  268. package/eigen/Eigen/src/Householder/BlockHouseholder.h +54 -42
  269. package/eigen/Eigen/src/Householder/Householder.h +104 -122
  270. package/eigen/Eigen/src/Householder/HouseholderSequence.h +416 -382
  271. package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
  272. package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +153 -166
  273. package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +127 -138
  274. package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +95 -124
  275. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +269 -267
  276. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +246 -259
  277. package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
  278. package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +218 -217
  279. package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +80 -103
  280. package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +59 -63
  281. package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
  282. package/eigen/Eigen/src/Jacobi/Jacobi.h +256 -291
  283. package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
  284. package/eigen/Eigen/src/KLUSupport/KLUSupport.h +339 -0
  285. package/eigen/Eigen/src/LU/Determinant.h +60 -63
  286. package/eigen/Eigen/src/LU/FullPivLU.h +561 -626
  287. package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
  288. package/eigen/Eigen/src/LU/InverseImpl.h +213 -275
  289. package/eigen/Eigen/src/LU/PartialPivLU.h +407 -435
  290. package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
  291. package/eigen/Eigen/src/LU/arch/InverseSize4.h +353 -0
  292. package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
  293. package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
  294. package/eigen/Eigen/src/OrderingMethods/Amd.h +250 -282
  295. package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +950 -1103
  296. package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
  297. package/eigen/Eigen/src/OrderingMethods/Ordering.h +111 -122
  298. package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
  299. package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
  300. package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
  301. package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -429
  302. package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +494 -473
  303. package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
  304. package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +223 -137
  305. package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +517 -460
  306. package/eigen/Eigen/src/QR/HouseholderQR.h +412 -278
  307. package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
  308. package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
  309. package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
  310. package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +263 -261
  311. package/eigen/Eigen/src/SVD/BDCSVD.h +872 -679
  312. package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
  313. package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
  314. package/eigen/Eigen/src/SVD/JacobiSVD.h +585 -543
  315. package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
  316. package/eigen/Eigen/src/SVD/SVDBase.h +281 -160
  317. package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +202 -237
  318. package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
  319. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +769 -590
  320. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +318 -129
  321. package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
  322. package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -236
  323. package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +140 -184
  324. package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
  325. package/eigen/Eigen/src/SparseCore/SparseAssign.h +174 -111
  326. package/eigen/Eigen/src/SparseCore/SparseBlock.h +408 -477
  327. package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
  328. package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +531 -280
  329. package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +559 -347
  330. package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
  331. package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +185 -191
  332. package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
  333. package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
  334. package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
  335. package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
  336. package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1614 -1142
  337. package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -357
  338. package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
  339. package/eigen/Eigen/src/SparseCore/SparseProduct.h +100 -91
  340. package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
  341. package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
  342. package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +371 -414
  343. package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
  344. package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
  345. package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
  346. package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
  347. package/eigen/Eigen/src/SparseCore/SparseUtil.h +146 -115
  348. package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
  349. package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
  350. package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
  351. package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
  352. package/eigen/Eigen/src/SparseLU/SparseLU.h +814 -618
  353. package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
  354. package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
  355. package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
  356. package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +273 -255
  357. package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
  358. package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
  359. package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +90 -101
  360. package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
  361. package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
  362. package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
  363. package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +125 -133
  364. package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
  365. package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
  366. package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
  367. package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
  368. package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
  369. package/eigen/Eigen/src/SparseQR/SparseQR.h +451 -490
  370. package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -105
  371. package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
  372. package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
  373. package/eigen/Eigen/src/StlSupport/details.h +48 -50
  374. package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
  375. package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -732
  376. package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
  377. package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
  378. package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
  379. package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
  380. package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
  381. package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
  382. package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
  383. package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
  384. package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
  385. package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
  386. package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
  387. package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
  388. package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
  389. package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +480 -380
  390. package/eigen/Eigen/src/misc/Image.h +41 -43
  391. package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
  392. package/eigen/Eigen/src/misc/Kernel.h +39 -41
  393. package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
  394. package/eigen/Eigen/src/misc/blas.h +83 -426
  395. package/eigen/Eigen/src/misc/lapacke.h +9976 -16182
  396. package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
  397. package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
  398. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
  399. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
  400. package/eigen/Eigen/src/plugins/BlockMethods.inc +1370 -0
  401. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
  402. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.inc +167 -0
  403. package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
  404. package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
  405. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
  406. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
  407. package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
  408. package/lib/LibEigen.d.ts +4 -0
  409. package/lib/LibEigen.js +14 -0
  410. package/lib/index.d.ts +1 -1
  411. package/lib/index.js +7 -3
  412. package/package.json +2 -10
  413. package/eigen/Eigen/CMakeLists.txt +0 -19
  414. package/eigen/Eigen/src/Core/BooleanRedux.h +0 -164
  415. package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -103
  416. package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
  417. package/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +0 -91
  418. package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
  419. package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
  420. package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
  421. package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
  422. package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
  423. package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
  424. package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
  425. package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
  426. package/eigen/Eigen/src/misc/lapack.h +0 -152
  427. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -332
  428. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -552
  429. package/eigen/Eigen/src/plugins/BlockMethods.h +0 -1058
  430. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
  431. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -163
  432. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
  433. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -85
  434. package/lib/eigen.d.ts +0 -2
  435. package/lib/eigen.js +0 -15
@@ -0,0 +1,901 @@
1
+ // This file is part of Eigen, a lightweight C++ template library
2
+ // for linear algebra.
3
+ //
4
+ // Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com)
5
+ // Copyright (C) 2021 Chip Kerchner (chip.kerchner@ibm.com)
6
+ //
7
+ // This Source Code Form is subject to the terms of the Mozilla
8
+ // Public License v. 2.0. If a copy of the MPL was not distributed
9
+ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10
+
11
+ #ifndef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
12
+ #define EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
13
+
14
+ // If using dynamic dispatch, set the CPU target.
15
+ #if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
16
+ #pragma GCC push_options
17
+ #pragma GCC target("cpu=power10,htm")
18
+ #endif
19
+
20
+ #ifdef __has_builtin
21
+ #if !__has_builtin(__builtin_vsx_assemble_pair)
22
+ #define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
23
+ #endif
24
+ #if !__has_builtin(__builtin_vsx_disassemble_pair)
25
+ #define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair
26
+ #endif
27
+ #endif
28
+
29
+ // IWYU pragma: private
30
+ #include "../../InternalHeaderCheck.h"
31
+
32
+ #include "MatrixProductMMAbfloat16.h"
33
+
34
+ namespace Eigen {
35
+
36
+ namespace internal {
37
+
38
+ #define accColsC (accCols / 2)
39
+
40
+ EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc) { __builtin_mma_xxsetaccz(acc); }
41
+
42
+ template <typename DataMapper, typename Packet, bool full>
43
+ EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, const Index elements,
44
+ __vector_quad* acc) {
45
+ PacketBlock<Packet, 4> result;
46
+ __builtin_mma_disassemble_acc(&result.packet, acc);
47
+
48
+ PacketBlock<Packet, 4> tRes;
49
+ if (full) {
50
+ EIGEN_UNUSED_VARIABLE(elements);
51
+ bload<DataMapper, Packet, 0, ColMajor, false, 4>(tRes, data, i, 0);
52
+ bscale<Packet, 4>(tRes, result, alpha);
53
+ bstore<DataMapper, Packet, 4>(tRes, data, i);
54
+ } else {
55
+ bload_partial<DataMapper, Packet, 0, false, 4>(tRes, data, i, elements);
56
+ bscale<Packet, 4>(tRes, result, alpha);
57
+ bstore_partial<DataMapper, Packet, 4>(tRes, data, i, elements);
58
+ }
59
+ }
60
+
61
+ template <typename DataMapper, typename Packet, typename Packetc, const Index accCols, const Index accCols2>
62
+ EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data, const Packet& alphaReal,
63
+ const Packet& alphaImag, const Packet& pMask, __vector_quad* accReal,
64
+ __vector_quad* accImag) {
65
+ constexpr bool full = (accCols2 > accColsC);
66
+ PacketBlock<Packet, 4> resultReal, resultImag;
67
+ __builtin_mma_disassemble_acc(&resultReal.packet, accReal);
68
+ __builtin_mma_disassemble_acc(&resultImag.packet, accImag);
69
+
70
+ PacketBlock<Packetc, 8> tRes;
71
+ bload<DataMapper, Packetc, accColsC, ColMajor, true, 4, full>(tRes, data, i, 0);
72
+
73
+ PacketBlock<Packet, 4> taccReal, taccImag;
74
+ bscalec<Packet, 4, (accCols != accCols2)>(resultReal, resultImag, alphaReal, alphaImag, taccReal, taccImag, pMask);
75
+
76
+ PacketBlock<Packetc, 4> acc1, acc2;
77
+ bcouple<Packet, Packetc, 4, full>(taccReal, taccImag, tRes, acc1, acc2);
78
+
79
+ bstore<DataMapper, Packetc, 4>(acc1, data, i);
80
+ if (full) {
81
+ bstore<DataMapper, Packetc, 4>(acc2, data, i + accColsC);
82
+ }
83
+ }
84
+
85
+ // Defaults to float32; since Eigen still supports C++03, we can't use default template arguments.
86
+ template <typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
87
+ EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b) {
88
+ if (NegativeAccumulate) {
89
+ __builtin_mma_xvf32gernp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
90
+ } else {
91
+ __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
92
+ }
93
+ }
94
+
95
+ template <typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
96
+ EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b) {
97
+ if (NegativeAccumulate) {
98
+ __builtin_mma_xvf64gernp(acc, (__vector_pair)a, (__vector unsigned char)b);
99
+ } else {
100
+ __builtin_mma_xvf64gerpp(acc, (__vector_pair)a, (__vector unsigned char)b);
101
+ }
102
+ }
103
+
104
+ template <typename Packet, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
105
+ EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, Packet& lhsVi,
106
+ const RhsPacket& rhsV, RhsPacket& rhsVi) {
107
+ pgerMMA<Packet, RhsPacket, false>(accReal, rhsV, lhsV);
108
+ if (LhsIsReal) {
109
+ pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi, lhsV);
110
+ EIGEN_UNUSED_VARIABLE(lhsVi);
111
+ } else {
112
+ if (!RhsIsReal) {
113
+ pgerMMA<Packet, RhsPacket, ConjugateLhs == ConjugateRhs>(accReal, rhsVi, lhsVi);
114
+ pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi, lhsV);
115
+ } else {
116
+ EIGEN_UNUSED_VARIABLE(rhsVi);
117
+ }
118
+ pgerMMA<Packet, RhsPacket, ConjugateLhs>(accImag, rhsV, lhsVi);
119
+ }
120
+ }
121
+
122
+ // This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled.
123
+ template <typename Packet>
124
+ EIGEN_ALWAYS_INLINE Packet ploadRhs(const __UNPACK_TYPE__(Packet) * rhs) {
125
+ return ploadu<Packet>(rhs);
126
+ }
127
+
128
+ template <typename Scalar, typename Packet>
129
+ EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV) {
130
+ rhsV = ploadRhs<Packet>(rhs);
131
+ }
132
+
133
+ template <>
134
+ EIGEN_ALWAYS_INLINE void ploadRhsMMA(const double* rhs, __vector_pair& rhsV) {
135
+ #if EIGEN_COMP_LLVM
136
+ __builtin_vsx_assemble_pair(
137
+ &rhsV, reinterpret_cast<__vector unsigned char>(ploadRhs<Packet2d>(rhs + (sizeof(Packet2d) / sizeof(double)))),
138
+ reinterpret_cast<__vector unsigned char>(ploadRhs<Packet2d>(rhs)));
139
+ #else
140
+ rhsV = *reinterpret_cast<__vector_pair*>(const_cast<double*>(rhs));
141
+ #endif
142
+ }
143
+
144
+ EIGEN_ALWAYS_INLINE void ploadLhsMMA(const double* lhs, __vector_pair& lhsV) { ploadRhsMMA(lhs, lhsV); }
145
+
146
+ #define GEMM_MULTIPLE_COLS
147
+
148
+ // Vector-pair LHS loads are disabled in GCC until unnecessary register moves are fixed.
149
+ // #if (EIGEN_COMP_LLVM || (__GNUC__ >= 11))
150
+ #if EIGEN_COMP_LLVM
151
+ #define VECTOR_PAIR_LOADS_LHS
152
+ #endif
153
+
154
+ // PEEL_MMA loop factor.
155
+ #ifdef GEMM_MULTIPLE_COLS
156
+ #define PEEL_MMA 8
157
+ #else
158
+ // GCC 12+ suffers register spillage here, so it gets a smaller peel factor (unless vector-pair LHS loads are on).
159
+ #if EIGEN_COMP_LLVM || (__GNUC__ < 12) || defined(VECTOR_PAIR_LOADS_LHS)
160
+ #define PEEL_MMA 7
161
+ #else
162
+ #define PEEL_MMA 6
163
+ #endif
164
+ #endif
165
+
166
+ #define MICRO_MMA_UNROLL(func) func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
167
+
168
+ #define MICRO_MMA_WORK(func, type, peel) \
169
+ if (accItr == 1) { \
170
+ func(0, type, peel, 0, 0) func(1, type, peel, 1, 0) func(2, type, peel, 2, 0) func(3, type, peel, 3, 0) \
171
+ func(4, type, peel, 4, 0) func(5, type, peel, 5, 0) func(6, type, peel, 6, 0) func(7, type, peel, 7, 0) \
172
+ } else if (accItr == 2) { \
173
+ func(0, type, peel, 0, 0) func(1, type, peel, 0, 1) func(2, type, peel, 1, 0) func(3, type, peel, 1, 1) \
174
+ func(4, type, peel, 2, 0) func(5, type, peel, 2, 1) func(6, type, peel, 3, 0) func(7, type, peel, 3, 1) \
175
+ } else { \
176
+ func(0, type, peel, 0, 0) func(1, type, peel, 0, 1) func(2, type, peel, 0, 2) func(3, type, peel, 0, 3) \
177
+ func(4, type, peel, 1, 0) func(5, type, peel, 1, 1) func(6, type, peel, 1, 2) func(7, type, peel, 1, 3) \
178
+ }
179
+
180
+ #define MICRO_MMA_WORK_ONE(iter, type, peel, left, right) \
181
+ if (unroll_factor > left) { \
182
+ pgerMMA<Packet, type, false>(&accZero##iter, rhsV##right[peel], lhsV##left); \
183
+ }
184
+
185
+ #ifdef VECTOR_PAIR_LOADS_LHS
186
+ #define MICRO_MMA_WORK_TWO(iter, type, peel, left, right) \
187
+ if (unroll_factor > left) { \
188
+ pgerMMA<Packet, type, false>(&accZero##iter, rhsV##right[peel], lhsV2##left.packet[peel & 1]); \
189
+ }
190
+
191
+ #define MICRO_MMA_LOAD1_TWO(lhs_ptr, left) \
192
+ if (unroll_factor > left) { \
193
+ if (MICRO_NORMAL(left)) { \
194
+ ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr##left), plhsV##left); \
195
+ __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&lhsV2##left.packet), &plhsV##left); \
196
+ lhs_ptr##left += accCols * 2; \
197
+ } else { \
198
+ lhsV2##left.packet[0] = ploadLhs<Packet>(lhs_ptr##left); \
199
+ lhsV2##left.packet[1] = ploadLhs<Packet>(lhs_ptr##left + accCols2); \
200
+ lhs_ptr##left += accCols2 * 2; \
201
+ EIGEN_UNUSED_VARIABLE(plhsV##left); \
202
+ } \
203
+ } else { \
204
+ EIGEN_UNUSED_VARIABLE(lhsV2##left); \
205
+ EIGEN_UNUSED_VARIABLE(plhsV##left); \
206
+ }
207
+
208
+ #define MICRO_MMA_LOAD_TWO(left) MICRO_MMA_LOAD1_TWO(lhs_ptr, left)
209
+ #endif
210
+
211
+ #define MICRO_MMA_UNROLL_ITER(func, val) \
212
+ func(val, 0) if (accItr > 1) { \
213
+ func(val, 1) if (accItr > 2) { func(val, 2) func(val, 3) } \
214
+ }
215
+
216
+ #define MICRO_MMA_LOAD_ONE_RHS1(peel, right) ploadRhsMMA(rhs_ptr##right + (accRows * peel), rhsV##right[peel]);
217
+
218
+ #define MICRO_MMA_LOAD_ONE_RHS(peel) MICRO_MMA_UNROLL_ITER(MICRO_MMA_LOAD_ONE_RHS1, peel)
219
+
220
+ #define MICRO_MMA_TYPE_PEEL(funcw, funcl, type, peel) \
221
+ if (PEEL_MMA > peel) { \
222
+ Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
223
+ MICRO_MMA_LOAD_ONE_RHS(peel) \
224
+ MICRO_MMA_UNROLL(funcl) \
225
+ MICRO_MMA_WORK(funcw, type, peel) \
226
+ }
227
+
228
+ #ifndef VECTOR_PAIR_LOADS_LHS
229
+ #define MICRO_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type) \
230
+ type rhsV0[8], rhsV1[(accItr > 1) ? 8 : 1], rhsV2[(accItr > 2) ? 8 : 1], rhsV3[(accItr > 2) ? 8 : 1]; \
231
+ MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 0) \
232
+ MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 1) \
233
+ MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 2) \
234
+ MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 3) \
235
+ MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 4) \
236
+ MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 5) \
237
+ MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 6) MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 7)
238
+ #else
239
+ #define MICRO_MMA_LOAD_TWO_RHS(peel1, right) \
240
+ ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr##right + (accRows * peel1)), prhsV##peel1); \
241
+ __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsV##right[peel1]), &prhsV##peel1);
242
+
243
+ #define MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2) \
244
+ if (PEEL_MMA > peel2) { \
245
+ PacketBlock<Packet, 2> lhsV20, lhsV21, lhsV22, lhsV23, lhsV24, lhsV25, lhsV26, lhsV27; \
246
+ __vector_pair plhsV0, plhsV1, plhsV2, plhsV3, plhsV4, plhsV5, plhsV6, plhsV7; \
247
+ if (sizeof(type) == 16) { \
248
+ MICRO_MMA_UNROLL_ITER(MICRO_MMA_LOAD_TWO_RHS, peel1) \
249
+ } else { \
250
+ EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
251
+ MICRO_MMA_LOAD_ONE_RHS(peel1) \
252
+ MICRO_MMA_LOAD_ONE_RHS(peel2) \
253
+ } \
254
+ MICRO_MMA_UNROLL(funcl2) \
255
+ MICRO_MMA_WORK(funcw2, type, peel1) \
256
+ MICRO_MMA_WORK(funcw2, type, peel2) \
257
+ } else { \
258
+ EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
259
+ MICRO_MMA_TYPE_PEEL(funcw1, funcl1, type, peel1) \
260
+ }
261
+
262
+ #define MICRO_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type) \
263
+ type rhsV0[8], rhsV1[(accItr > 1) ? 8 : 1], rhsV2[(accItr > 2) ? 8 : 1], rhsV3[(accItr > 2) ? 8 : 1]; \
264
+ __vector_pair prhsV0, prhsV2, prhsV4, prhsV6; \
265
+ MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 0, 1) \
266
+ MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 2, 3) \
267
+ MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 4, 5) \
268
+ MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 6, 7)
269
+ #endif
270
+
271
+ #define MICRO_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \
272
+ type rhsV0[1], rhsV1[1], rhsV2[1], rhsV3[1]; \
273
+ MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 0)
274
+
275
+ #define MICRO_MMA_UPDATE_RHS1(size, right) rhs_ptr##right += (accRows * size);
276
+
277
+ #define MICRO_MMA_UPDATE_RHS(size) MICRO_MMA_UNROLL_ITER(MICRO_MMA_UPDATE_RHS1, size)
278
+
279
+ #define MICRO_MMA_UNROLL_TYPE(MICRO_MMA_TYPE, size) \
280
+ MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, RhsPacket) \
281
+ MICRO_MMA_UPDATE_RHS(size)
282
+
283
+ #ifndef VECTOR_PAIR_LOADS_LHS
284
+ #define MICRO_MMA_ONE_PEEL MICRO_MMA_UNROLL_TYPE(MICRO_MMA_UNROLL_TYPE_PEEL, PEEL_MMA)
285
+ #else
286
+ #define MICRO_MMA_UNROLL_TYPE2(MICRO_MMA_TYPE, size) \
287
+ MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, MICRO_MMA_WORK_TWO, MICRO_MMA_LOAD_TWO, RhsPacket) \
288
+ MICRO_MMA_UPDATE_RHS(size)
289
+
290
+ #define MICRO_MMA_ONE_PEEL MICRO_MMA_UNROLL_TYPE2(MICRO_MMA_UNROLL_TYPE_PEEL2, PEEL_MMA)
291
+ #endif
292
+
293
+ #define MICRO_MMA_ONE MICRO_MMA_UNROLL_TYPE(MICRO_MMA_UNROLL_TYPE_ONE, 1)
294
+
295
+ #define MICRO_MMA_DST_PTR_ONE(iter) \
296
+ if (unroll_factor * accItr > iter) { \
297
+ bsetzeroMMA(&accZero##iter); \
298
+ } else { \
299
+ EIGEN_UNUSED_VARIABLE(accZero##iter); \
300
+ }
301
+
302
+ #define MICRO_MMA_DST_PTR MICRO_MMA_UNROLL(MICRO_MMA_DST_PTR_ONE)
303
+
304
+ #define MICRO_MMA_SRC_PTR MICRO_MMA_UNROLL(MICRO_SRC_PTR_ONE)
305
+
306
+ #define MICRO_MMA_PREFETCH MICRO_MMA_UNROLL(MICRO_PREFETCH_ONE)
307
+
308
+ #define MICRO_MMA_STORE_ONE(iter, left, right) \
309
+ if (unroll_factor > left) { \
310
+ storeAccumulator<DataMapper, Packet, MICRO_NORMAL_PARTIAL(left)>(row + left * accCols, res##right, pAlpha, \
311
+ accCols2, &accZero##iter); \
312
+ }
313
+
314
+ #define MICRO_MMA_ITER_UNROLL(func) \
315
+ if (accItr == 1) { \
316
+ func(0, 0, 0) func(1, 1, 0) func(2, 2, 0) func(3, 3, 0) func(4, 4, 0) func(5, 5, 0) func(6, 6, 0) func(7, 7, 0) \
317
+ } else if (accItr == 2) { \
318
+ func(0, 0, 0) func(1, 0, 1) func(2, 1, 0) func(3, 1, 1) func(4, 2, 0) func(5, 2, 1) func(6, 3, 0) func(7, 3, 1) \
319
+ } else { \
320
+ func(0, 0, 0) func(1, 0, 1) func(2, 0, 2) func(3, 0, 3) func(4, 1, 0) func(5, 1, 1) func(6, 1, 2) func(7, 1, 3) \
321
+ }
322
+
323
+ #define MICRO_MMA_STORE MICRO_MMA_ITER_UNROLL(MICRO_MMA_STORE_ONE)
324
+
325
+ #define MICRO_MMA_EXTRA_ROWS(right) \
326
+ gemm_extra_row<Scalar, Packet, DataMapper, accRows, accCols>( \
327
+ res3##right, blockA, rhs_base + right * accRows * strideB, depth, strideA, offsetA, strideB, row, rows, \
328
+ remaining_rows, pAlpha, pMask);
329
+
330
+ #define MICRO_MMA_EXTRA_ROWS1(val, right) MICRO_MMA_EXTRA_ROWS(right);
331
+
332
+ template <int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper,
333
+ const Index accRows, const Index accCols, bool full, const Index accItr>
334
+ EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(const DataMapper& res0, const DataMapper& res1,
335
+ const DataMapper& res2, const DataMapper& res3,
336
+ const Scalar* lhs_base, const Scalar* rhs_base, Index depth,
337
+ Index strideA, Index strideB, Index offsetA, Index& row,
338
+ const Packet& pAlpha, Index accCols2) {
339
+ const Scalar *rhs_ptr0 = rhs_base, *rhs_ptr1 = NULL, *rhs_ptr2 = NULL, *rhs_ptr3 = NULL;
340
+ const Scalar *lhs_ptr0 = NULL, *lhs_ptr1 = NULL, *lhs_ptr2 = NULL, *lhs_ptr3 = NULL, *lhs_ptr4 = NULL,
341
+ *lhs_ptr5 = NULL, *lhs_ptr6 = NULL, *lhs_ptr7 = NULL;
342
+ __vector_quad accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
343
+
344
+ if (accItr > 1) {
345
+ rhs_ptr1 = rhs_base + (accRows * strideB);
346
+ } else {
347
+ EIGEN_UNUSED_VARIABLE(strideB);
348
+ EIGEN_UNUSED_VARIABLE(rhs_ptr1);
349
+ EIGEN_UNUSED_VARIABLE(res1);
350
+ }
351
+ if (accItr > 2) {
352
+ rhs_ptr2 = rhs_base + (2 * accRows * strideB);
353
+ rhs_ptr3 = rhs_base + (3 * accRows * strideB);
354
+ } else {
355
+ EIGEN_UNUSED_VARIABLE(rhs_ptr2);
356
+ EIGEN_UNUSED_VARIABLE(rhs_ptr3);
357
+ EIGEN_UNUSED_VARIABLE(res2);
358
+ EIGEN_UNUSED_VARIABLE(res3);
359
+ }
360
+
361
+ MICRO_MMA_SRC_PTR
362
+ MICRO_MMA_DST_PTR
363
+
364
+ Index k = 0, depth2 = depth - PEEL_MMA;
365
+ for (; k <= depth2; k += PEEL_MMA) {
366
+ EIGEN_POWER_PREFETCH(rhs_ptr);
367
+ MICRO_MMA_PREFETCH
368
+ MICRO_MMA_ONE_PEEL
369
+ }
370
+ for (; k < depth; k++) {
371
+ MICRO_MMA_ONE
372
+ }
373
+ MICRO_MMA_STORE
374
+
375
+ MICRO_UPDATE
376
+ }
377
+
378
+ #define MICRO_MMA_UNROLL_ITER2(N, M) \
379
+ gemm_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, RhsPacket, DataMapper, accRows, accCols, !M, accItr>( \
380
+ res30, res31, res32, res33, lhs_base, rhs_base, depth, strideA, strideB, offsetA, row, pAlpha, \
381
+ M ? remaining_rows : accCols); \
382
+ if (M) return;
383
+
384
+ #define MICRO_MMA_ROWS(n) \
385
+ while (row + n * accCols <= rows) { \
386
+ MICRO_MMA_UNROLL_ITER2(n, 0); \
387
+ }
388
+
389
+ template <typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows,
390
+ const Index accCols, const Index accItr>
391
+ EIGEN_ALWAYS_INLINE void gemmMMA_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index depth,
392
+ Index strideA, Index offsetA, Index strideB, Index offsetB, Index col, Index rows,
393
+ Index remaining_rows, const Packet& pAlpha, const Packet& pMask) {
394
+ const DataMapper res30 = res.getSubMapper(0, col);
395
+ const DataMapper res31 = (accItr > 1) ? res30.getSubMapper(0, accRows * 1) : res30;
396
+ const DataMapper res32 = (accItr > 2) ? res30.getSubMapper(0, accRows * 2) : res30;
397
+ const DataMapper res33 = (accItr > 2) ? res30.getSubMapper(0, accRows * 3) : res30;
398
+
399
+ const Scalar* rhs_base = blockB + col * strideB + accRows * offsetB;
400
+ const Scalar* lhs_base = blockA + accCols * offsetA;
401
+ Index row = 0;
402
+
403
+ #define MAX_MMA_UNROLL 7
404
+
405
+ #if MAX_MMA_UNROLL < 2
406
+ if (1) {
407
+ #elif MAX_MMA_UNROLL < 4
408
+ if (accItr <= 2) {
409
+ #else
410
+ if (accItr == 1) {
411
+ #endif
412
+ MICRO_MMA_ROWS(MAX_MMA_UNROLL);
413
+ } else if (accItr == 2) {
414
+ MICRO_MMA_ROWS(4);
415
+ } else {
416
+ MICRO_MMA_ROWS(2);
417
+ }
418
+ switch ((rows - row) / accCols) {
419
+ #if MAX_MMA_UNROLL > 7
420
+ case 7:
421
+ if (accItr == 1) {
422
+ MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 7)
423
+ }
424
+ break;
425
+ #endif
426
+ #if MAX_MMA_UNROLL > 6
427
+ case 6:
428
+ if (accItr == 1) {
429
+ MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 6)
430
+ }
431
+ break;
432
+ #endif
433
+ #if MAX_MMA_UNROLL > 5
434
+ case 5:
435
+ if (accItr == 1) {
436
+ MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 5)
437
+ }
438
+ break;
439
+ #endif
440
+ #if MAX_MMA_UNROLL > 4
441
+ case 4:
442
+ if (accItr == 1) {
443
+ MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 4)
444
+ }
445
+ break;
446
+ #endif
447
+ #if MAX_MMA_UNROLL > 3
448
+ case 3:
449
+ if (accItr <= 2) {
450
+ MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 3)
451
+ }
452
+ break;
453
+ #endif
454
+ #if MAX_MMA_UNROLL > 2
455
+ case 2:
456
+ if (accItr <= 2) {
457
+ MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 2)
458
+ }
459
+ break;
460
+ #endif
461
+ #if MAX_MMA_UNROLL > 1
462
+ case 1:
463
+ MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 1)
464
+ break;
465
+ #endif
466
+ default:
467
+ break;
468
+ }
469
+ #undef MAX_MMA_UNROLL
470
+
471
+ if (remaining_rows > 0) {
472
+ MICRO_MMA_UNROLL_ITER(MICRO_MMA_EXTRA_ROWS1, 0)
473
+ }
474
+ }
475
+
476
+ #define MICRO_MMA_COLS(n) \
477
+ for (; col + n * accRows <= cols; col += n * accRows) { \
478
+ gemmMMA_cols<Scalar, Packet, RhsPacket2, DataMapper, accRows, accCols, n>( \
479
+ res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask); \
480
+ }
481
+
482
+ template <typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows,
483
+ const Index accCols>
484
+ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols,
485
+ Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
486
+ const Index remaining_rows = rows % accCols;
487
+
488
+ if (strideA == -1) strideA = depth;
489
+ if (strideB == -1) strideB = depth;
490
+
491
+ const Packet pAlpha = pset1<Packet>(alpha);
492
+ const Packet pMask = bmask<Packet>(remaining_rows);
493
+
494
+ typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
495
+
496
+ Index col = 0;
497
+ #ifdef GEMM_MULTIPLE_COLS
498
+ MICRO_MMA_COLS(4);
499
+ MICRO_MMA_COLS(2);
500
+ #endif
501
+ MICRO_MMA_COLS(1);
502
+
503
+ if (col != cols) {
504
+ gemm_extra_cols<Scalar, Packet, DataMapper, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB,
505
+ col, rows, cols, remaining_rows, pAlpha, pMask);
506
+ }
507
+ }
508
+
509
+ #define advanceRows ((LhsIsReal) ? 1 : 2)
510
+ #define advanceCols ((RhsIsReal) ? 1 : 2)
511
+
512
+ // PEEL_COMPLEX_MMA loop factor.
513
+ #ifdef GEMM_MULTIPLE_COLS
514
+ #define PEEL_COMPLEX_MMA 4
515
+ #else
516
+ #define PEEL_COMPLEX_MMA 3
517
+ #endif
518
+
519
+ #define MICRO_COMPLEX_MMA_UNROLL(func) func(0) func(1) func(2) func(3)
520
+
521
+ #define MICRO_COMPLEX_MMA_WORK(func, type, peel) \
522
+ if (accItr == 1) { \
523
+ func(0, type, peel, 0, 0) func(1, type, peel, 1, 0) func(2, type, peel, 2, 0) func(3, type, peel, 3, 0) \
524
+ } else if (accItr == 2) { \
525
+ func(0, type, peel, 0, 0) func(1, type, peel, 0, 1) func(2, type, peel, 1, 0) func(3, type, peel, 1, 1) \
526
+ } else { \
527
+ func(0, type, peel, 0, 0) func(1, type, peel, 0, 1) func(2, type, peel, 0, 2) func(3, type, peel, 0, 3) \
528
+ }
529
+
530
+ #define MICRO_COMPLEX_MMA_WORK_ONE(iter, type, peel, left, right) \
531
+ if (unroll_factor > left) { \
532
+ pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>( \
533
+ &accReal##iter, &accImag##iter, lhsV##left, lhsVi##left, rhsV##right[peel], rhsVi##right[peel]); \
534
+ }
535
+
536
+ #ifdef VECTOR_PAIR_LOADS_LHS
537
+ #define MICRO_COMPLEX_MMA_WORK_TWO(iter, type, peel, left, right) \
538
+ if (unroll_factor > left) { \
539
+ pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>( \
540
+ &accReal##iter, &accImag##iter, lhsV2##left.packet[peel & 1], lhsVi2##left.packet[peel & 1], \
541
+ rhsV##right[peel], rhsVi##right[peel]); \
542
+ }
543
+
544
+ #define MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, left) \
545
+ if (!LhsIsReal && (unroll_factor > left)) { \
546
+ if (MICRO_NORMAL(left)) { \
547
+ ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr_real##left + imag_delta), plhsVi##left); \
548
+ __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&lhsVi2##left.packet), &plhsVi##left); \
549
+ } else { \
550
+ lhsVi2##left.packet[0] = ploadLhs<Packet>(lhs_ptr_real##left + imag_delta2); \
551
+ lhsVi2##left.packet[1] = ploadLhs<Packet>(lhs_ptr_real##left + imag_delta2 + accCols2); \
552
+ EIGEN_UNUSED_VARIABLE(plhsVi##left); \
553
+ } \
554
+ } else { \
555
+ EIGEN_UNUSED_VARIABLE(lhsVi2##left); \
556
+ EIGEN_UNUSED_VARIABLE(plhsVi##left); \
557
+ } \
558
+ MICRO_MMA_LOAD1_TWO(lhs_ptr_real, left)
559
+
560
+ #define MICRO_COMPLEX_MMA_LOAD_TWO(left) MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, left)
561
+ #endif
562
+
563
+ #define MICRO_COMPLEX_MMA_LOAD_RHS1(peel, right) \
564
+ ploadRhsMMA(rhs_ptr_real##right + (accRows * peel), rhsV##right[peel]); \
565
+ if (!RhsIsReal) { \
566
+ ploadRhsMMA(rhs_ptr_imag##right + (accRows * peel), rhsVi##right[peel]); \
567
+ }
568
+
569
+ #define MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel) MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_LOAD_RHS1, peel)
570
+
571
+ #define MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, peel) \
572
+ if (PEEL_COMPLEX_MMA > peel) { \
573
+ Packet lhsV0, lhsV1, lhsV2, lhsV3; \
574
+ Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \
575
+ MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel) \
576
+ MICRO_COMPLEX_MMA_UNROLL(funcl) \
577
+ MICRO_COMPLEX_MMA_WORK(funcw, type, peel) \
578
+ }
579
+
580
+ #ifndef VECTOR_PAIR_LOADS_LHS
581
+ #define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type) \
582
+ type rhsV0[4], rhsVi0[4], rhsV1[(accItr > 1) ? 4 : 1], rhsVi1[(accItr > 1) ? 4 : 1], rhsV2[(accItr > 2) ? 4 : 1], \
583
+ rhsVi2[(accItr > 2) ? 4 : 1], rhsV3[(accItr > 2) ? 4 : 1], rhsVi3[(accItr > 2) ? 4 : 1]; \
584
+ MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, 0) \
585
+ MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, 1) \
586
+ MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, 2) MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, 3)
587
+ #else
588
+ #define MICRO_COMPLEX_MMA_LOAD_TWO_RHS(peel1, right) \
589
+ ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr_real##right + (accRows * peel1)), prhsV##peel1); \
590
+ __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsV##right[peel1]), &prhsV##peel1); \
591
+ if (!RhsIsReal) { \
592
+ ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr_imag##right + (accRows * peel1)), prhsVi##peel1); \
593
+ __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsVi##right[peel1]), &prhsVi##peel1); \
594
+ } else { \
595
+ EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
596
+ }
597
+
598
+ #define MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2) \
599
+ if (PEEL_COMPLEX_MMA > peel2) { \
600
+ PacketBlock<Packet, 2> lhsV20, lhsV21, lhsV22, lhsV23; \
601
+ PacketBlock<Packet, 2> lhsVi20, lhsVi21, lhsVi22, lhsVi23; \
602
+ __vector_pair plhsV0, plhsV1, plhsV2, plhsV3; \
603
+ __vector_pair plhsVi0, plhsVi1, plhsVi2, plhsVi3; \
604
+ if (sizeof(type) == 16) { \
605
+ MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_LOAD_TWO_RHS, peel1) \
606
+ } else { \
607
+ EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
608
+ EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
609
+ MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel1); \
610
+ MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel2); \
611
+ } \
612
+ MICRO_COMPLEX_MMA_UNROLL(funcl2) \
613
+ MICRO_COMPLEX_MMA_WORK(funcw2, type, peel1) \
614
+ MICRO_COMPLEX_MMA_WORK(funcw2, type, peel2) \
615
+ } else { \
616
+ EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
617
+ EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
618
+ MICRO_COMPLEX_MMA_TYPE_PEEL(funcw1, funcl1, type, peel1) \
619
+ }
620
+
621
+ #define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type) \
622
+ type rhsV0[4], rhsVi0[4], rhsV1[(accItr > 1) ? 4 : 1], rhsVi1[(accItr > 1) ? 4 : 1], rhsV2[(accItr > 2) ? 4 : 1], \
623
+ rhsVi2[(accItr > 2) ? 4 : 1], rhsV3[(accItr > 2) ? 4 : 1], rhsVi3[(accItr > 2) ? 4 : 1]; \
624
+ __vector_pair prhsV0, prhsV2; \
625
+ __vector_pair prhsVi0, prhsVi2; \
626
+ MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 0, 1) \
627
+ MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 2, 3)
628
+ #endif
629
+
630
+ #define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \
631
+ type rhsV0[1], rhsVi0[1], rhsV1[1], rhsVi1[1], rhsV2[1], rhsVi2[1], rhsV3[1], rhsVi3[1]; \
632
+ MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, 0)
633
+
634
+ #define MICRO_COMPLEX_MMA_UPDATE_RHS1(size, right) \
635
+ rhs_ptr_real##right += (accRows * size); \
636
+ if (!RhsIsReal) rhs_ptr_imag##right += (accRows * size);
637
+
638
+ #define MICRO_COMPLEX_MMA_UPDATE_RHS(size) MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_UPDATE_RHS1, size)
639
+
640
+ #define MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_TYPE, size) \
641
+ MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, RhsPacket) \
642
+ MICRO_COMPLEX_MMA_UPDATE_RHS(size);
643
+
644
+ #ifndef VECTOR_PAIR_LOADS_LHS
645
+ #define MICRO_COMPLEX_MMA_ONE_PEEL MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL, PEEL_COMPLEX_MMA)
646
+ #else
647
+ #define MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_TYPE, size) \
648
+ MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, MICRO_COMPLEX_MMA_WORK_TWO, \
649
+ MICRO_COMPLEX_MMA_LOAD_TWO, RhsPacket) \
650
+ MICRO_COMPLEX_MMA_UPDATE_RHS(size);
651
+
652
+ #define MICRO_COMPLEX_MMA_ONE_PEEL MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2, PEEL_COMPLEX_MMA)
653
+ #endif
654
+
655
+ #define MICRO_COMPLEX_MMA_ONE MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE, 1)
656
+
657
+ #define MICRO_COMPLEX_MMA_DST_PTR_ONE(iter) \
658
+ if (unroll_factor * accItr > iter) { \
659
+ bsetzeroMMA(&accReal##iter); \
660
+ bsetzeroMMA(&accImag##iter); \
661
+ } else { \
662
+ EIGEN_UNUSED_VARIABLE(accReal##iter); \
663
+ EIGEN_UNUSED_VARIABLE(accImag##iter); \
664
+ }
665
+
666
+ #define MICRO_COMPLEX_MMA_DST_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_DST_PTR_ONE)
667
+
668
+ #define MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE)
669
+
670
+ #define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_PREFETCH_ONE)
671
+
672
+ #define MICRO_COMPLEX_MMA_STORE_ONE(iter, left, right) \
673
+ if (unroll_factor > left) { \
674
+ storeComplexAccumulator<DataMapper, Packet, Packetc, accCols, (unroll_factor != (left + 1)) ? accCols : accCols2>( \
675
+ row + left * accCols, res##right, pAlphaReal, pAlphaImag, pMask, &accReal##iter, &accImag##iter); \
676
+ }
677
+
678
+ #define MICRO_COMPLEX_MMA_ITER_UNROLL(func) \
679
+ if (accItr == 1) { \
680
+ func(0, 0, 0) func(1, 1, 0) func(2, 2, 0) func(3, 3, 0) \
681
+ } else if (accItr == 2) { \
682
+ func(0, 0, 0) func(1, 0, 1) func(2, 1, 0) func(3, 1, 1) \
683
+ } else { \
684
+ func(0, 0, 0) func(1, 0, 1) func(2, 0, 2) func(3, 0, 3) \
685
+ }
686
+
687
+ #define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_ITER_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE)
688
+
689
+ #define MICRO_COMPLEX_MMA_EXTRA_ROWS(right) \
690
+ gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, \
691
+ RhsIsReal>(res3##right, blockA, rhs_base + right * accRows * (RhsIsReal ? 1 : 2) * strideB, \
692
+ depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, \
693
+ pAlphaImag, pMask);
694
+
695
+ #define MICRO_COMPLEX_MMA_EXTRA_ROWS1(val, right) MICRO_COMPLEX_MMA_EXTRA_ROWS(right);
696
+
697
+ template <int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename RhsPacket,
698
+ typename DataMapper, const Index accRows, const Index accCols, const Index accCols2, bool ConjugateLhs,
699
+ bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index accItr>
700
+ EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration(const DataMapper& res0, const DataMapper& res1,
701
+ const DataMapper& res2, const DataMapper& res3,
702
+ const Scalar* lhs_base, const Scalar* rhs_base,
703
+ Index depth, Index strideA, Index offsetA, Index strideB,
704
+ Index& row, const Packet& pAlphaReal,
705
+ const Packet& pAlphaImag, const Packet& pMask) {
706
+ const Scalar *rhs_ptr_real0 = rhs_base, *rhs_ptr_real1 = NULL, *rhs_ptr_real2 = NULL, *rhs_ptr_real3 = NULL;
707
+ const Scalar *rhs_ptr_imag0 = NULL, *rhs_ptr_imag1 = NULL, *rhs_ptr_imag2 = NULL, *rhs_ptr_imag3 = NULL;
708
+ const Index imag_delta = accCols * strideA;
709
+ const Index imag_delta2 = accCols2 * strideA;
710
+
711
+ if (!RhsIsReal) {
712
+ rhs_ptr_imag0 = rhs_base + accRows * strideB;
713
+ } else {
714
+ EIGEN_UNUSED_VARIABLE(rhs_ptr_imag0);
715
+ }
716
+ if (accItr > 1) {
717
+ if (!RhsIsReal) {
718
+ rhs_ptr_real1 = rhs_base + (2 * accRows * strideB);
719
+ rhs_ptr_imag1 = rhs_base + (3 * accRows * strideB);
720
+ } else {
721
+ rhs_ptr_real1 = rhs_base + accRows * strideB;
722
+ EIGEN_UNUSED_VARIABLE(rhs_ptr_imag1);
723
+ }
724
+ } else {
725
+ EIGEN_UNUSED_VARIABLE(rhs_ptr_real1);
726
+ EIGEN_UNUSED_VARIABLE(rhs_ptr_imag1);
727
+ EIGEN_UNUSED_VARIABLE(res1);
728
+ }
729
+ if (accItr > 2) {
730
+ if (!RhsIsReal) {
731
+ rhs_ptr_real2 = rhs_base + (4 * accRows * strideB);
732
+ rhs_ptr_imag2 = rhs_base + (5 * accRows * strideB);
733
+ rhs_ptr_real3 = rhs_base + (6 * accRows * strideB);
734
+ rhs_ptr_imag3 = rhs_base + (7 * accRows * strideB);
735
+ } else {
736
+ rhs_ptr_real2 = rhs_base + (2 * accRows * strideB);
737
+ rhs_ptr_real3 = rhs_base + (3 * accRows * strideB);
738
+ EIGEN_UNUSED_VARIABLE(rhs_ptr_imag2);
739
+ EIGEN_UNUSED_VARIABLE(rhs_ptr_imag3);
740
+ }
741
+ } else {
742
+ EIGEN_UNUSED_VARIABLE(rhs_ptr_real2);
743
+ EIGEN_UNUSED_VARIABLE(rhs_ptr_real3);
744
+ EIGEN_UNUSED_VARIABLE(rhs_ptr_imag2);
745
+ EIGEN_UNUSED_VARIABLE(rhs_ptr_imag3);
746
+ EIGEN_UNUSED_VARIABLE(res2);
747
+ EIGEN_UNUSED_VARIABLE(res3);
748
+ }
749
+ const Scalar *lhs_ptr_real0 = NULL, *lhs_ptr_real1 = NULL;
750
+ const Scalar *lhs_ptr_real2 = NULL, *lhs_ptr_real3 = NULL;
751
+ __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3;
752
+
753
+ MICRO_COMPLEX_MMA_SRC_PTR
754
+ MICRO_COMPLEX_MMA_DST_PTR
755
+
756
+ Index k = 0, depth2 = depth - PEEL_COMPLEX_MMA;
757
+ for (; k <= depth2; k += PEEL_COMPLEX_MMA) {
758
+ EIGEN_POWER_PREFETCH(rhs_ptr_real);
759
+ if (!RhsIsReal) {
760
+ EIGEN_POWER_PREFETCH(rhs_ptr_imag);
761
+ }
762
+ MICRO_COMPLEX_MMA_PREFETCH
763
+ MICRO_COMPLEX_MMA_ONE_PEEL
764
+ }
765
+ for (; k < depth; k++) {
766
+ MICRO_COMPLEX_MMA_ONE
767
+ }
768
+ MICRO_COMPLEX_MMA_STORE
769
+
770
+ MICRO_COMPLEX_UPDATE
771
+ }
772
+
773
+ #define MICRO_COMPLEX_MMA_UNROLL_ITER2(N, M) /* Calls gemm_complex_unrolled_MMA_iteration with first template argument N + (M ? 1 : 0) and accCols2 = (M ? M : accCols). */ \
774
+ gemm_complex_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, Packetc, RhsPacket, DataMapper, accRows, \
775
+ accCols, M ? M : accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, \
776
+ accItr>(res30, res31, res32, res33, lhs_base, rhs_base, depth, strideA, offsetA, \
777
+ strideB, row, pAlphaReal, pAlphaImag, pMask); \
778
+ if (M) return;  // a non-zero M makes the expanding function return right after the call; res30..res33, lhs_base, rhs_base, row, etc. are taken from the expanding scope
779
+
780
+ #define MICRO_COMPLEX_MMA_ROWS(n) /* Repeats the n-way unrolled iteration for as long as n * accCols more rows fit at or after `row`. */ \
781
+ while (row + n * accCols <= rows) { \
782
+ MICRO_COMPLEX_MMA_UNROLL_ITER2(n, 0); \
783
+ }  // `row` reaches the iteration helper as Index&; presumably it is advanced there, which is what ends this loop - TODO confirm
784
+
785
+ template <typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper,  // Complex GEMM, MMA path: processes one column panel of `res` that begins at column `col`.
786
+ const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal,
787
+ bool RhsIsReal, const Index accItr>  // accItr: how many accRows-wide column groups are handled together (see res30..res33 below)
788
+ EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
789
+ Index depth, Index strideA, Index offsetA, Index strideB, Index offsetB,
790
+ Index col, Index rows, Index remaining_rows, const Packet& pAlphaReal,
791
+ const Packet& pAlphaImag, const Packet& pMask) {
792
+ const DataMapper res30 = res.getSubMapper(0, col);  // sub-mapper of res at offset (0, col)
793
+ const DataMapper res31 = (accItr > 1) ? res30.getSubMapper(0, accRows * 1) : res30;  // shifted by accRows; aliases res30 when accItr == 1
794
+ const DataMapper res32 = (accItr > 2) ? res30.getSubMapper(0, accRows * 2) : res30;  // shifted by 2 * accRows; aliases res30 when accItr <= 2
795
+ const DataMapper res33 = (accItr > 2) ? res30.getSubMapper(0, accRows * 3) : res30;  // shifted by 3 * accRows; guard is accItr > 2 like res32, so groups 2 and 3 are enabled together
796
+
797
+ const Scalar* rhs_base = blockB + advanceCols * col * strideB + accRows * offsetB;  // advanceCols is a file-level macro (#undef'd near the end of this header)
798
+ const Scalar* lhs_base = blockA + accCols * offsetA;  // LHS start: blockA advanced by accCols * offsetA scalars
799
+ Index row = 0;  // handed by reference to the iteration helper through the ROWS / UNROLL_ITER2 macros
800
+
801
+ #define MAX_COMPLEX_MMA_UNROLL 4  // function-local constant; #undef'd again before the remaining-rows step
802
+
803
+ #if MAX_COMPLEX_MMA_UNROLL < 2
804
+ if (1) {
805
+ #elif MAX_COMPLEX_MMA_UNROLL < 4
806
+ if (accItr <= 2) {
807
+ #else
808
+ if (accItr == 1) {  // this is the condition the preprocessor keeps while MAX_COMPLEX_MMA_UNROLL is 4
809
+ #endif
810
+ MICRO_COMPLEX_MMA_ROWS(MAX_COMPLEX_MMA_UNROLL);  // loop while MAX_COMPLEX_MMA_UNROLL * accCols more rows fit
811
+ } else if (accItr == 2) {
812
+ MICRO_COMPLEX_MMA_ROWS(2);  // loop while 2 * accCols more rows fit
813
+ } else {
814
+ MICRO_COMPLEX_MMA_ROWS(1);  // loop while accCols more rows fit
815
+ }
816
+ switch ((rows - row) / accCols) {  // number of full accCols-row groups still left after the loop above
817
+ #if MAX_COMPLEX_MMA_UNROLL > 3
818
+ case 3:
819
+ if (accItr == 1) {  // for accItr >= 2 the loops above exit with at most one full group left, so only accItr == 1 can need this
820
+ MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 3)  // MICRO_COMPLEX_UNROLL_ITER is defined elsewhere; presumably it supplies UNROLL_ITER2's M argument - TODO confirm
821
+ }
822
+ break;
823
+ #endif
824
+ #if MAX_COMPLEX_MMA_UNROLL > 2
825
+ case 2:
826
+ if (accItr == 1) {  // same reasoning as case 3
827
+ MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 2)
828
+ }
829
+ break;
830
+ #endif
831
+ #if MAX_COMPLEX_MMA_UNROLL > 1
832
+ case 1:
833
+ if (accItr <= 2) {  // for accItr > 2 the MICRO_COMPLEX_MMA_ROWS(1) loop leaves no full group behind
834
+ MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 1)
835
+ }
836
+ break;
837
+ #endif
838
+ default:
839
+ break;
840
+ }
841
+ #undef MAX_COMPLEX_MMA_UNROLL
842
+
843
+ if (remaining_rows > 0) {  // only reached if no UNROLL_ITER2 expansion above returned early (it returns when its M argument is non-zero)
844
+ MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_EXTRA_ROWS1, 0)  // both macros are defined elsewhere; presumably this handles the final partial group of rows - TODO confirm
845
+ }
846
+ }
847
+
848
+ #define MICRO_COMPLEX_MMA_COLS(n) /* For every full panel of n * accRows columns, runs gemmMMA_complex_cols with accItr = n and advances `col` by n * accRows. */ \
849
+ for (; col + n * accRows <= cols; col += n * accRows) { \
850
+ gemmMMA_complex_cols<Scalar, Packet, Packetc, RhsPacket2, DataMapper, accRows, accCols, ConjugateLhs, \
851
+ ConjugateRhs, LhsIsReal, RhsIsReal, n>(res, blockA, blockB, depth, strideA, offsetA, strideB, \
852
+ offsetB, col, rows, remaining_rows, pAlphaReal, \
853
+ pAlphaImag, pMask); \
854
+ }  // col, cols, res, blockA, blockB, RhsPacket2 and the other arguments are taken from the expanding scope
855
+
856
+ template <typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Packet, typename Packetc,  // Driver for the complex MMA GEMM: walks `res` in column panels and delegates each panel to gemmMMA_complex_cols.
857
+ typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs,
858
+ bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
859
+ void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth,
860
+ Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
861
+ const Index remaining_rows = rows % accCols;  // rows left over after the last full group of accCols rows
862
+
863
+ if (strideA == -1) strideA = depth;  // a stride of -1 means "use depth"
864
+ if (strideB == -1) strideB = depth;
865
+
866
+ const Packet pAlphaReal = pset1<Packet>(alpha.real());  // pset1 is declared elsewhere; presumably fills every lane with the scalar - TODO confirm
867
+ const Packet pAlphaImag = pset1<Packet>(alpha.imag());
868
+ const Packet pMask = bmask<Packet>(remaining_rows);  // bmask is declared elsewhere; presumably a lane mask for the remaining_rows tail - TODO confirm
869
+
870
+ const Scalar* blockA = (Scalar*)blockAc;  // the LhsScalar block is accessed through a Scalar pointer from here on
871
+ const Scalar* blockB = (Scalar*)blockBc;  // likewise for the RhsScalar block
872
+
873
+ typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;  // RhsPacket when Scalar is float-sized, otherwise __vector_pair
874
+
875
+ Index col = 0;  // advanced by the MICRO_COMPLEX_MMA_COLS loops below
876
+ #ifdef GEMM_MULTIPLE_COLS  // optional wider panels are tried first
877
+ MICRO_COMPLEX_MMA_COLS(4);  // panels of 4 * accRows columns
878
+ MICRO_COMPLEX_MMA_COLS(2);  // then panels of 2 * accRows columns
879
+ #endif
880
+ MICRO_COMPLEX_MMA_COLS(1);  // then single accRows-wide panels
881
+
882
+ if (col != cols) {  // fewer than accRows columns remain once the loop above has exited
883
+ gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal,
884
+ RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols,
885
+ remaining_rows, pAlphaReal, pAlphaImag, pMask);
886
+ }
887
+ }
888
+
889
+ #undef accColsC
890
+ #undef advanceRows
891
+ #undef advanceCols
892
+
893
+ } // end namespace internal
894
+
895
+ } // end namespace Eigen
896
+
897
+ #if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
898
+ #pragma GCC pop_options
899
+ #endif
900
+
901
+ #endif // EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H