@smake/eigen 1.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (435)
  1. package/README.md +1 -1
  2. package/eigen/Eigen/AccelerateSupport +52 -0
  3. package/eigen/Eigen/Cholesky +18 -21
  4. package/eigen/Eigen/CholmodSupport +28 -28
  5. package/eigen/Eigen/Core +235 -326
  6. package/eigen/Eigen/Eigenvalues +16 -14
  7. package/eigen/Eigen/Geometry +21 -24
  8. package/eigen/Eigen/Householder +9 -8
  9. package/eigen/Eigen/IterativeLinearSolvers +8 -4
  10. package/eigen/Eigen/Jacobi +14 -14
  11. package/eigen/Eigen/KLUSupport +43 -0
  12. package/eigen/Eigen/LU +16 -20
  13. package/eigen/Eigen/MetisSupport +12 -12
  14. package/eigen/Eigen/OrderingMethods +54 -54
  15. package/eigen/Eigen/PaStiXSupport +23 -20
  16. package/eigen/Eigen/PardisoSupport +17 -14
  17. package/eigen/Eigen/QR +18 -21
  18. package/eigen/Eigen/QtAlignedMalloc +5 -13
  19. package/eigen/Eigen/SPQRSupport +21 -14
  20. package/eigen/Eigen/SVD +23 -18
  21. package/eigen/Eigen/Sparse +1 -4
  22. package/eigen/Eigen/SparseCholesky +18 -23
  23. package/eigen/Eigen/SparseCore +18 -17
  24. package/eigen/Eigen/SparseLU +12 -8
  25. package/eigen/Eigen/SparseQR +16 -14
  26. package/eigen/Eigen/StdDeque +5 -2
  27. package/eigen/Eigen/StdList +5 -2
  28. package/eigen/Eigen/StdVector +5 -2
  29. package/eigen/Eigen/SuperLUSupport +30 -24
  30. package/eigen/Eigen/ThreadPool +80 -0
  31. package/eigen/Eigen/UmfPackSupport +19 -17
  32. package/eigen/Eigen/Version +14 -0
  33. package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
  34. package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
  35. package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
  36. package/eigen/Eigen/src/Cholesky/LDLT.h +377 -401
  37. package/eigen/Eigen/src/Cholesky/LLT.h +332 -360
  38. package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
  39. package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +620 -521
  40. package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
  41. package/eigen/Eigen/src/Core/ArithmeticSequence.h +239 -0
  42. package/eigen/Eigen/src/Core/Array.h +341 -294
  43. package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
  44. package/eigen/Eigen/src/Core/ArrayWrapper.h +127 -171
  45. package/eigen/Eigen/src/Core/Assign.h +30 -40
  46. package/eigen/Eigen/src/Core/AssignEvaluator.h +711 -589
  47. package/eigen/Eigen/src/Core/Assign_MKL.h +130 -125
  48. package/eigen/Eigen/src/Core/BandMatrix.h +268 -283
  49. package/eigen/Eigen/src/Core/Block.h +375 -398
  50. package/eigen/Eigen/src/Core/CommaInitializer.h +86 -97
  51. package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
  52. package/eigen/Eigen/src/Core/CoreEvaluators.h +1356 -1026
  53. package/eigen/Eigen/src/Core/CoreIterators.h +73 -59
  54. package/eigen/Eigen/src/Core/CwiseBinaryOp.h +114 -132
  55. package/eigen/Eigen/src/Core/CwiseNullaryOp.h +726 -617
  56. package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
  57. package/eigen/Eigen/src/Core/CwiseUnaryOp.h +56 -68
  58. package/eigen/Eigen/src/Core/CwiseUnaryView.h +132 -95
  59. package/eigen/Eigen/src/Core/DenseBase.h +632 -571
  60. package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -624
  61. package/eigen/Eigen/src/Core/DenseStorage.h +512 -509
  62. package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
  63. package/eigen/Eigen/src/Core/Diagonal.h +169 -210
  64. package/eigen/Eigen/src/Core/DiagonalMatrix.h +351 -274
  65. package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
  66. package/eigen/Eigen/src/Core/Dot.h +172 -222
  67. package/eigen/Eigen/src/Core/EigenBase.h +75 -85
  68. package/eigen/Eigen/src/Core/Fill.h +138 -0
  69. package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
  70. package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -109
  71. package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
  72. package/eigen/Eigen/src/Core/GeneralProduct.h +327 -263
  73. package/eigen/Eigen/src/Core/GenericPacketMath.h +1472 -360
  74. package/eigen/Eigen/src/Core/GlobalFunctions.h +194 -151
  75. package/eigen/Eigen/src/Core/IO.h +147 -139
  76. package/eigen/Eigen/src/Core/IndexedView.h +321 -0
  77. package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
  78. package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
  79. package/eigen/Eigen/src/Core/Inverse.h +56 -66
  80. package/eigen/Eigen/src/Core/Map.h +124 -142
  81. package/eigen/Eigen/src/Core/MapBase.h +256 -281
  82. package/eigen/Eigen/src/Core/MathFunctions.h +1620 -938
  83. package/eigen/Eigen/src/Core/MathFunctionsImpl.h +233 -71
  84. package/eigen/Eigen/src/Core/Matrix.h +491 -416
  85. package/eigen/Eigen/src/Core/MatrixBase.h +468 -453
  86. package/eigen/Eigen/src/Core/NestByValue.h +66 -85
  87. package/eigen/Eigen/src/Core/NoAlias.h +79 -85
  88. package/eigen/Eigen/src/Core/NumTraits.h +235 -148
  89. package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +253 -0
  90. package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
  91. package/eigen/Eigen/src/Core/PlainObjectBase.h +871 -894
  92. package/eigen/Eigen/src/Core/Product.h +260 -139
  93. package/eigen/Eigen/src/Core/ProductEvaluators.h +863 -714
  94. package/eigen/Eigen/src/Core/Random.h +161 -136
  95. package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
  96. package/eigen/Eigen/src/Core/RealView.h +250 -0
  97. package/eigen/Eigen/src/Core/Redux.h +366 -336
  98. package/eigen/Eigen/src/Core/Ref.h +308 -209
  99. package/eigen/Eigen/src/Core/Replicate.h +94 -106
  100. package/eigen/Eigen/src/Core/Reshaped.h +398 -0
  101. package/eigen/Eigen/src/Core/ReturnByValue.h +49 -55
  102. package/eigen/Eigen/src/Core/Reverse.h +136 -145
  103. package/eigen/Eigen/src/Core/Select.h +70 -140
  104. package/eigen/Eigen/src/Core/SelfAdjointView.h +262 -285
  105. package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
  106. package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
  107. package/eigen/Eigen/src/Core/Solve.h +97 -111
  108. package/eigen/Eigen/src/Core/SolveTriangular.h +131 -129
  109. package/eigen/Eigen/src/Core/SolverBase.h +138 -101
  110. package/eigen/Eigen/src/Core/StableNorm.h +156 -160
  111. package/eigen/Eigen/src/Core/StlIterators.h +619 -0
  112. package/eigen/Eigen/src/Core/Stride.h +91 -88
  113. package/eigen/Eigen/src/Core/Swap.h +70 -38
  114. package/eigen/Eigen/src/Core/Transpose.h +295 -273
  115. package/eigen/Eigen/src/Core/Transpositions.h +272 -317
  116. package/eigen/Eigen/src/Core/TriangularMatrix.h +670 -755
  117. package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
  118. package/eigen/Eigen/src/Core/VectorwiseOp.h +668 -630
  119. package/eigen/Eigen/src/Core/Visitor.h +480 -216
  120. package/eigen/Eigen/src/Core/arch/AVX/Complex.h +407 -293
  121. package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +79 -388
  122. package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2935 -491
  123. package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
  124. package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +279 -22
  125. package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +472 -0
  126. package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
  127. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +85 -333
  128. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
  129. package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +2490 -649
  130. package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
  131. package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
  132. package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
  133. package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
  134. package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +277 -0
  135. package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
  136. package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +521 -298
  137. package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +39 -280
  138. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +3686 -0
  139. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +205 -0
  140. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +901 -0
  141. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
  142. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
  143. package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +3391 -723
  144. package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
  145. package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +866 -0
  146. package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +113 -14
  147. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +2634 -0
  148. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +227 -0
  149. package/eigen/Eigen/src/Core/arch/Default/Half.h +1091 -0
  150. package/eigen/Eigen/src/Core/arch/Default/Settings.h +11 -13
  151. package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
  152. package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +104 -0
  153. package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1712 -0
  154. package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
  155. package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +77 -0
  156. package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  157. package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
  158. package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
  159. package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
  160. package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
  161. package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
  162. package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
  163. package/eigen/Eigen/src/Core/arch/MSA/Complex.h +620 -0
  164. package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +379 -0
  165. package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1237 -0
  166. package/eigen/Eigen/src/Core/arch/NEON/Complex.h +531 -289
  167. package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +243 -0
  168. package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +50 -73
  169. package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +5915 -579
  170. package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1642 -0
  171. package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
  172. package/eigen/Eigen/src/Core/arch/SSE/Complex.h +366 -334
  173. package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +40 -514
  174. package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +2164 -675
  175. package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
  176. package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +188 -35
  177. package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +48 -0
  178. package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +674 -0
  179. package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +52 -0
  180. package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +227 -0
  181. package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +303 -0
  182. package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +576 -0
  183. package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +83 -0
  184. package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +434 -261
  185. package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +160 -53
  186. package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +1073 -605
  187. package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +123 -117
  188. package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +594 -322
  189. package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +204 -118
  190. package/eigen/Eigen/src/Core/functors/StlFunctors.h +110 -97
  191. package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
  192. package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1158 -530
  193. package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2329 -1333
  194. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +328 -364
  195. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +191 -178
  196. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +85 -82
  197. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
  198. package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +396 -542
  199. package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
  200. package/eigen/Eigen/src/Core/products/Parallelizer.h +208 -92
  201. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +331 -375
  202. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
  203. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +139 -146
  204. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
  205. package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
  206. package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -46
  207. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
  208. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
  209. package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
  210. package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
  211. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -275
  212. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
  213. package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +70 -93
  214. package/eigen/Eigen/src/Core/util/Assert.h +158 -0
  215. package/eigen/Eigen/src/Core/util/BlasUtil.h +413 -290
  216. package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +543 -0
  217. package/eigen/Eigen/src/Core/util/Constants.h +314 -263
  218. package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -78
  219. package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
  220. package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +450 -224
  221. package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
  222. package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
  223. package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +487 -0
  224. package/eigen/Eigen/src/Core/util/IntegralConstant.h +279 -0
  225. package/eigen/Eigen/src/Core/util/MKL_support.h +39 -30
  226. package/eigen/Eigen/src/Core/util/Macros.h +939 -646
  227. package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
  228. package/eigen/Eigen/src/Core/util/Memory.h +1042 -650
  229. package/eigen/Eigen/src/Core/util/Meta.h +618 -426
  230. package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
  231. package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
  232. package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  233. package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
  234. package/eigen/Eigen/src/Core/util/StaticAssert.h +51 -164
  235. package/eigen/Eigen/src/Core/util/SymbolicIndex.h +445 -0
  236. package/eigen/Eigen/src/Core/util/XprHelper.h +793 -538
  237. package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
  238. package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
  239. package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
  240. package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
  241. package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
  242. package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
  243. package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
  244. package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
  245. package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +91 -107
  246. package/eigen/Eigen/src/Eigenvalues/RealQZ.h +539 -606
  247. package/eigen/Eigen/src/Eigenvalues/RealSchur.h +348 -382
  248. package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
  249. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +579 -600
  250. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
  251. package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +434 -461
  252. package/eigen/Eigen/src/Geometry/AlignedBox.h +307 -214
  253. package/eigen/Eigen/src/Geometry/AngleAxis.h +135 -137
  254. package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
  255. package/eigen/Eigen/src/Geometry/Homogeneous.h +289 -333
  256. package/eigen/Eigen/src/Geometry/Hyperplane.h +152 -161
  257. package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
  258. package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -145
  259. package/eigen/Eigen/src/Geometry/ParametrizedLine.h +141 -104
  260. package/eigen/Eigen/src/Geometry/Quaternion.h +595 -497
  261. package/eigen/Eigen/src/Geometry/Rotation2D.h +110 -108
  262. package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
  263. package/eigen/Eigen/src/Geometry/Scaling.h +115 -90
  264. package/eigen/Eigen/src/Geometry/Transform.h +896 -953
  265. package/eigen/Eigen/src/Geometry/Translation.h +100 -98
  266. package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
  267. package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +154 -0
  268. package/eigen/Eigen/src/Householder/BlockHouseholder.h +54 -42
  269. package/eigen/Eigen/src/Householder/Householder.h +104 -122
  270. package/eigen/Eigen/src/Householder/HouseholderSequence.h +416 -382
  271. package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
  272. package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +153 -166
  273. package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +127 -138
  274. package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +95 -124
  275. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +269 -267
  276. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +246 -259
  277. package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
  278. package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +218 -217
  279. package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +80 -103
  280. package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +59 -63
  281. package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
  282. package/eigen/Eigen/src/Jacobi/Jacobi.h +256 -291
  283. package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
  284. package/eigen/Eigen/src/KLUSupport/KLUSupport.h +339 -0
  285. package/eigen/Eigen/src/LU/Determinant.h +60 -63
  286. package/eigen/Eigen/src/LU/FullPivLU.h +561 -626
  287. package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
  288. package/eigen/Eigen/src/LU/InverseImpl.h +213 -275
  289. package/eigen/Eigen/src/LU/PartialPivLU.h +407 -435
  290. package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
  291. package/eigen/Eigen/src/LU/arch/InverseSize4.h +353 -0
  292. package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
  293. package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
  294. package/eigen/Eigen/src/OrderingMethods/Amd.h +250 -282
  295. package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +950 -1103
  296. package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
  297. package/eigen/Eigen/src/OrderingMethods/Ordering.h +111 -122
  298. package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
  299. package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
  300. package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
  301. package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -429
  302. package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +494 -473
  303. package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
  304. package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +223 -137
  305. package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +517 -460
  306. package/eigen/Eigen/src/QR/HouseholderQR.h +412 -278
  307. package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
  308. package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
  309. package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
  310. package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +263 -261
  311. package/eigen/Eigen/src/SVD/BDCSVD.h +872 -679
  312. package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
  313. package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
  314. package/eigen/Eigen/src/SVD/JacobiSVD.h +585 -543
  315. package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
  316. package/eigen/Eigen/src/SVD/SVDBase.h +281 -160
  317. package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +202 -237
  318. package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
  319. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +769 -590
  320. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +318 -129
  321. package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
  322. package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -236
  323. package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +140 -184
  324. package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
  325. package/eigen/Eigen/src/SparseCore/SparseAssign.h +174 -111
  326. package/eigen/Eigen/src/SparseCore/SparseBlock.h +408 -477
  327. package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
  328. package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +531 -280
  329. package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +559 -347
  330. package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
  331. package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +185 -191
  332. package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
  333. package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
  334. package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
  335. package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
  336. package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1614 -1142
  337. package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -357
  338. package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
  339. package/eigen/Eigen/src/SparseCore/SparseProduct.h +100 -91
  340. package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
  341. package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
  342. package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +371 -414
  343. package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
  344. package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
  345. package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
  346. package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
  347. package/eigen/Eigen/src/SparseCore/SparseUtil.h +146 -115
  348. package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
  349. package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
  350. package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
  351. package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
  352. package/eigen/Eigen/src/SparseLU/SparseLU.h +814 -618
  353. package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
  354. package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
  355. package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
  356. package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +273 -255
  357. package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
  358. package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
  359. package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +90 -101
  360. package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
  361. package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
  362. package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
  363. package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +125 -133
  364. package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
  365. package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
  366. package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
  367. package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
  368. package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
  369. package/eigen/Eigen/src/SparseQR/SparseQR.h +451 -490
  370. package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -105
  371. package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
  372. package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
  373. package/eigen/Eigen/src/StlSupport/details.h +48 -50
  374. package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
  375. package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -732
  376. package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
  377. package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
  378. package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
  379. package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
  380. package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
  381. package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
  382. package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
  383. package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
  384. package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
  385. package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
  386. package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
  387. package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
  388. package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
  389. package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +480 -380
  390. package/eigen/Eigen/src/misc/Image.h +41 -43
  391. package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
  392. package/eigen/Eigen/src/misc/Kernel.h +39 -41
  393. package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
  394. package/eigen/Eigen/src/misc/blas.h +83 -426
  395. package/eigen/Eigen/src/misc/lapacke.h +9976 -16182
  396. package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
  397. package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
  398. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
  399. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
  400. package/eigen/Eigen/src/plugins/BlockMethods.inc +1370 -0
  401. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
  402. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.inc +167 -0
  403. package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
  404. package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
  405. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
  406. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
  407. package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
  408. package/lib/LibEigen.d.ts +4 -0
  409. package/lib/LibEigen.js +14 -0
  410. package/lib/index.d.ts +1 -1
  411. package/lib/index.js +7 -3
  412. package/package.json +2 -10
  413. package/eigen/Eigen/CMakeLists.txt +0 -19
  414. package/eigen/Eigen/src/Core/BooleanRedux.h +0 -164
  415. package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -103
  416. package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
  417. package/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +0 -91
  418. package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
  419. package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
  420. package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
  421. package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
  422. package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
  423. package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
  424. package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
  425. package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
  426. package/eigen/Eigen/src/misc/lapack.h +0 -152
  427. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -332
  428. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -552
  429. package/eigen/Eigen/src/plugins/BlockMethods.h +0 -1058
  430. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
  431. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -163
  432. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
  433. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -85
  434. package/lib/eigen.d.ts +0 -2
  435. package/lib/eigen.js +0 -15
@@ -10,342 +10,791 @@
10
10
  #ifndef EIGEN_PACKET_MATH_ZVECTOR_H
11
11
  #define EIGEN_PACKET_MATH_ZVECTOR_H
12
12
 
13
- #include <stdint.h>
13
+ // IWYU pragma: private
14
+ #include "../../InternalHeaderCheck.h"
14
15
 
15
16
  namespace Eigen {
16
17
 
17
18
  namespace internal {
18
19
 
19
20
  #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
20
- #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
21
+ #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
21
22
  #endif
22
23
 
23
24
  #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
24
25
  #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
25
26
  #endif
26
27
 
27
- #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
28
- #define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
29
- #endif
30
-
31
28
  #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
32
- #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
29
+ #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
33
30
  #endif
34
31
 
35
- typedef __vector int Packet4i;
36
- typedef __vector unsigned int Packet4ui;
37
- typedef __vector __bool int Packet4bi;
38
- typedef __vector short int Packet8i;
39
- typedef __vector unsigned char Packet16uc;
40
- typedef __vector double Packet2d;
41
- typedef __vector unsigned long long Packet2ul;
42
- typedef __vector long long Packet2l;
43
-
32
+ typedef __vector int Packet4i;
33
+ typedef __vector unsigned int Packet4ui;
34
+ typedef __vector __bool int Packet4bi;
35
+ typedef __vector short int Packet8i;
36
+ typedef __vector unsigned char Packet16uc;
37
+ typedef __vector double Packet2d;
38
+ typedef __vector unsigned long long Packet2ul;
39
+ typedef __vector long long Packet2l;
40
+
41
+ // Z14 has builtin support for float vectors
42
+ #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
43
+ typedef __vector float Packet4f;
44
+ #else
44
45
  typedef struct {
45
- Packet2d v4f[2];
46
+ Packet2d v4f[2];
46
47
  } Packet4f;
48
+ #endif
47
49
 
48
50
  typedef union {
49
- int32_t i[4];
50
- uint32_t ui[4];
51
- int64_t l[2];
52
- uint64_t ul[2];
53
- double d[2];
54
- Packet4i v4i;
51
+ numext::int32_t i[4];
52
+ numext::uint32_t ui[4];
53
+ numext::int64_t l[2];
54
+ numext::uint64_t ul[2];
55
+ double d[2];
56
+ float f[4];
57
+ Packet4i v4i;
55
58
  Packet4ui v4ui;
56
- Packet2l v2l;
59
+ Packet2l v2l;
57
60
  Packet2ul v2ul;
58
- Packet2d v2d;
61
+ Packet2d v2d;
62
+ #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
63
+ Packet4f v4f;
64
+ #endif
59
65
  } Packet;
60
66
 
61
67
  // We don't want to write the same code all the time, but we need to reuse the constants
62
68
  // and it doesn't really work to declare them global, so we define macros instead
63
69
 
64
- #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
65
- Packet4i p4i_##NAME = reinterpret_cast<Packet4i>(vec_splat_s32(X))
70
+ #define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME, X) Packet4i p4i_##NAME = reinterpret_cast<Packet4i>(vec_splat_s32(X))
66
71
 
67
- #define _EIGEN_DECLARE_CONST_FAST_Packet2d(NAME,X) \
68
- Packet2d p2d_##NAME = reinterpret_cast<Packet2d>(vec_splat_s64(X))
72
+ #define EIGEN_DECLARE_CONST_FAST_Packet2d(NAME, X) Packet2d p2d_##NAME = reinterpret_cast<Packet2d>(vec_splat_s64(X))
69
73
 
70
- #define _EIGEN_DECLARE_CONST_FAST_Packet2l(NAME,X) \
71
- Packet2l p2l_##NAME = reinterpret_cast<Packet2l>(vec_splat_s64(X))
74
+ #define EIGEN_DECLARE_CONST_FAST_Packet2l(NAME, X) Packet2l p2l_##NAME = reinterpret_cast<Packet2l>(vec_splat_s64(X))
72
75
 
73
- #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
74
- Packet4i p4i_##NAME = pset1<Packet4i>(X)
76
+ #define EIGEN_DECLARE_CONST_Packet4i(NAME, X) Packet4i p4i_##NAME = pset1<Packet4i>(X)
75
77
 
76
- #define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
77
- Packet2d p2d_##NAME = pset1<Packet2d>(X)
78
+ #define EIGEN_DECLARE_CONST_Packet2d(NAME, X) Packet2d p2d_##NAME = pset1<Packet2d>(X)
78
79
 
79
- #define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
80
- Packet2l p2l_##NAME = pset1<Packet2l>(X)
80
+ #define EIGEN_DECLARE_CONST_Packet2l(NAME, X) Packet2l p2l_##NAME = pset1<Packet2l>(X)
81
81
 
82
82
  // These constants are endian-agnostic
83
- //static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
84
- static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
83
+ static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
84
+ static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
85
85
 
86
- static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0);
87
- static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0);
88
- static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1);
86
+ static EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0);
87
+ static EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0);
88
+ static EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1);
89
89
 
90
- static Packet2d p2d_ONE = { 1.0, 1.0 };
91
- static Packet2d p2d_ZERO_ = { -0.0, -0.0 };
90
+ static Packet2d p2d_ONE = {1.0, 1.0};
91
+ static Packet2d p2d_ZERO_ = {numext::bit_cast<double>(0x8000000000000000ull),
92
+ numext::bit_cast<double>(0x8000000000000000ull)};
92
93
 
93
- static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
94
- static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
95
- static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8));
94
+ #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
95
+ #define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME, X) Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
96
96
 
97
- static Packet16uc p16uc_PSET64_HI = { 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
98
- static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
99
-
100
- // Mask alignment
101
- #define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
97
+ #define EIGEN_DECLARE_CONST_Packet4f(NAME, X) Packet4f p4f_##NAME = pset1<Packet4f>(X)
102
98
 
103
- #define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
99
+ #define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
100
+ const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
104
101
 
105
- // Handle endianness properly while loading constants
106
- // Define global static constants:
102
+ static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
103
+ static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1, -1); //{ -1, -1, -1, -1}
104
+ static Packet4f p4f_MZERO = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
105
+ #endif
107
106
 
108
- static Packet16uc p16uc_FORWARD = { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 };
109
- static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
110
- static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
107
+ static Packet4i p4i_COUNTDOWN = {0, 1, 2, 3};
108
+ static Packet4f p4f_COUNTDOWN = {0.0, 1.0, 2.0, 3.0};
109
+ static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(
110
+ vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8));
111
111
 
112
- static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
113
- static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
114
- /*static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
112
+ static Packet16uc p16uc_PSET64_HI = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
113
+ static Packet16uc p16uc_DUPLICATE32_HI = {0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7};
115
114
 
116
- static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };*/
117
- static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
118
- /*static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16); //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
119
- static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16); //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};*/
120
- static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
121
- static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
115
+ // Mask alignment
116
+ #define EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
122
117
 
123
- //static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
118
+ #define EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & EIGEN_MASK_ALIGNMENT)
124
119
 
125
- //static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
120
+ // Handle endianness properly while loading constants
121
+ // Define global static constants:
126
122
 
123
+ static Packet16uc p16uc_FORWARD = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
124
+ static Packet16uc p16uc_REVERSE32 = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
125
+ static Packet16uc p16uc_REVERSE64 = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
126
+
127
+ static Packet16uc p16uc_PSET32_WODD =
128
+ vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
129
+ 8); //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
130
+ static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
131
+ 8); //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
132
+ /*static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3),
133
+ 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
134
+
135
+ static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD,
136
+ (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };*/
137
+ static Packet16uc p16uc_PSET64_LO = (Packet16uc)vec_mergel(
138
+ (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
139
+ /*static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16); //{ 0,1,2,3, 4,5,6,7,
140
+ 16,17,18,19, 20,21,22,23}; static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16); //{
141
+ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};*/
142
+ static Packet16uc p16uc_TRANSPOSE64_HI = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
143
+ static Packet16uc p16uc_TRANSPOSE64_LO = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
144
+
145
+ static Packet16uc p16uc_COMPLEX32_REV =
146
+ vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
147
+
148
+ static Packet16uc p16uc_COMPLEX32_REV2 =
149
+ vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
127
150
 
128
151
  #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
129
- #define EIGEN_ZVECTOR_PREFETCH(ADDR) __builtin_prefetch(ADDR);
152
+ #define EIGEN_ZVECTOR_PREFETCH(ADDR) __builtin_prefetch(ADDR);
130
153
  #else
131
- #define EIGEN_ZVECTOR_PREFETCH(ADDR) asm( " pfd [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
154
+ #define EIGEN_ZVECTOR_PREFETCH(ADDR) asm(" pfd [%[addr]]\n" ::[addr] "r"(ADDR) : "cc");
132
155
  #endif
133
156
 
134
- template<> struct packet_traits<int> : default_packet_traits
135
- {
157
+ template <>
158
+ struct packet_traits<int> : default_packet_traits {
136
159
  typedef Packet4i type;
137
160
  typedef Packet4i half;
138
161
  enum {
139
162
  Vectorizable = 1,
140
163
  AlignedOnScalar = 1,
141
164
  size = 4,
142
- HasHalfPacket = 0,
143
165
 
144
- HasAdd = 1,
145
- HasSub = 1,
146
- HasMul = 1,
147
- HasDiv = 1,
166
+ HasAdd = 1,
167
+ HasSub = 1,
168
+ HasMul = 1,
169
+ HasDiv = 1,
148
170
  HasBlend = 1
149
171
  };
150
172
  };
151
173
 
152
- template<> struct packet_traits<float> : default_packet_traits
153
- {
174
+ template <>
175
+ struct packet_traits<float> : default_packet_traits {
154
176
  typedef Packet4f type;
155
177
  typedef Packet4f half;
156
178
  enum {
157
179
  Vectorizable = 1,
158
180
  AlignedOnScalar = 1,
159
- size=4,
160
- HasHalfPacket = 0,
161
-
162
- HasAdd = 1,
163
- HasSub = 1,
164
- HasMul = 1,
165
- HasDiv = 1,
166
- HasMin = 1,
167
- HasMax = 1,
168
- HasAbs = 1,
169
- HasSin = 0,
170
- HasCos = 0,
171
- HasLog = 0,
172
- HasExp = 1,
181
+ size = 4,
182
+
183
+ HasCmp = 1,
184
+ HasAdd = 1,
185
+ HasSub = 1,
186
+ HasMul = 1,
187
+ HasDiv = 1,
188
+ HasMin = 1,
189
+ HasMax = 1,
190
+ HasAbs = 1,
191
+ HasSin = 0,
192
+ HasCos = 0,
193
+ HasLog = 0,
194
+ HasExp = 1,
173
195
  HasSqrt = 1,
174
196
  HasRsqrt = 1,
175
- HasRound = 1,
176
- HasFloor = 1,
177
- HasCeil = 1,
197
+ HasTanh = 1,
198
+ HasErf = 1,
178
199
  HasNegate = 1,
179
200
  HasBlend = 1
180
201
  };
181
202
  };
182
203
 
183
- template<> struct packet_traits<double> : default_packet_traits
184
- {
204
+ template <>
205
+ struct packet_traits<double> : default_packet_traits {
185
206
  typedef Packet2d type;
186
207
  typedef Packet2d half;
187
208
  enum {
188
209
  Vectorizable = 1,
189
210
  AlignedOnScalar = 1,
190
- size=2,
191
- HasHalfPacket = 1,
192
-
193
- HasAdd = 1,
194
- HasSub = 1,
195
- HasMul = 1,
196
- HasDiv = 1,
197
- HasMin = 1,
198
- HasMax = 1,
199
- HasAbs = 1,
200
- HasSin = 0,
201
- HasCos = 0,
202
- HasLog = 0,
203
- HasExp = 1,
211
+ size = 2,
212
+
213
+ HasAdd = 1,
214
+ HasSub = 1,
215
+ HasMul = 1,
216
+ HasDiv = 1,
217
+ HasMin = 1,
218
+ HasMax = 1,
219
+ HasAbs = 1,
220
+ HasSin = 0,
221
+ HasCos = 0,
222
+ HasLog = 0,
223
+ HasExp = 1,
204
224
  HasSqrt = 1,
205
225
  HasRsqrt = 1,
206
- HasRound = 1,
207
- HasFloor = 1,
208
- HasCeil = 1,
209
226
  HasNegate = 1,
210
227
  HasBlend = 1
211
228
  };
212
229
  };
213
230
 
214
- template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
215
- template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
216
- template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
231
+ template <>
232
+ struct unpacket_traits<Packet4i> {
233
+ typedef int type;
234
+ enum {
235
+ size = 4,
236
+ alignment = Aligned16,
237
+ vectorizable = true,
238
+ masked_load_available = false,
239
+ masked_store_available = false
240
+ };
241
+ typedef Packet4i half;
242
+ };
243
+ template <>
244
+ struct unpacket_traits<Packet4f> {
245
+ typedef float type;
246
+ enum {
247
+ size = 4,
248
+ alignment = Aligned16,
249
+ vectorizable = true,
250
+ masked_load_available = false,
251
+ masked_store_available = false
252
+ };
253
+ typedef Packet4f half;
254
+ typedef Packet4i integer_packet;
255
+ };
256
+ template <>
257
+ struct unpacket_traits<Packet2d> {
258
+ typedef double type;
259
+ enum {
260
+ size = 2,
261
+ alignment = Aligned16,
262
+ vectorizable = true,
263
+ masked_load_available = false,
264
+ masked_store_available = false
265
+ };
266
+ typedef Packet2d half;
267
+ typedef Packet2l integer_packet;
268
+ };
217
269
 
218
270
  /* Forward declaration */
219
- EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel);
220
-
221
- inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
222
- {
271
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel);
272
+
273
+ inline std::ostream& operator<<(std::ostream& s, const Packet4i& v) {
223
274
  Packet vt;
224
275
  vt.v4i = v;
225
276
  s << vt.i[0] << ", " << vt.i[1] << ", " << vt.i[2] << ", " << vt.i[3];
226
277
  return s;
227
278
  }
228
279
 
229
- inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
230
- {
280
+ inline std::ostream& operator<<(std::ostream& s, const Packet4ui& v) {
231
281
  Packet vt;
232
282
  vt.v4ui = v;
233
283
  s << vt.ui[0] << ", " << vt.ui[1] << ", " << vt.ui[2] << ", " << vt.ui[3];
234
284
  return s;
235
285
  }
236
286
 
237
- inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
238
- {
287
+ inline std::ostream& operator<<(std::ostream& s, const Packet2l& v) {
239
288
  Packet vt;
240
289
  vt.v2l = v;
241
290
  s << vt.l[0] << ", " << vt.l[1];
242
291
  return s;
243
292
  }
244
293
 
245
- inline std::ostream & operator <<(std::ostream & s, const Packet2ul & v)
246
- {
294
+ inline std::ostream& operator<<(std::ostream& s, const Packet2ul& v) {
247
295
  Packet vt;
248
296
  vt.v2ul = v;
249
- s << vt.ul[0] << ", " << vt.ul[1] ;
297
+ s << vt.ul[0] << ", " << vt.ul[1];
250
298
  return s;
251
299
  }
252
300
 
253
- inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
254
- {
301
+ inline std::ostream& operator<<(std::ostream& s, const Packet2d& v) {
255
302
  Packet vt;
256
303
  vt.v2d = v;
257
304
  s << vt.d[0] << ", " << vt.d[1];
258
305
  return s;
259
306
  }
260
307
 
261
- /* Helper function to simulate a vec_splat_packet4f
262
- */
263
- template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from)
264
- {
265
- Packet4f splat;
266
- switch (element) {
267
- case 0:
268
- splat.v4f[0] = vec_splat(from.v4f[0], 0);
269
- splat.v4f[1] = splat.v4f[0];
270
- break;
271
- case 1:
272
- splat.v4f[0] = vec_splat(from.v4f[0], 1);
273
- splat.v4f[1] = splat.v4f[0];
274
- break;
275
- case 2:
276
- splat.v4f[0] = vec_splat(from.v4f[1], 0);
277
- splat.v4f[1] = splat.v4f[0];
278
- break;
279
- case 3:
280
- splat.v4f[0] = vec_splat(from.v4f[1], 1);
281
- splat.v4f[1] = splat.v4f[0];
282
- break;
283
- }
284
- return splat;
308
+ #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
309
+ inline std::ostream& operator<<(std::ostream& s, const Packet4f& v) {
310
+ Packet vt;
311
+ vt.v4f = v;
312
+ s << vt.f[0] << ", " << vt.f[1] << ", " << vt.f[2] << ", " << vt.f[3];
313
+ return s;
285
314
  }
315
+ #endif
286
316
 
287
- template<int Offset>
288
- struct palign_impl<Offset,Packet4i>
289
- {
290
- static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
291
- {
292
- switch (Offset % 4) {
293
- case 1:
294
- first = vec_sld(first, second, 4); break;
295
- case 2:
296
- first = vec_sld(first, second, 8); break;
297
- case 3:
298
- first = vec_sld(first, second, 12); break;
299
- }
300
- }
301
- };
317
+ template <>
318
+ EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
319
+ EIGEN_DEBUG_ALIGNED_LOAD
320
+ return vec_xl(0, from);
321
+ }
322
+
323
+ template <>
324
+ EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
325
+ EIGEN_DEBUG_ALIGNED_LOAD
326
+ return vec_xl(0, from);
327
+ }
328
+
329
+ template <>
330
+ EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
331
+ EIGEN_DEBUG_ALIGNED_STORE
332
+ vec_xst(from, 0, to);
333
+ }
334
+
335
+ template <>
336
+ EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
337
+ EIGEN_DEBUG_ALIGNED_STORE
338
+ vec_xst(from, 0, to);
339
+ }
340
+
341
+ template <>
342
+ EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
343
+ return pfrexp_generic(a, exponent);
344
+ }
345
+
346
+ template <>
347
+ EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
348
+ return pfrexp_generic(a, exponent);
349
+ }
302
350
 
303
- /* This is a tricky one, we have to translate float alignment to vector elements of sizeof double
351
+ template <>
352
+ EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
353
+ return vec_splats(from);
354
+ }
355
+ template <>
356
+ EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
357
+ return vec_splats(from);
358
+ }
359
+
360
+ template <>
361
+ EIGEN_STRONG_INLINE void pbroadcast4<Packet4i>(const int* a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) {
362
+ a3 = pload<Packet4i>(a);
363
+ a0 = vec_splat(a3, 0);
364
+ a1 = vec_splat(a3, 1);
365
+ a2 = vec_splat(a3, 2);
366
+ a3 = vec_splat(a3, 3);
367
+ }
368
+
369
+ template <>
370
+ EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
371
+ Packet2d& a3) {
372
+ a1 = pload<Packet2d>(a);
373
+ a0 = vec_splat(a1, 0);
374
+ a1 = vec_splat(a1, 1);
375
+ a3 = pload<Packet2d>(a + 2);
376
+ a2 = vec_splat(a3, 0);
377
+ a3 = vec_splat(a3, 1);
378
+ }
379
+
380
+ template <>
381
+ EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
382
+ EIGEN_ALIGN16 int ai[4];
383
+ ai[0] = from[0 * stride];
384
+ ai[1] = from[1 * stride];
385
+ ai[2] = from[2 * stride];
386
+ ai[3] = from[3 * stride];
387
+ return pload<Packet4i>(ai);
388
+ }
389
+
390
+ template <>
391
+ EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
392
+ EIGEN_ALIGN16 double af[2];
393
+ af[0] = from[0 * stride];
394
+ af[1] = from[1 * stride];
395
+ return pload<Packet2d>(af);
396
+ }
397
+
398
+ template <>
399
+ EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
400
+ EIGEN_ALIGN16 int ai[4];
401
+ pstore<int>((int*)ai, from);
402
+ to[0 * stride] = ai[0];
403
+ to[1 * stride] = ai[1];
404
+ to[2 * stride] = ai[2];
405
+ to[3 * stride] = ai[3];
406
+ }
407
+
408
+ template <>
409
+ EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
410
+ EIGEN_ALIGN16 double af[2];
411
+ pstore<double>(af, from);
412
+ to[0 * stride] = af[0];
413
+ to[1 * stride] = af[1];
414
+ }
415
+
416
+ template <>
417
+ EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
418
+ return (a + b);
419
+ }
420
+ template <>
421
+ EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
422
+ return (a + b);
423
+ }
424
+
425
+ template <>
426
+ EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
427
+ return (a - b);
428
+ }
429
+ template <>
430
+ EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
431
+ return (a - b);
432
+ }
433
+
434
+ template <>
435
+ EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
436
+ return (a * b);
437
+ }
438
+ template <>
439
+ EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
440
+ return (a * b);
441
+ }
442
+
443
+ template <>
444
+ EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
445
+ return (a / b);
446
+ }
447
+ template <>
448
+ EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
449
+ return (a / b);
450
+ }
451
+
452
+ template <>
453
+ EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
454
+ return (-a);
455
+ }
456
+ template <>
457
+ EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
458
+ return (-a);
459
+ }
460
+
461
+ template <>
462
+ EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
463
+ return a;
464
+ }
465
+ template <>
466
+ EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
467
+ return a;
468
+ }
469
+
470
+ template <>
471
+ EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
472
+ return padd<Packet4i>(pmul<Packet4i>(a, b), c);
473
+ }
474
+ template <>
475
+ EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
476
+ return vec_madd(a, b, c);
477
+ }
478
+
479
+ template <>
480
+ EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
481
+ return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN);
482
+ }
483
+ template <>
484
+ EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
485
+ return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN);
486
+ }
487
+
488
+ template <>
489
+ EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
490
+ return vec_min(a, b);
491
+ }
492
+ template <>
493
+ EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
494
+ return vec_min(a, b);
495
+ }
496
+
497
+ template <>
498
+ EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
499
+ return vec_max(a, b);
500
+ }
501
+ template <>
502
+ EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
503
+ return vec_max(a, b);
504
+ }
505
+
506
+ template <>
507
+ EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
508
+ return vec_and(a, b);
509
+ }
510
+ template <>
511
+ EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
512
+ return vec_and(a, b);
513
+ }
514
+
515
+ template <>
516
+ EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
517
+ return vec_or(a, b);
518
+ }
519
+ template <>
520
+ EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
521
+ return vec_or(a, b);
522
+ }
523
+
524
+ template <>
525
+ EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
526
+ return vec_xor(a, b);
527
+ }
528
+ template <>
529
+ EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
530
+ return vec_xor(a, b);
531
+ }
532
+
533
+ template <>
534
+ EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
535
+ return pand<Packet4i>(a, vec_nor(b, b));
536
+ }
537
+ template <>
538
+ EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
539
+ return vec_and(a, vec_nor(b, b));
540
+ }
541
+
542
+ template <>
543
+ EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
544
+ /* Uses non-default rounding for vec_round */
545
+ return __builtin_s390_vfidb(a, 0, 1);
546
+ }
547
+ template <>
548
+ EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
549
+ return vec_ceil(a);
550
+ }
551
+ template <>
552
+ EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
553
+ return vec_floor(a);
554
+ }
555
+
556
+ template <>
557
+ EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
558
+ return pload<Packet4i>(from);
559
+ }
560
+ template <>
561
+ EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
562
+ return pload<Packet2d>(from);
563
+ }
564
+
565
+ template <>
566
+ EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
567
+ Packet4i p = pload<Packet4i>(from);
568
+ return vec_perm(p, p, p16uc_DUPLICATE32_HI);
569
+ }
570
+
571
+ template <>
572
+ EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
573
+ Packet2d p = pload<Packet2d>(from);
574
+ return vec_perm(p, p, p16uc_PSET64_HI);
575
+ }
576
+
577
+ template <>
578
+ EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
579
+ pstore<int>(to, from);
580
+ }
581
+ template <>
582
+ EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
583
+ pstore<double>(to, from);
584
+ }
585
+
586
+ template <>
587
+ EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
588
+ EIGEN_ZVECTOR_PREFETCH(addr);
589
+ }
590
+ template <>
591
+ EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
592
+ EIGEN_ZVECTOR_PREFETCH(addr);
593
+ }
594
+
595
+ template <int N>
596
+ EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) {
597
+ return Packet2l { parithmetic_shift_right<N>(a[0]), parithmetic_shift_right<N>(a[1]) };
598
+ }
599
+ template <int N>
600
+ EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
601
+ return Packet4i {
602
+ parithmetic_shift_right<N>(a[0]),
603
+ parithmetic_shift_right<N>(a[1]),
604
+ parithmetic_shift_right<N>(a[2]),
605
+ parithmetic_shift_right<N>(a[3]) };
606
+ }
607
+
608
+ template <int N>
609
+ EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
610
+ return Packet2l { plogical_shift_right<N>(a[0]), plogical_shift_right<N>(a[1]) };
611
+ }
612
+ template <int N>
613
+ EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
614
+ return Packet4i {
615
+ plogical_shift_right<N>(a[0]),
616
+ plogical_shift_right<N>(a[1]),
617
+ plogical_shift_right<N>(a[2]),
618
+ plogical_shift_right<N>(a[3]) };
619
+ }
620
+
621
+ template <int N>
622
+ EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
623
+ return Packet2l { plogical_shift_left<N>(a[0]), plogical_shift_left<N>(a[1]) };
624
+ }
625
+ template <int N>
626
+ EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
627
+ return Packet4i {
628
+ plogical_shift_left<N>(a[0]),
629
+ plogical_shift_left<N>(a[1]),
630
+ plogical_shift_left<N>(a[2]),
631
+ plogical_shift_left<N>(a[3]) };
632
+ }
633
+
634
+ template <>
635
+ EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
636
+ EIGEN_ALIGN16 int x[4];
637
+ pstore(x, a);
638
+ return x[0];
639
+ }
640
+ template <>
641
+ EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
642
+ EIGEN_ALIGN16 double x[2];
643
+ pstore(x, a);
644
+ return x[0];
645
+ }
646
+
647
+ template <>
648
+ EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
649
+ return reinterpret_cast<Packet4i>(
650
+ vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
651
+ }
652
+
653
+ template <>
654
+ EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
655
+ return reinterpret_cast<Packet2d>(
656
+ vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
657
+ }
658
+
659
+ template <>
660
+ EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) {
661
+ return vec_abs(a);
662
+ }
663
+ template <>
664
+ EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) {
665
+ return vec_abs(a);
666
+ }
667
+
668
+ template <>
669
+ EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
670
+ Packet4i b, sum;
671
+ b = vec_sld(a, a, 8);
672
+ sum = padd<Packet4i>(a, b);
673
+ b = vec_sld(sum, sum, 4);
674
+ sum = padd<Packet4i>(sum, b);
675
+ return pfirst(sum);
676
+ }
677
+
678
+ template <>
679
+ EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
680
+ Packet2d b, sum;
681
+ b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8));
682
+ sum = padd<Packet2d>(a, b);
683
+ return pfirst(sum);
684
+ }
685
+
686
+ // Other reduction functions:
687
+ // mul
688
+ template <>
689
+ EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
690
+ EIGEN_ALIGN16 int aux[4];
691
+ pstore(aux, a);
692
+ return aux[0] * aux[1] * aux[2] * aux[3];
693
+ }
694
+
695
+ template <>
696
+ EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
697
+ return pfirst(
698
+ pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
699
+ }
700
+
701
+ // min
702
+ template <>
703
+ EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
704
+ Packet4i b, res;
705
+ b = pmin<Packet4i>(a, vec_sld(a, a, 8));
706
+ res = pmin<Packet4i>(b, vec_sld(b, b, 4));
707
+ return pfirst(res);
708
+ }
709
+
710
+ template <>
711
+ EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
712
+ return pfirst(pmin<Packet2d>(
713
+ a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
714
+ }
715
+
716
+ // max
717
+ template <>
718
+ EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
719
+ Packet4i b, res;
720
+ b = pmax<Packet4i>(a, vec_sld(a, a, 8));
721
+ res = pmax<Packet4i>(b, vec_sld(b, b, 4));
722
+ return pfirst(res);
723
+ }
724
+
725
+ // max
726
+ template <>
727
+ EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
728
+ return pfirst(pmax<Packet2d>(
729
+ a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
730
+ }
731
+
732
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
733
+ Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
734
+ Packet4i t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
735
+ Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
736
+ Packet4i t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
737
+ kernel.packet[0] = vec_mergeh(t0, t2);
738
+ kernel.packet[1] = vec_mergel(t0, t2);
739
+ kernel.packet[2] = vec_mergeh(t1, t3);
740
+ kernel.packet[3] = vec_mergel(t1, t3);
741
+ }
742
+
743
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
744
+ Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
745
+ Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
746
+ kernel.packet[0] = t0;
747
+ kernel.packet[1] = t1;
748
+ }
749
+
750
+ template <>
751
+ EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
752
+ const Packet4i& elsePacket) {
753
+ Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
754
+ Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
755
+ return vec_sel(elsePacket, thenPacket, mask);
756
+ }
757
+
758
+ template <>
759
+ EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
760
+ const Packet2d& elsePacket) {
761
+ Packet2ul select = {ifPacket.select[0], ifPacket.select[1]};
762
+ Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
763
+ return vec_sel(elsePacket, thenPacket, mask);
764
+ }
765
+
766
+ /* z13 has no vector float support so we emulate that with double
767
+ z14 has proper vector float support.
768
+ */
769
+ #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
770
+ /* Helper function to simulate a vec_splat_packet4f
304
771
  */
305
- template<int Offset>
306
- struct palign_impl<Offset,Packet4f>
307
- {
308
- static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
309
- {
310
- switch (Offset % 4) {
772
+ template <int element>
773
+ EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) {
774
+ Packet4f splat;
775
+ switch (element) {
776
+ case 0:
777
+ splat.v4f[0] = vec_splat(from.v4f[0], 0);
778
+ splat.v4f[1] = splat.v4f[0];
779
+ break;
311
780
  case 1:
312
- first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8);
313
- first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8);
781
+ splat.v4f[0] = vec_splat(from.v4f[0], 1);
782
+ splat.v4f[1] = splat.v4f[0];
314
783
  break;
315
784
  case 2:
316
- first.v4f[0] = first.v4f[1];
317
- first.v4f[1] = second.v4f[0];
785
+ splat.v4f[0] = vec_splat(from.v4f[1], 0);
786
+ splat.v4f[1] = splat.v4f[0];
318
787
  break;
319
788
  case 3:
320
- first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8);
321
- first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8);
789
+ splat.v4f[0] = vec_splat(from.v4f[1], 1);
790
+ splat.v4f[1] = splat.v4f[0];
322
791
  break;
323
- }
324
- }
325
- };
326
-
327
-
328
- template<int Offset>
329
- struct palign_impl<Offset,Packet2d>
330
- {
331
- static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
332
- {
333
- if (Offset == 1)
334
- first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(first), reinterpret_cast<Packet4i>(second), 8));
335
792
  }
336
- };
337
-
338
- template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
339
- {
340
- // FIXME: No intrinsic yet
341
- EIGEN_DEBUG_ALIGNED_LOAD
342
- Packet *vfrom;
343
- vfrom = (Packet *) from;
344
- return vfrom->v4i;
793
+ return splat;
345
794
  }
346
795
 
347
- template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
348
- {
796
+ template <>
797
+ EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
349
798
  // FIXME: No intrinsic yet
350
799
  EIGEN_DEBUG_ALIGNED_LOAD
351
800
  Packet4f vfrom;
@@ -354,72 +803,24 @@ template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
354
803
  return vfrom;
355
804
  }
356
805
 
357
- template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
358
- {
359
- // FIXME: No intrinsic yet
360
- EIGEN_DEBUG_ALIGNED_LOAD
361
- Packet *vfrom;
362
- vfrom = (Packet *) from;
363
- return vfrom->v2d;
364
- }
365
-
366
- template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from)
367
- {
368
- // FIXME: No intrinsic yet
369
- EIGEN_DEBUG_ALIGNED_STORE
370
- Packet *vto;
371
- vto = (Packet *) to;
372
- vto->v4i = from;
373
- }
374
-
375
- template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
376
- {
806
+ template <>
807
+ EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
377
808
  // FIXME: No intrinsic yet
378
809
  EIGEN_DEBUG_ALIGNED_STORE
379
810
  vec_st2f(from.v4f[0], &to[0]);
380
811
  vec_st2f(from.v4f[1], &to[2]);
381
812
  }
382
813
 
383
-
384
- template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
385
- {
386
- // FIXME: No intrinsic yet
387
- EIGEN_DEBUG_ALIGNED_STORE
388
- Packet *vto;
389
- vto = (Packet *) to;
390
- vto->v2d = from;
391
- }
392
-
393
- template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from)
394
- {
395
- return vec_splats(from);
396
- }
397
- template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
398
- return vec_splats(from);
399
- }
400
- template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
401
- {
814
+ template <>
815
+ EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
402
816
  Packet4f to;
403
817
  to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
404
818
  to.v4f[1] = to.v4f[0];
405
819
  return to;
406
820
  }
407
821
 
408
- template<> EIGEN_STRONG_INLINE void
409
- pbroadcast4<Packet4i>(const int *a,
410
- Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
411
- {
412
- a3 = pload<Packet4i>(a);
413
- a0 = vec_splat(a3, 0);
414
- a1 = vec_splat(a3, 1);
415
- a2 = vec_splat(a3, 2);
416
- a3 = vec_splat(a3, 3);
417
- }
418
-
419
- template<> EIGEN_STRONG_INLINE void
420
- pbroadcast4<Packet4f>(const float *a,
421
- Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
422
- {
822
+ template <>
823
+ EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
423
824
  a3 = pload<Packet4f>(a);
424
825
  a0 = vec_splat_packet4f<0>(a3);
425
826
  a1 = vec_splat_packet4f<1>(a3);
@@ -427,461 +828,213 @@ pbroadcast4<Packet4f>(const float *a,
427
828
  a3 = vec_splat_packet4f<3>(a3);
428
829
  }
429
830
 
430
- template<> EIGEN_STRONG_INLINE void
431
- pbroadcast4<Packet2d>(const double *a,
432
- Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
433
- {
434
- a1 = pload<Packet2d>(a);
435
- a0 = vec_splat(a1, 0);
436
- a1 = vec_splat(a1, 1);
437
- a3 = pload<Packet2d>(a+2);
438
- a2 = vec_splat(a3, 0);
439
- a3 = vec_splat(a3, 1);
831
+ template <>
832
+ EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
833
+ EIGEN_ALIGN16 float ai[4];
834
+ ai[0] = from[0 * stride];
835
+ ai[1] = from[1 * stride];
836
+ ai[2] = from[2 * stride];
837
+ ai[3] = from[3 * stride];
838
+ return pload<Packet4f>(ai);
440
839
  }
441
840
 
442
- template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
443
- {
444
- int EIGEN_ALIGN16 ai[4];
445
- ai[0] = from[0*stride];
446
- ai[1] = from[1*stride];
447
- ai[2] = from[2*stride];
448
- ai[3] = from[3*stride];
449
- return pload<Packet4i>(ai);
450
- }
451
-
452
- template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
453
- {
454
- float EIGEN_ALIGN16 ai[4];
455
- ai[0] = from[0*stride];
456
- ai[1] = from[1*stride];
457
- ai[2] = from[2*stride];
458
- ai[3] = from[3*stride];
459
- return pload<Packet4f>(ai);
460
- }
461
-
462
- template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
463
- {
464
- double EIGEN_ALIGN16 af[2];
465
- af[0] = from[0*stride];
466
- af[1] = from[1*stride];
467
- return pload<Packet2d>(af);
468
- }
469
-
470
- template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
471
- {
472
- int EIGEN_ALIGN16 ai[4];
473
- pstore<int>((int *)ai, from);
474
- to[0*stride] = ai[0];
475
- to[1*stride] = ai[1];
476
- to[2*stride] = ai[2];
477
- to[3*stride] = ai[3];
478
- }
479
-
480
- template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
481
- {
482
- float EIGEN_ALIGN16 ai[4];
483
- pstore<float>((float *)ai, from);
484
- to[0*stride] = ai[0];
485
- to[1*stride] = ai[1];
486
- to[2*stride] = ai[2];
487
- to[3*stride] = ai[3];
488
- }
489
-
490
- template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
491
- {
492
- double EIGEN_ALIGN16 af[2];
493
- pstore<double>(af, from);
494
- to[0*stride] = af[0];
495
- to[1*stride] = af[1];
841
+ template <>
842
+ EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
843
+ EIGEN_ALIGN16 float ai[4];
844
+ pstore<float>((float*)ai, from);
845
+ to[0 * stride] = ai[0];
846
+ to[1 * stride] = ai[1];
847
+ to[2 * stride] = ai[2];
848
+ to[3 * stride] = ai[3];
496
849
  }
497
850
 
498
- template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a + b); }
499
- template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
500
- {
851
+ template <>
852
+ EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
501
853
  Packet4f c;
502
854
  c.v4f[0] = a.v4f[0] + b.v4f[0];
503
855
  c.v4f[1] = a.v4f[1] + b.v4f[1];
504
856
  return c;
505
857
  }
506
- template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a + b); }
507
858
 
508
- template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a - b); }
509
- template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
510
- {
859
+ template <>
860
+ EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
511
861
  Packet4f c;
512
862
  c.v4f[0] = a.v4f[0] - b.v4f[0];
513
863
  c.v4f[1] = a.v4f[1] - b.v4f[1];
514
864
  return c;
515
865
  }
516
- template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a - b); }
517
866
 
518
- template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a * b); }
519
- template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
520
- {
867
+ template <>
868
+ EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
521
869
  Packet4f c;
522
870
  c.v4f[0] = a.v4f[0] * b.v4f[0];
523
871
  c.v4f[1] = a.v4f[1] * b.v4f[1];
524
872
  return c;
525
873
  }
526
- template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a * b); }
527
874
 
528
- template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a / b); }
529
- template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
530
- {
875
+ template <>
876
+ EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
531
877
  Packet4f c;
532
878
  c.v4f[0] = a.v4f[0] / b.v4f[0];
533
879
  c.v4f[1] = a.v4f[1] / b.v4f[1];
534
880
  return c;
535
881
  }
536
- template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a / b); }
537
882
 
538
- template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); }
539
- template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
540
- {
883
+ template <>
884
+ EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
541
885
  Packet4f c;
542
886
  c.v4f[0] = -a.v4f[0];
543
887
  c.v4f[1] = -a.v4f[1];
544
888
  return c;
545
889
  }
546
- template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); }
547
890
 
548
- template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
549
- template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
550
- template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
551
-
552
- template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd<Packet4i>(pmul<Packet4i>(a, b), c); }
553
- template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
554
- {
891
+ template <>
892
+ EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
555
893
  Packet4f res;
556
894
  res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
557
895
  res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
558
896
  return res;
559
897
  }
560
- template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
561
-
562
- template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN); }
563
- template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
564
- template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN); }
565
898
 
566
- template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
567
- template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
568
- template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
569
- {
899
+ template <>
900
+ EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
570
901
  Packet4f res;
571
902
  res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
572
903
  res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
573
904
  return res;
574
905
  }
575
906
 
576
- template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
577
- template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
578
- template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
579
- {
907
+ template <>
908
+ EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
580
909
  Packet4f res;
581
910
  res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
582
911
  res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
583
912
  return res;
584
913
  }
585
914
 
586
- template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
587
- template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
588
- template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
589
- {
915
+ template <>
916
+ EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
590
917
  Packet4f res;
591
918
  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
592
919
  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
593
920
  return res;
594
921
  }
595
922
 
596
- template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
597
- template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
598
- template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
599
- {
923
+ template <>
924
+ EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
600
925
  Packet4f res;
601
- res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
602
- res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
926
+ res.v4f[0] = por(a.v4f[0], b.v4f[0]);
927
+ res.v4f[1] = por(a.v4f[1], b.v4f[1]);
603
928
  return res;
604
929
  }
605
930
 
606
- template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
607
- template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
608
- template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
609
- {
931
+ template <>
932
+ EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
610
933
  Packet4f res;
611
- res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
612
- res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
934
+ res.v4f[0] = pxor(a.v4f[0], b.v4f[0]);
935
+ res.v4f[1] = pxor(a.v4f[1], b.v4f[1]);
613
936
  return res;
614
937
  }
615
938
 
616
- template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return pand<Packet4i>(a, vec_nor(b, b)); }
617
- template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
618
- template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
619
- {
939
+ template <>
940
+ EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
620
941
  Packet4f res;
621
942
  res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
622
943
  res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
623
944
  return res;
624
945
  }
625
946
 
626
- template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
627
- {
947
+ template <>
948
+ EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
628
949
  Packet4f res;
629
- res.v4f[0] = vec_round(a.v4f[0]);
630
- res.v4f[1] = vec_round(a.v4f[1]);
950
+ res.v4f[0] = generic_round(a.v4f[0]);
951
+ res.v4f[1] = generic_round(a.v4f[1]);
631
952
  return res;
632
953
  }
633
- template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
634
- template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
635
- {
954
+
955
+ template <>
956
+ EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
636
957
  Packet4f res;
637
958
  res.v4f[0] = vec_ceil(a.v4f[0]);
638
959
  res.v4f[1] = vec_ceil(a.v4f[1]);
639
960
  return res;
640
961
  }
641
- template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); }
642
- template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
643
- {
962
+
963
+ template <>
964
+ EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
644
965
  Packet4f res;
645
966
  res.v4f[0] = vec_floor(a.v4f[0]);
646
967
  res.v4f[1] = vec_floor(a.v4f[1]);
647
968
  return res;
648
969
  }
649
- template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
650
-
651
- template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { return pload<Packet4i>(from); }
652
- template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { return pload<Packet4f>(from); }
653
- template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { return pload<Packet2d>(from); }
654
-
655
-
656
- template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
657
- {
658
- Packet4i p = pload<Packet4i>(from);
659
- return vec_perm(p, p, p16uc_DUPLICATE32_HI);
660
- }
661
970
 
662
- template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
663
- {
971
+ template <>
972
+ EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
664
973
  Packet4f p = pload<Packet4f>(from);
665
974
  p.v4f[1] = vec_splat(p.v4f[0], 1);
666
975
  p.v4f[0] = vec_splat(p.v4f[0], 0);
667
976
  return p;
668
977
  }
669
978
 
670
- template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
671
- {
672
- Packet2d p = pload<Packet2d>(from);
673
- return vec_perm(p, p, p16uc_PSET64_HI);
979
+ template <>
980
+ EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
981
+ EIGEN_ALIGN16 float x[2];
982
+ vec_st2f(a.v4f[0], &x[0]);
983
+ return x[0];
674
984
  }
675
985
 
676
- template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { pstore<int>(to, from); }
677
- template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { pstore<float>(to, from); }
678
- template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { pstore<double>(to, from); }
679
-
680
- template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
681
- template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
682
- template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
683
-
684
- template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
685
- template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
686
- template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
687
-
688
- template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
689
- {
690
- return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
691
- }
692
-
693
- template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
694
- {
695
- return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
696
- }
697
-
698
- template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
699
- {
986
+ template <>
987
+ EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
700
988
  Packet4f rev;
701
989
  rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
702
990
  rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
703
991
  return rev;
704
992
  }
705
993
 
706
- template<> EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) { return vec_abs(a); }
707
- template<> EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) { return vec_abs(a); }
708
- template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
709
- {
994
+ template <>
995
+ EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a) {
710
996
  Packet4f res;
711
997
  res.v4f[0] = pabs(a.v4f[0]);
712
998
  res.v4f[1] = pabs(a.v4f[1]);
713
999
  return res;
714
1000
  }
715
1001
 
716
- template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
717
- {
718
- Packet4i b, sum;
719
- b = vec_sld(a, a, 8);
720
- sum = padd<Packet4i>(a, b);
721
- b = vec_sld(sum, sum, 4);
722
- sum = padd<Packet4i>(sum, b);
723
- return pfirst(sum);
724
- }
725
-
726
- template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
727
- {
728
- Packet2d b, sum;
729
- b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8));
730
- sum = padd<Packet2d>(a, b);
731
- return pfirst(sum);
732
- }
733
- template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
734
- {
1002
+ template <>
1003
+ EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
735
1004
  Packet2d sum;
736
1005
  sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
737
1006
  double first = predux<Packet2d>(sum);
738
1007
  return static_cast<float>(first);
739
1008
  }
740
1009
 
741
- template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
742
- {
743
- Packet4i v[4], sum[4];
744
-
745
- // It's easier and faster to transpose then add as columns
746
- // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
747
- // Do the transpose, first set of moves
748
- v[0] = vec_mergeh(vecs[0], vecs[2]);
749
- v[1] = vec_mergel(vecs[0], vecs[2]);
750
- v[2] = vec_mergeh(vecs[1], vecs[3]);
751
- v[3] = vec_mergel(vecs[1], vecs[3]);
752
- // Get the resulting vectors
753
- sum[0] = vec_mergeh(v[0], v[2]);
754
- sum[1] = vec_mergel(v[0], v[2]);
755
- sum[2] = vec_mergeh(v[1], v[3]);
756
- sum[3] = vec_mergel(v[1], v[3]);
757
-
758
- // Now do the summation:
759
- // Lines 0+1
760
- sum[0] = padd<Packet4i>(sum[0], sum[1]);
761
- // Lines 2+3
762
- sum[1] = padd<Packet4i>(sum[2], sum[3]);
763
- // Add the results
764
- sum[0] = padd<Packet4i>(sum[0], sum[1]);
765
-
766
- return sum[0];
767
- }
768
-
769
- template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
770
- {
771
- Packet2d v[2], sum;
772
- v[0] = padd<Packet2d>(vecs[0], reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(vecs[0]), reinterpret_cast<Packet4ui>(vecs[0]), 8)));
773
- v[1] = padd<Packet2d>(vecs[1], reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(vecs[1]), reinterpret_cast<Packet4ui>(vecs[1]), 8)));
774
-
775
- sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v[0]), reinterpret_cast<Packet4ui>(v[1]), 8));
776
-
777
- return sum;
778
- }
779
-
780
- template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
781
- {
782
- PacketBlock<Packet4f,4> transpose;
783
- transpose.packet[0] = vecs[0];
784
- transpose.packet[1] = vecs[1];
785
- transpose.packet[2] = vecs[2];
786
- transpose.packet[3] = vecs[3];
787
- ptranspose(transpose);
788
-
789
- Packet4f sum = padd(transpose.packet[0], transpose.packet[1]);
790
- sum = padd(sum, transpose.packet[2]);
791
- sum = padd(sum, transpose.packet[3]);
792
- return sum;
793
- }
794
-
795
- // Other reduction functions:
796
- // mul
797
- template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
798
- {
799
- EIGEN_ALIGN16 int aux[4];
800
- pstore(aux, a);
801
- return aux[0] * aux[1] * aux[2] * aux[3];
802
- }
803
-
804
- template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
805
- {
806
- return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
807
- }
808
-
809
- template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
810
- {
1010
+ template <>
1011
+ EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
811
1012
  // Return predux_mul<Packet2d> of the subvectors product
812
1013
  return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
813
1014
  }
814
1015
 
815
- // min
816
- template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
817
- {
818
- Packet4i b, res;
819
- b = pmin<Packet4i>(a, vec_sld(a, a, 8));
820
- res = pmin<Packet4i>(b, vec_sld(b, b, 4));
821
- return pfirst(res);
822
- }
823
-
824
- template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
825
- {
826
- return pfirst(pmin<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
827
- }
828
-
829
- template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
830
- {
1016
+ template <>
1017
+ EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
831
1018
  Packet2d b, res;
832
- b = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
833
- res = pmin<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
1019
+ b = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
1020
+ res = pmin<Packet2d>(
1021
+ b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
834
1022
  return static_cast<float>(pfirst(res));
835
1023
  }
836
1024
 
837
- // max
838
- template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
839
- {
840
- Packet4i b, res;
841
- b = pmax<Packet4i>(a, vec_sld(a, a, 8));
842
- res = pmax<Packet4i>(b, vec_sld(b, b, 4));
843
- return pfirst(res);
844
- }
845
-
846
- // max
847
- template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
848
- {
849
- return pfirst(pmax<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
850
- }
851
-
852
- template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
853
- {
1025
+ template <>
1026
+ EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
854
1027
  Packet2d b, res;
855
- b = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
856
- res = pmax<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
1028
+ b = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
1029
+ res = pmax<Packet2d>(
1030
+ b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
857
1031
  return static_cast<float>(pfirst(res));
858
1032
  }
859
1033
 
860
- EIGEN_DEVICE_FUNC inline void
861
- ptranspose(PacketBlock<Packet4i,4>& kernel) {
862
- Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
863
- Packet4i t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
864
- Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
865
- Packet4i t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
866
- kernel.packet[0] = vec_mergeh(t0, t2);
867
- kernel.packet[1] = vec_mergel(t0, t2);
868
- kernel.packet[2] = vec_mergeh(t1, t3);
869
- kernel.packet[3] = vec_mergel(t1, t3);
870
- }
871
-
872
- EIGEN_DEVICE_FUNC inline void
873
- ptranspose(PacketBlock<Packet2d,2>& kernel) {
874
- Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
875
- Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
876
- kernel.packet[0] = t0;
877
- kernel.packet[1] = t1;
878
- }
879
-
880
1034
  /* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one
881
1035
  */
882
- EIGEN_DEVICE_FUNC inline void
883
- ptranspose(PacketBlock<Packet4f,4>& kernel) {
884
- PacketBlock<Packet2d,2> t0,t1,t2,t3;
1036
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
1037
+ PacketBlock<Packet2d, 2> t0, t1, t2, t3;
885
1038
  // copy top-left 2x2 Packet2d block
886
1039
  t0.packet[0] = kernel.packet[0].v4f[0];
887
1040
  t0.packet[1] = kernel.packet[1].v4f[0];
@@ -915,15 +1068,11 @@ ptranspose(PacketBlock<Packet4f,4>& kernel) {
915
1068
  kernel.packet[3].v4f[1] = t3.packet[1];
916
1069
  }
917
1070
 
918
- template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
919
- Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
920
- Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
921
- return vec_sel(elsePacket, thenPacket, mask);
922
- }
923
-
924
- template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
925
- Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] };
926
- Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] };
1071
+ template <>
1072
+ EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
1073
+ const Packet4f& elsePacket) {
1074
+ Packet2ul select_hi = {ifPacket.select[0], ifPacket.select[1]};
1075
+ Packet2ul select_lo = {ifPacket.select[2], ifPacket.select[3]};
927
1076
  Packet2ul mask_hi = vec_cmpeq(select_hi, reinterpret_cast<Packet2ul>(p2l_ONE));
928
1077
  Packet2ul mask_lo = vec_cmpeq(select_lo, reinterpret_cast<Packet2ul>(p2l_ONE));
929
1078
  Packet4f result;
@@ -932,14 +1081,333 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons
932
1081
  return result;
933
1082
  }
934
1083
 
935
- template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
936
- Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
937
- Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
1084
+ template <>
1085
+ Packet4f EIGEN_STRONG_INLINE pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b) {
1086
+ Packet4f res;
1087
+ res.v4f[0] = pcmp_le(a.v4f[0], b.v4f[0]);
1088
+ res.v4f[1] = pcmp_le(a.v4f[1], b.v4f[1]);
1089
+ return res;
1090
+ }
1091
+
1092
+ template <>
1093
+ Packet4f EIGEN_STRONG_INLINE pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b) {
1094
+ Packet4f res;
1095
+ res.v4f[0] = pcmp_lt(a.v4f[0], b.v4f[0]);
1096
+ res.v4f[1] = pcmp_lt(a.v4f[1], b.v4f[1]);
1097
+ return res;
1098
+ }
1099
+
1100
+ template <>
1101
+ Packet4f EIGEN_STRONG_INLINE pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b) {
1102
+ Packet4f res;
1103
+ res.v4f[0] = pcmp_eq(a.v4f[0], b.v4f[0]);
1104
+ res.v4f[1] = pcmp_eq(a.v4f[1], b.v4f[1]);
1105
+ return res;
1106
+ }
1107
+
1108
+ #else
1109
+ template <>
1110
+ EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
1111
+ EIGEN_DEBUG_ALIGNED_LOAD
1112
+ return vec_xl(0, from);
1113
+ }
1114
+
1115
+ template <>
1116
+ EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
1117
+ EIGEN_DEBUG_ALIGNED_STORE
1118
+ vec_xst(from, 0, to);
1119
+ }
1120
+
1121
+ template <>
1122
+ EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
1123
+ return vec_splats(from);
1124
+ }
1125
+
1126
+ template <>
1127
+ EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
1128
+ a3 = pload<Packet4f>(a);
1129
+ a0 = vec_splat(a3, 0);
1130
+ a1 = vec_splat(a3, 1);
1131
+ a2 = vec_splat(a3, 2);
1132
+ a3 = vec_splat(a3, 3);
1133
+ }
1134
+
1135
+ template <>
1136
+ EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
1137
+ EIGEN_ALIGN16 float af[4];
1138
+ af[0] = from[0 * stride];
1139
+ af[1] = from[1 * stride];
1140
+ af[2] = from[2 * stride];
1141
+ af[3] = from[3 * stride];
1142
+ return pload<Packet4f>(af);
1143
+ }
1144
+
1145
+ template <>
1146
+ EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
1147
+ EIGEN_ALIGN16 float af[4];
1148
+ pstore<float>((float*)af, from);
1149
+ to[0 * stride] = af[0];
1150
+ to[1 * stride] = af[1];
1151
+ to[2 * stride] = af[2];
1152
+ to[3 * stride] = af[3];
1153
+ }
1154
+
1155
+ template <>
1156
+ EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
1157
+ return (a + b);
1158
+ }
1159
+ template <>
1160
+ EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
1161
+ return (a - b);
1162
+ }
1163
+ template <>
1164
+ EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
1165
+ return (a * b);
1166
+ }
1167
+ template <>
1168
+ EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
1169
+ return (a / b);
1170
+ }
1171
+ template <>
1172
+ EIGEN_STRONG_INLINE Packet4f pnegate<Packet4f>(const Packet4f& a) {
1173
+ return (-a);
1174
+ }
1175
+ template <>
1176
+ EIGEN_STRONG_INLINE Packet4f pconj<Packet4f>(const Packet4f& a) {
1177
+ return a;
1178
+ }
1179
+ template <>
1180
+ EIGEN_STRONG_INLINE Packet4f pmadd<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1181
+ return vec_madd(a, b, c);
1182
+ }
1183
+ template <>
1184
+ EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
1185
+ return vec_min(a, b);
1186
+ }
1187
+ template <>
1188
+ EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
1189
+ return vec_max(a, b);
1190
+ }
1191
+ template <>
1192
+ EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
1193
+ return vec_and(a, b);
1194
+ }
1195
+ template <>
1196
+ EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
1197
+ return vec_or(a, b);
1198
+ }
1199
+ template <>
1200
+ EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
1201
+ return vec_xor(a, b);
1202
+ }
1203
+ template <>
1204
+ EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
1205
+ return vec_and(a, vec_nor(b, b));
1206
+ }
1207
+ template <>
1208
+ EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
1209
+ /* Uses non-default rounding for vec_round */
1210
+ return __builtin_s390_vfisb(a, 0, 1);
1211
+ }
1212
+ template <>
1213
+ EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
1214
+ return vec_ceil(a);
1215
+ }
1216
+ template <>
1217
+ EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
1218
+ return vec_floor(a);
1219
+ }
1220
+ template <>
1221
+ EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a) {
1222
+ return vec_abs(a);
1223
+ }
1224
+ template <>
1225
+ EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
1226
+ EIGEN_ALIGN16 float x[4];
1227
+ pstore(x, a);
1228
+ return x[0];
1229
+ }
1230
+
1231
+ template <>
1232
+ EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
1233
+ Packet4f p = pload<Packet4f>(from);
1234
+ return vec_perm(p, p, p16uc_DUPLICATE32_HI);
1235
+ }
1236
+
1237
+ template <>
1238
+ EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
1239
+ return reinterpret_cast<Packet4f>(
1240
+ vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
1241
+ }
1242
+
1243
+ template <>
1244
+ EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
1245
+ Packet4f b, sum;
1246
+ b = vec_sld(a, a, 8);
1247
+ sum = padd<Packet4f>(a, b);
1248
+ b = vec_sld(sum, sum, 4);
1249
+ sum = padd<Packet4f>(sum, b);
1250
+ return pfirst(sum);
1251
+ }
1252
+
1253
+ // Other reduction functions:
1254
+ // mul
1255
+ template <>
1256
+ EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
1257
+ Packet4f prod;
1258
+ prod = pmul(a, vec_sld(a, a, 8));
1259
+ return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
1260
+ }
1261
+
1262
+ // min
1263
+ template <>
1264
+ EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
1265
+ Packet4f b, res;
1266
+ b = pmin<Packet4f>(a, vec_sld(a, a, 8));
1267
+ res = pmin<Packet4f>(b, vec_sld(b, b, 4));
1268
+ return pfirst(res);
1269
+ }
1270
+
1271
+ // max
1272
+ template <>
1273
+ EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
1274
+ Packet4f b, res;
1275
+ b = pmax<Packet4f>(a, vec_sld(a, a, 8));
1276
+ res = pmax<Packet4f>(b, vec_sld(b, b, 4));
1277
+ return pfirst(res);
1278
+ }
1279
+
1280
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
1281
+ Packet4f t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1282
+ Packet4f t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1283
+ Packet4f t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1284
+ Packet4f t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1285
+ kernel.packet[0] = vec_mergeh(t0, t2);
1286
+ kernel.packet[1] = vec_mergel(t0, t2);
1287
+ kernel.packet[2] = vec_mergeh(t1, t3);
1288
+ kernel.packet[3] = vec_mergel(t1, t3);
1289
+ }
1290
+
1291
+ template <>
1292
+ EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
1293
+ const Packet4f& elsePacket) {
1294
+ Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
1295
+ Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
938
1296
  return vec_sel(elsePacket, thenPacket, mask);
939
1297
  }
940
1298
 
941
- } // end namespace internal
1299
+ #endif
1300
+
1301
+ template <>
1302
+ EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
1303
+ return pldexp_generic(a, exponent);
1304
+ }
1305
+
1306
+ template <>
1307
+ EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
1308
+ // Clamp exponent to [-2099, 2099]
1309
+ const Packet2d max_exponent = pset1<Packet2d>(2099.0);
1310
+ const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
1311
+
1312
+ // Split 2^e into four factors and multiply:
1313
+ const Packet2l bias = {1023, 1023};
1314
+ Packet2l b = plogical_shift_right<2>(e); // floor(e/4)
1315
+ Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
1316
+ Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
1317
+ b = psub(psub(psub(e, b), b), b); // e - 3b
1318
+ c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); // 2^(e - 3b)
1319
+ out = pmul(out, c); // a * 2^e
1320
+ return out;
1321
+ }
1322
+
1323
+ template <>
1324
+ EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
1325
+ EIGEN_ZVECTOR_PREFETCH(addr);
1326
+ }
1327
+ template <>
1328
+ EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
1329
+ return pload<Packet4f>(from);
1330
+ }
1331
+ template <>
1332
+ EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
1333
+ pstore<float>(to, from);
1334
+ }
1335
+ template <>
1336
+ EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
1337
+ return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN);
1338
+ }
1339
+
1340
+ #if !defined(vec_float) || !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 13)
1341
+ #pragma GCC warning \
1342
+ "float->int and int->float conversion is simulated. compile for z15 for improved performance"
1343
+ template <>
1344
+ struct cast_impl<Packet4i, Packet4f> {
1345
+ EIGEN_DEVICE_FUNC static inline Packet4f run(const Packet4i& a) {
1346
+ return Packet4f{float(a[0]), float(a[1]), float(a[2]), float(a[3]) };
1347
+ }
1348
+ };
1349
+
1350
+ template <>
1351
+ struct cast_impl<Packet4f, Packet4i> {
1352
+ EIGEN_DEVICE_FUNC static inline Packet4i run(const Packet4f& a) {
1353
+ return Packet4i{int(a[0]), int(a[1]), int(a[2]), int(a[3]) };
1354
+ }
1355
+ };
1356
+
1357
+ template <>
1358
+ struct cast_impl<Packet2l, Packet2d> {
1359
+ EIGEN_DEVICE_FUNC static inline Packet2d run(const Packet2l& a) {
1360
+ return Packet2d{double(a[0]), double(a[1]) };
1361
+ }
1362
+ };
1363
+
1364
+ template <>
1365
+ struct cast_impl<Packet2d, Packet2l> {
1366
+ EIGEN_DEVICE_FUNC static inline Packet2l run(const Packet2d& a) {
1367
+ return Packet2l{(long long)(a[0]), (long long)(a[1]) };
1368
+ }
1369
+ };
1370
+ #else
1371
+ template <>
1372
+ struct cast_impl<Packet4i, Packet4f> {
1373
+ EIGEN_DEVICE_FUNC static inline Packet4f run(const Packet4i& a) {
1374
+ return vec_float(a);
1375
+ }
1376
+ };
1377
+
1378
+ template <>
1379
+ struct cast_impl<Packet4f, Packet4i> {
1380
+ EIGEN_DEVICE_FUNC static inline Packet4i run(const Packet4f& a) {
1381
+ return vec_signed(a);
1382
+ }
1383
+ };
1384
+
1385
+ template <>
1386
+ struct cast_impl<Packet2l, Packet2d> {
1387
+ EIGEN_DEVICE_FUNC static inline Packet2d run(const Packet2l& a) {
1388
+ return vec_double(a);
1389
+ }
1390
+ };
1391
+
1392
+ template <>
1393
+ struct cast_impl<Packet2d, Packet2l> {
1394
+ EIGEN_DEVICE_FUNC static inline Packet2l run(const Packet2d& a) {
1395
+ return vec_signed(a);
1396
+ }
1397
+ };
1398
+ #endif
1399
+
1400
+ template <>
1401
+ EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
1402
+ return pset1<Packet4f>(Eigen::numext::bit_cast<float>(from));
1403
+ }
1404
+ template <>
1405
+ EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
1406
+ return pset1<Packet2d>(Eigen::numext::bit_cast<double>(from));
1407
+ }
1408
+
1409
+ } // end namespace internal
942
1410
 
943
- } // end namespace Eigen
1411
+ } // end namespace Eigen
944
1412
 
945
- #endif // EIGEN_PACKET_MATH_ZVECTOR_H
1413
+ #endif // EIGEN_PACKET_MATH_ZVECTOR_H