@smake/eigen 1.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (435) hide show
  1. package/README.md +1 -1
  2. package/eigen/Eigen/AccelerateSupport +52 -0
  3. package/eigen/Eigen/Cholesky +18 -21
  4. package/eigen/Eigen/CholmodSupport +28 -28
  5. package/eigen/Eigen/Core +235 -326
  6. package/eigen/Eigen/Eigenvalues +16 -14
  7. package/eigen/Eigen/Geometry +21 -24
  8. package/eigen/Eigen/Householder +9 -8
  9. package/eigen/Eigen/IterativeLinearSolvers +8 -4
  10. package/eigen/Eigen/Jacobi +14 -14
  11. package/eigen/Eigen/KLUSupport +43 -0
  12. package/eigen/Eigen/LU +16 -20
  13. package/eigen/Eigen/MetisSupport +12 -12
  14. package/eigen/Eigen/OrderingMethods +54 -54
  15. package/eigen/Eigen/PaStiXSupport +23 -20
  16. package/eigen/Eigen/PardisoSupport +17 -14
  17. package/eigen/Eigen/QR +18 -21
  18. package/eigen/Eigen/QtAlignedMalloc +5 -13
  19. package/eigen/Eigen/SPQRSupport +21 -14
  20. package/eigen/Eigen/SVD +23 -18
  21. package/eigen/Eigen/Sparse +1 -4
  22. package/eigen/Eigen/SparseCholesky +18 -23
  23. package/eigen/Eigen/SparseCore +18 -17
  24. package/eigen/Eigen/SparseLU +12 -8
  25. package/eigen/Eigen/SparseQR +16 -14
  26. package/eigen/Eigen/StdDeque +5 -2
  27. package/eigen/Eigen/StdList +5 -2
  28. package/eigen/Eigen/StdVector +5 -2
  29. package/eigen/Eigen/SuperLUSupport +30 -24
  30. package/eigen/Eigen/ThreadPool +80 -0
  31. package/eigen/Eigen/UmfPackSupport +19 -17
  32. package/eigen/Eigen/Version +14 -0
  33. package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
  34. package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
  35. package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
  36. package/eigen/Eigen/src/Cholesky/LDLT.h +377 -401
  37. package/eigen/Eigen/src/Cholesky/LLT.h +332 -360
  38. package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
  39. package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +620 -521
  40. package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
  41. package/eigen/Eigen/src/Core/ArithmeticSequence.h +239 -0
  42. package/eigen/Eigen/src/Core/Array.h +341 -294
  43. package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
  44. package/eigen/Eigen/src/Core/ArrayWrapper.h +127 -171
  45. package/eigen/Eigen/src/Core/Assign.h +30 -40
  46. package/eigen/Eigen/src/Core/AssignEvaluator.h +711 -589
  47. package/eigen/Eigen/src/Core/Assign_MKL.h +130 -125
  48. package/eigen/Eigen/src/Core/BandMatrix.h +268 -283
  49. package/eigen/Eigen/src/Core/Block.h +375 -398
  50. package/eigen/Eigen/src/Core/CommaInitializer.h +86 -97
  51. package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
  52. package/eigen/Eigen/src/Core/CoreEvaluators.h +1356 -1026
  53. package/eigen/Eigen/src/Core/CoreIterators.h +73 -59
  54. package/eigen/Eigen/src/Core/CwiseBinaryOp.h +114 -132
  55. package/eigen/Eigen/src/Core/CwiseNullaryOp.h +726 -617
  56. package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
  57. package/eigen/Eigen/src/Core/CwiseUnaryOp.h +56 -68
  58. package/eigen/Eigen/src/Core/CwiseUnaryView.h +132 -95
  59. package/eigen/Eigen/src/Core/DenseBase.h +632 -571
  60. package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -624
  61. package/eigen/Eigen/src/Core/DenseStorage.h +512 -509
  62. package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
  63. package/eigen/Eigen/src/Core/Diagonal.h +169 -210
  64. package/eigen/Eigen/src/Core/DiagonalMatrix.h +351 -274
  65. package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
  66. package/eigen/Eigen/src/Core/Dot.h +172 -222
  67. package/eigen/Eigen/src/Core/EigenBase.h +75 -85
  68. package/eigen/Eigen/src/Core/Fill.h +138 -0
  69. package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
  70. package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -109
  71. package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
  72. package/eigen/Eigen/src/Core/GeneralProduct.h +327 -263
  73. package/eigen/Eigen/src/Core/GenericPacketMath.h +1472 -360
  74. package/eigen/Eigen/src/Core/GlobalFunctions.h +194 -151
  75. package/eigen/Eigen/src/Core/IO.h +147 -139
  76. package/eigen/Eigen/src/Core/IndexedView.h +321 -0
  77. package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
  78. package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
  79. package/eigen/Eigen/src/Core/Inverse.h +56 -66
  80. package/eigen/Eigen/src/Core/Map.h +124 -142
  81. package/eigen/Eigen/src/Core/MapBase.h +256 -281
  82. package/eigen/Eigen/src/Core/MathFunctions.h +1620 -938
  83. package/eigen/Eigen/src/Core/MathFunctionsImpl.h +233 -71
  84. package/eigen/Eigen/src/Core/Matrix.h +491 -416
  85. package/eigen/Eigen/src/Core/MatrixBase.h +468 -453
  86. package/eigen/Eigen/src/Core/NestByValue.h +66 -85
  87. package/eigen/Eigen/src/Core/NoAlias.h +79 -85
  88. package/eigen/Eigen/src/Core/NumTraits.h +235 -148
  89. package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +253 -0
  90. package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
  91. package/eigen/Eigen/src/Core/PlainObjectBase.h +871 -894
  92. package/eigen/Eigen/src/Core/Product.h +260 -139
  93. package/eigen/Eigen/src/Core/ProductEvaluators.h +863 -714
  94. package/eigen/Eigen/src/Core/Random.h +161 -136
  95. package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
  96. package/eigen/Eigen/src/Core/RealView.h +250 -0
  97. package/eigen/Eigen/src/Core/Redux.h +366 -336
  98. package/eigen/Eigen/src/Core/Ref.h +308 -209
  99. package/eigen/Eigen/src/Core/Replicate.h +94 -106
  100. package/eigen/Eigen/src/Core/Reshaped.h +398 -0
  101. package/eigen/Eigen/src/Core/ReturnByValue.h +49 -55
  102. package/eigen/Eigen/src/Core/Reverse.h +136 -145
  103. package/eigen/Eigen/src/Core/Select.h +70 -140
  104. package/eigen/Eigen/src/Core/SelfAdjointView.h +262 -285
  105. package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
  106. package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
  107. package/eigen/Eigen/src/Core/Solve.h +97 -111
  108. package/eigen/Eigen/src/Core/SolveTriangular.h +131 -129
  109. package/eigen/Eigen/src/Core/SolverBase.h +138 -101
  110. package/eigen/Eigen/src/Core/StableNorm.h +156 -160
  111. package/eigen/Eigen/src/Core/StlIterators.h +619 -0
  112. package/eigen/Eigen/src/Core/Stride.h +91 -88
  113. package/eigen/Eigen/src/Core/Swap.h +70 -38
  114. package/eigen/Eigen/src/Core/Transpose.h +295 -273
  115. package/eigen/Eigen/src/Core/Transpositions.h +272 -317
  116. package/eigen/Eigen/src/Core/TriangularMatrix.h +670 -755
  117. package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
  118. package/eigen/Eigen/src/Core/VectorwiseOp.h +668 -630
  119. package/eigen/Eigen/src/Core/Visitor.h +480 -216
  120. package/eigen/Eigen/src/Core/arch/AVX/Complex.h +407 -293
  121. package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +79 -388
  122. package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2935 -491
  123. package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
  124. package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +279 -22
  125. package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +472 -0
  126. package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
  127. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +85 -333
  128. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
  129. package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +2490 -649
  130. package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
  131. package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
  132. package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
  133. package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
  134. package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +277 -0
  135. package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
  136. package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +521 -298
  137. package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +39 -280
  138. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +3686 -0
  139. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +205 -0
  140. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +901 -0
  141. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
  142. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
  143. package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +3391 -723
  144. package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
  145. package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +866 -0
  146. package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +113 -14
  147. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +2634 -0
  148. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +227 -0
  149. package/eigen/Eigen/src/Core/arch/Default/Half.h +1091 -0
  150. package/eigen/Eigen/src/Core/arch/Default/Settings.h +11 -13
  151. package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
  152. package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +104 -0
  153. package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1712 -0
  154. package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
  155. package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +77 -0
  156. package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  157. package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
  158. package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
  159. package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
  160. package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
  161. package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
  162. package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
  163. package/eigen/Eigen/src/Core/arch/MSA/Complex.h +620 -0
  164. package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +379 -0
  165. package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1237 -0
  166. package/eigen/Eigen/src/Core/arch/NEON/Complex.h +531 -289
  167. package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +243 -0
  168. package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +50 -73
  169. package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +5915 -579
  170. package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1642 -0
  171. package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
  172. package/eigen/Eigen/src/Core/arch/SSE/Complex.h +366 -334
  173. package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +40 -514
  174. package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +2164 -675
  175. package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
  176. package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +188 -35
  177. package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +48 -0
  178. package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +674 -0
  179. package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +52 -0
  180. package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +227 -0
  181. package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +303 -0
  182. package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +576 -0
  183. package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +83 -0
  184. package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +434 -261
  185. package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +160 -53
  186. package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +1073 -605
  187. package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +123 -117
  188. package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +594 -322
  189. package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +204 -118
  190. package/eigen/Eigen/src/Core/functors/StlFunctors.h +110 -97
  191. package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
  192. package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1158 -530
  193. package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2329 -1333
  194. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +328 -364
  195. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +191 -178
  196. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +85 -82
  197. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
  198. package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +396 -542
  199. package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
  200. package/eigen/Eigen/src/Core/products/Parallelizer.h +208 -92
  201. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +331 -375
  202. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
  203. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +139 -146
  204. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
  205. package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
  206. package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -46
  207. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
  208. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
  209. package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
  210. package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
  211. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -275
  212. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
  213. package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +70 -93
  214. package/eigen/Eigen/src/Core/util/Assert.h +158 -0
  215. package/eigen/Eigen/src/Core/util/BlasUtil.h +413 -290
  216. package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +543 -0
  217. package/eigen/Eigen/src/Core/util/Constants.h +314 -263
  218. package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -78
  219. package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
  220. package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +450 -224
  221. package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
  222. package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
  223. package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +487 -0
  224. package/eigen/Eigen/src/Core/util/IntegralConstant.h +279 -0
  225. package/eigen/Eigen/src/Core/util/MKL_support.h +39 -30
  226. package/eigen/Eigen/src/Core/util/Macros.h +939 -646
  227. package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
  228. package/eigen/Eigen/src/Core/util/Memory.h +1042 -650
  229. package/eigen/Eigen/src/Core/util/Meta.h +618 -426
  230. package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
  231. package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
  232. package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  233. package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
  234. package/eigen/Eigen/src/Core/util/StaticAssert.h +51 -164
  235. package/eigen/Eigen/src/Core/util/SymbolicIndex.h +445 -0
  236. package/eigen/Eigen/src/Core/util/XprHelper.h +793 -538
  237. package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
  238. package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
  239. package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
  240. package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
  241. package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
  242. package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
  243. package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
  244. package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
  245. package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +91 -107
  246. package/eigen/Eigen/src/Eigenvalues/RealQZ.h +539 -606
  247. package/eigen/Eigen/src/Eigenvalues/RealSchur.h +348 -382
  248. package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
  249. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +579 -600
  250. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
  251. package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +434 -461
  252. package/eigen/Eigen/src/Geometry/AlignedBox.h +307 -214
  253. package/eigen/Eigen/src/Geometry/AngleAxis.h +135 -137
  254. package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
  255. package/eigen/Eigen/src/Geometry/Homogeneous.h +289 -333
  256. package/eigen/Eigen/src/Geometry/Hyperplane.h +152 -161
  257. package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
  258. package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -145
  259. package/eigen/Eigen/src/Geometry/ParametrizedLine.h +141 -104
  260. package/eigen/Eigen/src/Geometry/Quaternion.h +595 -497
  261. package/eigen/Eigen/src/Geometry/Rotation2D.h +110 -108
  262. package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
  263. package/eigen/Eigen/src/Geometry/Scaling.h +115 -90
  264. package/eigen/Eigen/src/Geometry/Transform.h +896 -953
  265. package/eigen/Eigen/src/Geometry/Translation.h +100 -98
  266. package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
  267. package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +154 -0
  268. package/eigen/Eigen/src/Householder/BlockHouseholder.h +54 -42
  269. package/eigen/Eigen/src/Householder/Householder.h +104 -122
  270. package/eigen/Eigen/src/Householder/HouseholderSequence.h +416 -382
  271. package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
  272. package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +153 -166
  273. package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +127 -138
  274. package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +95 -124
  275. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +269 -267
  276. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +246 -259
  277. package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
  278. package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +218 -217
  279. package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +80 -103
  280. package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +59 -63
  281. package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
  282. package/eigen/Eigen/src/Jacobi/Jacobi.h +256 -291
  283. package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
  284. package/eigen/Eigen/src/KLUSupport/KLUSupport.h +339 -0
  285. package/eigen/Eigen/src/LU/Determinant.h +60 -63
  286. package/eigen/Eigen/src/LU/FullPivLU.h +561 -626
  287. package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
  288. package/eigen/Eigen/src/LU/InverseImpl.h +213 -275
  289. package/eigen/Eigen/src/LU/PartialPivLU.h +407 -435
  290. package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
  291. package/eigen/Eigen/src/LU/arch/InverseSize4.h +353 -0
  292. package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
  293. package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
  294. package/eigen/Eigen/src/OrderingMethods/Amd.h +250 -282
  295. package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +950 -1103
  296. package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
  297. package/eigen/Eigen/src/OrderingMethods/Ordering.h +111 -122
  298. package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
  299. package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
  300. package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
  301. package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -429
  302. package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +494 -473
  303. package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
  304. package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +223 -137
  305. package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +517 -460
  306. package/eigen/Eigen/src/QR/HouseholderQR.h +412 -278
  307. package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
  308. package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
  309. package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
  310. package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +263 -261
  311. package/eigen/Eigen/src/SVD/BDCSVD.h +872 -679
  312. package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
  313. package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
  314. package/eigen/Eigen/src/SVD/JacobiSVD.h +585 -543
  315. package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
  316. package/eigen/Eigen/src/SVD/SVDBase.h +281 -160
  317. package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +202 -237
  318. package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
  319. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +769 -590
  320. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +318 -129
  321. package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
  322. package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -236
  323. package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +140 -184
  324. package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
  325. package/eigen/Eigen/src/SparseCore/SparseAssign.h +174 -111
  326. package/eigen/Eigen/src/SparseCore/SparseBlock.h +408 -477
  327. package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
  328. package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +531 -280
  329. package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +559 -347
  330. package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
  331. package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +185 -191
  332. package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
  333. package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
  334. package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
  335. package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
  336. package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1614 -1142
  337. package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -357
  338. package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
  339. package/eigen/Eigen/src/SparseCore/SparseProduct.h +100 -91
  340. package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
  341. package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
  342. package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +371 -414
  343. package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
  344. package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
  345. package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
  346. package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
  347. package/eigen/Eigen/src/SparseCore/SparseUtil.h +146 -115
  348. package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
  349. package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
  350. package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
  351. package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
  352. package/eigen/Eigen/src/SparseLU/SparseLU.h +814 -618
  353. package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
  354. package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
  355. package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
  356. package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +273 -255
  357. package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
  358. package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
  359. package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +90 -101
  360. package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
  361. package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
  362. package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
  363. package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +125 -133
  364. package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
  365. package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
  366. package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
  367. package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
  368. package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
  369. package/eigen/Eigen/src/SparseQR/SparseQR.h +451 -490
  370. package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -105
  371. package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
  372. package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
  373. package/eigen/Eigen/src/StlSupport/details.h +48 -50
  374. package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
  375. package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -732
  376. package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
  377. package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
  378. package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
  379. package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
  380. package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
  381. package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
  382. package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
  383. package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
  384. package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
  385. package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
  386. package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
  387. package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
  388. package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
  389. package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +480 -380
  390. package/eigen/Eigen/src/misc/Image.h +41 -43
  391. package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
  392. package/eigen/Eigen/src/misc/Kernel.h +39 -41
  393. package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
  394. package/eigen/Eigen/src/misc/blas.h +83 -426
  395. package/eigen/Eigen/src/misc/lapacke.h +9976 -16182
  396. package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
  397. package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
  398. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
  399. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
  400. package/eigen/Eigen/src/plugins/BlockMethods.inc +1370 -0
  401. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
  402. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.inc +167 -0
  403. package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
  404. package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
  405. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
  406. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
  407. package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
  408. package/lib/LibEigen.d.ts +4 -0
  409. package/lib/LibEigen.js +14 -0
  410. package/lib/index.d.ts +1 -1
  411. package/lib/index.js +7 -3
  412. package/package.json +2 -10
  413. package/eigen/Eigen/CMakeLists.txt +0 -19
  414. package/eigen/Eigen/src/Core/BooleanRedux.h +0 -164
  415. package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -103
  416. package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
  417. package/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +0 -91
  418. package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
  419. package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
  420. package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
  421. package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
  422. package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
  423. package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
  424. package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
  425. package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
  426. package/eigen/Eigen/src/misc/lapack.h +0 -152
  427. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -332
  428. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -552
  429. package/eigen/Eigen/src/plugins/BlockMethods.h +0 -1058
  430. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
  431. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -163
  432. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
  433. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -85
  434. package/lib/eigen.d.ts +0 -2
  435. package/lib/eigen.js +0 -15
@@ -0,0 +1,2634 @@
1
+ // This file is part of Eigen, a lightweight C++ template library
2
+ // for linear algebra.
3
+ //
4
+ // Copyright (C) 2007 Julien Pommier
5
+ // Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
6
+ // Copyright (C) 2009-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
7
+ //
8
+ // This Source Code Form is subject to the terms of the Mozilla
9
+ // Public License v. 2.0. If a copy of the MPL was not distributed
10
+ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
11
+
12
+ /* The exp and log functions of this file initially come from
13
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
14
+ */
15
+
16
+ #ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
17
+ #define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
18
+
19
+ // IWYU pragma: private
20
+ #include "../../InternalHeaderCheck.h"
21
+
22
+ namespace Eigen {
23
+ namespace internal {
24
+
25
+ // Creates a Scalar integer type with same bit-width.
26
+ template <typename T>
27
+ struct make_integer;
28
+ template <>
29
+ struct make_integer<float> {
30
+ typedef numext::int32_t type;
31
+ };
32
+ template <>
33
+ struct make_integer<double> {
34
+ typedef numext::int64_t type;
35
+ };
36
+ template <>
37
+ struct make_integer<half> {
38
+ typedef numext::int16_t type;
39
+ };
40
+ template <>
41
+ struct make_integer<bfloat16> {
42
+ typedef numext::int16_t type;
43
+ };
44
+
45
+ /* polevl (modified for Eigen)
46
+ *
47
+ * Evaluate polynomial
48
+ *
49
+ *
50
+ *
51
+ * SYNOPSIS:
52
+ *
53
+ * int N;
54
+ * Scalar x, y, coef[N+1];
55
+ *
56
+ * y = polevl<decltype(x), N>( x, coef);
57
+ *
58
+ *
59
+ *
60
+ * DESCRIPTION:
61
+ *
62
+ * Evaluates polynomial of degree N:
63
+ *
64
+ * 2 N
65
+ * y = C + C x + C x +...+ C x
66
+ * 0 1 2 N
67
+ *
68
+ * Coefficients are stored in reverse order:
69
+ *
70
+ * coef[0] = C , ..., coef[N] = C .
71
+ * N 0
72
+ *
73
+ * The function p1evl() assumes that coef[N] = 1.0 and is
74
+ * omitted from the array. Its calling arguments are
75
+ * otherwise the same as polevl().
76
+ *
77
+ *
78
+ * The Eigen implementation is templatized. For best speed, store
79
+ * coef as a const array (constexpr), e.g.
80
+ *
81
+ * const double coef[] = {1.0, 2.0, 3.0, ...};
82
+ *
83
+ */
84
+ template <typename Packet, int N>
85
+ struct ppolevl {
86
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
87
+ const typename unpacket_traits<Packet>::type coeff[]) {
88
+ EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
89
+ return pmadd(ppolevl<Packet, N - 1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
90
+ }
91
+ };
92
+
93
+ template <typename Packet>
94
+ struct ppolevl<Packet, 0> {
95
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
96
+ const typename unpacket_traits<Packet>::type coeff[]) {
97
+ EIGEN_UNUSED_VARIABLE(x);
98
+ return pset1<Packet>(coeff[0]);
99
+ }
100
+ };
101
+
102
+ /* chbevl (modified for Eigen)
103
+ *
104
+ * Evaluate Chebyshev series
105
+ *
106
+ *
107
+ *
108
+ * SYNOPSIS:
109
+ *
110
+ * int N;
111
+ * Scalar x, y, coef[N], chebevl();
112
+ *
113
+ * y = chbevl( x, coef, N );
114
+ *
115
+ *
116
+ *
117
+ * DESCRIPTION:
118
+ *
119
+ * Evaluates the series
120
+ *
121
+ * N-1
122
+ * - '
123
+ * y = > coef[i] T (x/2)
124
+ * - i
125
+ * i=0
126
+ *
127
+ * of Chebyshev polynomials Ti at argument x/2.
128
+ *
129
+ * Coefficients are stored in reverse order, i.e. the zero
130
+ * order term is last in the array. Note N is the number of
131
+ * coefficients, not the order.
132
+ *
133
+ * If coefficients are for the interval a to b, x must
134
+ * have been transformed to x -> 2(2x - b - a)/(b-a) before
135
+ * entering the routine. This maps x from (a, b) to (-1, 1),
136
+ * over which the Chebyshev polynomials are defined.
137
+ *
138
+ * If the coefficients are for the inverted interval, in
139
+ * which (a, b) is mapped to (1/b, 1/a), the transformation
140
+ * required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity,
141
+ * this becomes x -> 4a/x - 1.
142
+ *
143
+ *
144
+ *
145
+ * SPEED:
146
+ *
147
+ * Taking advantage of the recurrence properties of the
148
+ * Chebyshev polynomials, the routine requires one more
149
+ * addition per loop than evaluating a nested polynomial of
150
+ * the same degree.
151
+ *
152
+ */
153
+
154
+ template <typename Packet, int N>
155
+ struct pchebevl {
156
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(Packet x,
157
+ const typename unpacket_traits<Packet>::type coef[]) {
158
+ typedef typename unpacket_traits<Packet>::type Scalar;
159
+ Packet b0 = pset1<Packet>(coef[0]);
160
+ Packet b1 = pset1<Packet>(static_cast<Scalar>(0.f));
161
+ Packet b2;
162
+
163
+ for (int i = 1; i < N; i++) {
164
+ b2 = b1;
165
+ b1 = b0;
166
+ b0 = psub(pmadd(x, b1, pset1<Packet>(coef[i])), b2);
167
+ }
168
+
169
+ return pmul(pset1<Packet>(static_cast<Scalar>(0.5f)), psub(b0, b2));
170
+ }
171
+ };
172
+
173
+ template <typename Packet>
174
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic_get_biased_exponent(const Packet& a) {
175
+ typedef typename unpacket_traits<Packet>::type Scalar;
176
+ typedef typename unpacket_traits<Packet>::integer_packet PacketI;
177
+ static constexpr int mantissa_bits = numext::numeric_limits<Scalar>::digits - 1;
178
+ return pcast<PacketI, Packet>(plogical_shift_right<mantissa_bits>(preinterpret<PacketI>(pabs(a))));
179
+ }
180
+
181
+ // Safely applies frexp, correctly handles denormals.
182
+ // Assumes IEEE floating point format.
183
+ template <typename Packet>
184
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet& a, Packet& exponent) {
185
+ typedef typename unpacket_traits<Packet>::type Scalar;
186
+ typedef typename make_unsigned<typename make_integer<Scalar>::type>::type ScalarUI;
187
+ static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
188
+ ExponentBits = TotalBits - MantissaBits - 1;
189
+
190
+ constexpr ScalarUI scalar_sign_mantissa_mask =
191
+ ~(((ScalarUI(1) << ExponentBits) - ScalarUI(1)) << MantissaBits); // ~0x7f800000
192
+ const Packet sign_mantissa_mask = pset1frombits<Packet>(static_cast<ScalarUI>(scalar_sign_mantissa_mask));
193
+ const Packet half = pset1<Packet>(Scalar(0.5));
194
+ const Packet zero = pzero(a);
195
+ const Packet normal_min = pset1<Packet>((numext::numeric_limits<Scalar>::min)()); // Minimum normal value, 2^-126
196
+
197
+ // To handle denormals, normalize by multiplying by 2^(int(MantissaBits)+1).
198
+ const Packet is_denormal = pcmp_lt(pabs(a), normal_min);
199
+ constexpr ScalarUI scalar_normalization_offset = ScalarUI(MantissaBits + 1); // 24
200
+ // The following cannot be constexpr because bfloat16(uint16_t) is not constexpr.
201
+ const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset)); // 2^24
202
+ const Packet normalization_factor = pset1<Packet>(scalar_normalization_factor);
203
+ const Packet normalized_a = pselect(is_denormal, pmul(a, normalization_factor), a);
204
+
205
+ // Determine exponent offset: -126 if normal, -126-24 if denormal
206
+ const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1) << (ExponentBits - 1)) - ScalarUI(2)); // -126
207
+ Packet exponent_offset = pset1<Packet>(scalar_exponent_offset);
208
+ const Packet normalization_offset = pset1<Packet>(-Scalar(scalar_normalization_offset)); // -24
209
+ exponent_offset = pselect(is_denormal, padd(exponent_offset, normalization_offset), exponent_offset);
210
+
211
+ // Determine exponent and mantissa from normalized_a.
212
+ exponent = pfrexp_generic_get_biased_exponent(normalized_a);
213
+ // Zero, Inf and NaN return 'a' unmodified, exponent is zero
214
+ // (technically the exponent is unspecified for inf/NaN, but GCC/Clang set it to zero)
215
+ const Scalar scalar_non_finite_exponent = Scalar((ScalarUI(1) << ExponentBits) - ScalarUI(1)); // 255
216
+ const Packet non_finite_exponent = pset1<Packet>(scalar_non_finite_exponent);
217
+ const Packet is_zero_or_not_finite = por(pcmp_eq(a, zero), pcmp_eq(exponent, non_finite_exponent));
218
+ const Packet m = pselect(is_zero_or_not_finite, a, por(pand(normalized_a, sign_mantissa_mask), half));
219
+ exponent = pselect(is_zero_or_not_finite, zero, padd(exponent, exponent_offset));
220
+ return m;
221
+ }
222
+
223
+ // Safely applies ldexp, correctly handles overflows, underflows and denormals.
224
+ // Assumes IEEE floating point format.
225
+ template <typename Packet>
226
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet& a, const Packet& exponent) {
227
+ // We want to return a * 2^exponent, allowing for all possible integer
228
+ // exponents without overflowing or underflowing in intermediate
229
+ // computations.
230
+ //
231
+ // Since 'a' and the output can be denormal, the maximum range of 'exponent'
232
+ // to consider for a float is:
233
+ // -255-23 -> 255+23
234
+ // Below -278 any finite float 'a' will become zero, and above +278 any
235
+ // finite float will become inf, including when 'a' is the smallest possible
236
+ // denormal.
237
+ //
238
+ // Unfortunately, 2^(278) cannot be represented using either one or two
239
+ // finite normal floats, so we must split the scale factor into at least
240
+ // three parts. It turns out to be faster to split 'exponent' into four
241
+ // factors, since [exponent>>2] is much faster to compute that [exponent/3].
242
+ //
243
+ // Set e = min(max(exponent, -278), 278);
244
+ // b = floor(e/4);
245
+ // out = ((((a * 2^(b)) * 2^(b)) * 2^(b)) * 2^(e-3*b))
246
+ //
247
+ // This will avoid any intermediate overflows and correctly handle 0, inf,
248
+ // NaN cases.
249
+ typedef typename unpacket_traits<Packet>::integer_packet PacketI;
250
+ typedef typename unpacket_traits<Packet>::type Scalar;
251
+ typedef typename unpacket_traits<PacketI>::type ScalarI;
252
+ static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
253
+ ExponentBits = TotalBits - MantissaBits - 1;
254
+
255
+ const Packet max_exponent = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) + ScalarI(MantissaBits - 1))); // 278
256
+ const PacketI bias = pset1<PacketI>((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1)); // 127
257
+ const PacketI e = pcast<Packet, PacketI>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
258
+ PacketI b = parithmetic_shift_right<2>(e); // floor(e/4);
259
+ Packet c = preinterpret<Packet>(plogical_shift_left<MantissaBits>(padd(b, bias))); // 2^b
260
+ Packet out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
261
+ b = pnmadd(pset1<PacketI>(3), b, e); // e - 3b
262
+ c = preinterpret<Packet>(plogical_shift_left<MantissaBits>(padd(b, bias))); // 2^(e-3*b)
263
+ out = pmul(out, c);
264
+ return out;
265
+ }
266
+
267
+ // Explicitly multiplies
268
+ // a * (2^e)
269
+ // clamping e to the range
270
+ // [NumTraits<Scalar>::min_exponent()-2, NumTraits<Scalar>::max_exponent()]
271
+ //
272
+ // This is approx 7x faster than pldexp_impl, but will prematurely over/underflow
273
+ // if 2^e doesn't fit into a normal floating-point Scalar.
274
+ //
275
+ // Assumes IEEE floating point format
276
+ template <typename Packet>
277
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_fast(const Packet& a, const Packet& exponent) {
278
+ typedef typename unpacket_traits<Packet>::integer_packet PacketI;
279
+ typedef typename unpacket_traits<Packet>::type Scalar;
280
+ typedef typename unpacket_traits<PacketI>::type ScalarI;
281
+ static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
282
+ ExponentBits = TotalBits - MantissaBits - 1;
283
+
284
+ const Packet bias = pset1<Packet>(Scalar((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1))); // 127
285
+ const Packet limit = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) - ScalarI(1))); // 255
286
+ // restrict biased exponent between 0 and 255 for float.
287
+ const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit)); // exponent + 127
288
+ // return a * (2^e)
289
+ return pmul(a, preinterpret<Packet>(plogical_shift_left<MantissaBits>(e)));
290
+ }
291
+
292
+ // This function implements a single step of Halley's iteration for
293
+ // computing x = y^(1/3):
294
+ // x_{k+1} = x_k - (x_k^3 - y) x_k / (2x_k^3 + y)
295
+ template <typename Packet>
296
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_halley_iteration_step(const Packet& x_k,
297
+ const Packet& y) {
298
+ typedef typename unpacket_traits<Packet>::type Scalar;
299
+ Packet x_k_cb = pmul(x_k, pmul(x_k, x_k));
300
+ Packet denom = pmadd(pset1<Packet>(Scalar(2)), x_k_cb, y);
301
+ Packet num = psub(x_k_cb, y);
302
+ Packet r = pdiv(num, denom);
303
+ return pnmadd(x_k, r, x_k);
304
+ }
305
+
306
+ // Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
307
+ // interval [0.125,1].
308
+ template <typename Packet>
309
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_decompose(const Packet& x, Packet& e_div3) {
310
+ typedef typename unpacket_traits<Packet>::type Scalar;
311
+ // Extract the significant s in the range [0.5,1) and exponent e, such that
312
+ // x = 2^e * s.
313
+ Packet e, s;
314
+ s = pfrexp(x, e);
315
+
316
+ // Split the exponent into a part divisible by 3 and the remainder.
317
+ // e = 3*e_div3 + e_mod3.
318
+ constexpr Scalar kOneThird = Scalar(1) / 3;
319
+ e_div3 = pceil(pmul(e, pset1<Packet>(kOneThird)));
320
+ Packet e_mod3 = pnmadd(pset1<Packet>(Scalar(3)), e_div3, e);
321
+
322
+ // Replace s by y = (s * 2^e_mod3).
323
+ return pldexp_fast(s, e_mod3);
324
+ }
325
+
326
+ template <typename Packet>
327
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_special_cases_and_sign(const Packet& x,
328
+ const Packet& abs_root) {
329
+ typedef typename unpacket_traits<Packet>::type Scalar;
330
+
331
+ // Set sign.
332
+ const Packet sign_mask = pset1<Packet>(Scalar(-0.0));
333
+ const Packet x_sign = pand(sign_mask, x);
334
+ Packet root = por(x_sign, abs_root);
335
+
336
+ // Pass non-finite and zero values of x straight through.
337
+ const Packet is_not_finite = por(pisinf(x), pisnan(x));
338
+ const Packet is_zero = pcmp_eq(pzero(x), x);
339
+ const Packet use_x = por(is_not_finite, is_zero);
340
+ return pselect(use_x, x, root);
341
+ }
342
+
343
+ // Generic implementation of cbrt(x) for float.
344
+ //
345
+ // The algorithm computes the cubic root of the input by first
346
+ // decomposing it into a exponent and significant
347
+ // x = s * 2^e.
348
+ //
349
+ // We can then write the cube root as
350
+ //
351
+ // x^(1/3) = 2^(e/3) * s^(1/3)
352
+ // = 2^((3*e_div3 + e_mod3)/3) * s^(1/3)
353
+ // = 2^(e_div3) * 2^(e_mod3/3) * s^(1/3)
354
+ // = 2^(e_div3) * (s * 2^e_mod3)^(1/3)
355
+ //
356
+ // where e_div3 = ceil(e/3) and e_mod3 = e - 3*e_div3.
357
+ //
358
+ // The cube root of the second term y = (s * 2^e_mod3)^(1/3) is coarsely
359
+ // approximated using a cubic polynomial and subsequently refined using a
360
+ // single step of Halley's iteration, and finally the two terms are combined
361
+ // using pldexp_fast.
362
+ //
363
+ // Note: Many alternatives exist for implementing cbrt. See, for example,
364
+ // the excellent discussion in Kahan's note:
365
+ // https://csclub.uwaterloo.ca/~pbarfuss/qbrt.pdf
366
+ // This particular implementation was found to be very fast and accurate
367
+ // among several alternatives tried, but is probably not "optimal" on all
368
+ // platforms.
369
+ //
370
+ // This is accurate to 2 ULP.
371
+ template <typename Packet>
372
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_float(const Packet& x) {
373
+ typedef typename unpacket_traits<Packet>::type Scalar;
374
+ static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
375
+
376
+ // Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
377
+ // interval [0.125,1].
378
+ Packet e_div3;
379
+ const Packet y = cbrt_decompose(pabs(x), e_div3);
380
+
381
+ // Compute initial approximation accurate to 5.22e-3.
382
+ // The polynomial was computed using Rminimax.
383
+ constexpr float alpha[] = {5.9220016002655029296875e-01f, -1.3859539031982421875e+00f, 1.4581282138824462890625e+00f,
384
+ 3.408401906490325927734375e-01f};
385
+ Packet r = ppolevl<Packet, 3>::run(y, alpha);
386
+
387
+ // Take one step of Halley's iteration.
388
+ r = cbrt_halley_iteration_step(r, y);
389
+
390
+ // Finally multiply by 2^(e_div3)
391
+ r = pldexp_fast(r, e_div3);
392
+
393
+ return cbrt_special_cases_and_sign(x, r);
394
+ }
395
+
396
+ // Generic implementation of cbrt(x) for double.
397
+ //
398
+ // The algorithm is identical to the one for float except that a different initial
399
+ // approximation is used for y^(1/3) and two Halley iteration steps are peformed.
400
+ //
401
+ // This is accurate to 1 ULP.
402
+ template <typename Packet>
403
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_double(const Packet& x) {
404
+ typedef typename unpacket_traits<Packet>::type Scalar;
405
+ static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
406
+
407
+ // Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
408
+ // interval [0.125,1].
409
+ Packet e_div3;
410
+ const Packet y = cbrt_decompose(pabs(x), e_div3);
411
+
412
+ // Compute initial approximation accurate to 0.016.
413
+ // The polynomial was computed using Rminimax.
414
+ constexpr double alpha[] = {-4.69470621553356115551736138513660989701747894287109375e-01,
415
+ 1.072314636518546304699839311069808900356292724609375e+00,
416
+ 3.81249427609571867048288140722434036433696746826171875e-01};
417
+ Packet r = ppolevl<Packet, 2>::run(y, alpha);
418
+
419
+ // Take two steps of Halley's iteration.
420
+ r = cbrt_halley_iteration_step(r, y);
421
+ r = cbrt_halley_iteration_step(r, y);
422
+
423
+ // Finally multiply by 2^(e_div3).
424
+ r = pldexp_fast(r, e_div3);
425
+ return cbrt_special_cases_and_sign(x, r);
426
+ }
427
+
428
+ // Natural or base 2 logarithm.
429
+ // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
430
+ // and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
431
+ // be easily approximated by a polynomial centered on m=1 for stability.
432
+ // TODO(gonnet): Further reduce the interval allowing for lower-degree
433
+ // polynomial interpolants -> ... -> profit!
434
+ template <typename Packet, bool base2>
435
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_float(const Packet _x) {
436
+ const Packet cst_1 = pset1<Packet>(1.0f);
437
+ const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0xff800000u));
438
+ const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x7f800000u));
439
+
440
+ const Packet cst_cephes_SQRTHF = pset1<Packet>(0.707106781186547524f);
441
+ Packet e, x;
442
+ // extract significant in the range [0.5,1) and exponent
443
+ x = pfrexp(_x, e);
444
+
445
+ // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
446
+ // and shift by -1. The values are then centered around 0, which improves
447
+ // the stability of the polynomial evaluation.
448
+ // if( x < SQRTHF ) {
449
+ // e -= 1;
450
+ // x = x + x - 1.0;
451
+ // } else { x = x - 1.0; }
452
+ Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
453
+ Packet tmp = pand(x, mask);
454
+ x = psub(x, cst_1);
455
+ e = psub(e, pand(cst_1, mask));
456
+ x = padd(x, tmp);
457
+
458
+ // Polynomial coefficients for rational r(x) = p(x)/q(x)
459
+ // approximating log(1+x) on [sqrt(0.5)-1;sqrt(2)-1].
460
+ constexpr float alpha[] = {0.18256296349849254f, 1.0000000190281063f, 1.0000000190281136f};
461
+ constexpr float beta[] = {0.049616247954120038f, 0.59923249590823520f, 1.4999999999999927f, 1.0f};
462
+
463
+ Packet p = ppolevl<Packet, 2>::run(x, alpha);
464
+ p = pmul(x, p);
465
+ Packet q = ppolevl<Packet, 3>::run(x, beta);
466
+ x = pdiv(p, q);
467
+
468
+ // Add the logarithm of the exponent back to the result of the interpolation.
469
+ if (base2) {
470
+ const Packet cst_log2e = pset1<Packet>(static_cast<float>(EIGEN_LOG2E));
471
+ x = pmadd(x, cst_log2e, e);
472
+ } else {
473
+ const Packet cst_ln2 = pset1<Packet>(static_cast<float>(EIGEN_LN2));
474
+ x = pmadd(e, cst_ln2, x);
475
+ }
476
+
477
+ Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
478
+ Packet iszero_mask = pcmp_eq(_x, pzero(_x));
479
+ Packet pos_inf_mask = pcmp_eq(_x, cst_pos_inf);
480
+ // Filter out invalid inputs, i.e.:
481
+ // - negative arg will be NAN
482
+ // - 0 will be -INF
483
+ // - +INF will be +INF
484
+ return pselect(iszero_mask, cst_minus_inf, por(pselect(pos_inf_mask, cst_pos_inf, x), invalid_mask));
485
+ }
486
+
487
+ template <typename Packet>
488
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_float(const Packet _x) {
489
+ return plog_impl_float<Packet, /* base2 */ false>(_x);
490
+ }
491
+
492
+ template <typename Packet>
493
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Packet _x) {
494
+ return plog_impl_float<Packet, /* base2 */ true>(_x);
495
+ }
496
+
497
+ /* Returns the base e (2.718...) or base 2 logarithm of x.
498
+ * The argument is separated into its exponent and fractional parts.
499
+ * The logarithm of the fraction in the interval [sqrt(1/2), sqrt(2)],
500
+ * is approximated by
501
+ *
502
+ * log(1+x) = x - 0.5 x**2 + x**3 P(x)/Q(x).
503
+ *
504
+ * for more detail see: http://www.netlib.org/cephes/
505
+ */
506
+ template <typename Packet, bool base2>
507
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_double(const Packet _x) {
508
+ Packet x = _x;
509
+
510
+ const Packet cst_1 = pset1<Packet>(1.0);
511
+ const Packet cst_neg_half = pset1<Packet>(-0.5);
512
+ const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
513
+ const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));
514
+
515
+ // Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x)
516
+ // 1/sqrt(2) <= x < sqrt(2)
517
+ const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
518
+ const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
519
+ const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
520
+ const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
521
+ const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
522
+ const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
523
+ const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
524
+
525
+ const Packet cst_cephes_log_q0 = pset1<Packet>(1.0);
526
+ const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
527
+ const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
528
+ const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
529
+ const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
530
+ const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
531
+
532
+ Packet e;
533
+ // extract significant in the range [0.5,1) and exponent
534
+ x = pfrexp(x, e);
535
+
536
+ // Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
537
+ // and shift by -1. The values are then centered around 0, which improves
538
+ // the stability of the polynomial evaluation.
539
+ // if( x < SQRTHF ) {
540
+ // e -= 1;
541
+ // x = x + x - 1.0;
542
+ // } else { x = x - 1.0; }
543
+ Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
544
+ Packet tmp = pand(x, mask);
545
+ x = psub(x, cst_1);
546
+ e = psub(e, pand(cst_1, mask));
547
+ x = padd(x, tmp);
548
+
549
+ Packet x2 = pmul(x, x);
550
+ Packet x3 = pmul(x2, x);
551
+
552
+ // Evaluate the polynomial approximant , probably to improve instruction-level parallelism.
553
+ // y = x - 0.5*x^2 + x^3 * polevl( x, P, 5 ) / p1evl( x, Q, 5 ) );
554
+ Packet y, y1, y_;
555
+ y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
556
+ y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
557
+ y = pmadd(y, x, cst_cephes_log_p2);
558
+ y1 = pmadd(y1, x, cst_cephes_log_p5);
559
+ y_ = pmadd(y, x3, y1);
560
+
561
+ y = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1);
562
+ y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
563
+ y = pmadd(y, x, cst_cephes_log_q2);
564
+ y1 = pmadd(y1, x, cst_cephes_log_q5);
565
+ y = pmadd(y, x3, y1);
566
+
567
+ y_ = pmul(y_, x3);
568
+ y = pdiv(y_, y);
569
+
570
+ y = pmadd(cst_neg_half, x2, y);
571
+ x = padd(x, y);
572
+
573
+ // Add the logarithm of the exponent back to the result of the interpolation.
574
+ if (base2) {
575
+ const Packet cst_log2e = pset1<Packet>(static_cast<double>(EIGEN_LOG2E));
576
+ x = pmadd(x, cst_log2e, e);
577
+ } else {
578
+ const Packet cst_ln2 = pset1<Packet>(static_cast<double>(EIGEN_LN2));
579
+ x = pmadd(e, cst_ln2, x);
580
+ }
581
+
582
+ Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
583
+ Packet iszero_mask = pcmp_eq(_x, pzero(_x));
584
+ Packet pos_inf_mask = pcmp_eq(_x, cst_pos_inf);
585
+ // Filter out invalid inputs, i.e.:
586
+ // - negative arg will be NAN
587
+ // - 0 will be -INF
588
+ // - +INF will be +INF
589
+ return pselect(iszero_mask, cst_minus_inf, por(pselect(pos_inf_mask, cst_pos_inf, x), invalid_mask));
590
+ }
591
+
592
+ template <typename Packet>
593
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_double(const Packet _x) {
594
+ return plog_impl_double<Packet, /* base2 */ false>(_x);
595
+ }
596
+
597
+ template <typename Packet>
598
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_double(const Packet _x) {
599
+ return plog_impl_double<Packet, /* base2 */ true>(_x);
600
+ }
601
+
602
+ /** \internal \returns log(1 + x) computed using W. Kahan's formula.
603
+ See: http://www.plunk.org/~hatch/rightway.php
604
+ */
605
+ template <typename Packet>
606
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p(const Packet& x) {
607
+ typedef typename unpacket_traits<Packet>::type ScalarType;
608
+ const Packet one = pset1<Packet>(ScalarType(1));
609
+ Packet xp1 = padd(x, one);
610
+ Packet small_mask = pcmp_eq(xp1, one);
611
+ Packet log1 = plog(xp1);
612
+ Packet inf_mask = pcmp_eq(xp1, log1);
613
+ Packet log_large = pmul(x, pdiv(log1, psub(xp1, one)));
614
+ return pselect(por(small_mask, inf_mask), x, log_large);
615
+ }
616
+
617
+ /** \internal \returns exp(x)-1 computed using W. Kahan's formula.
618
+ See: http://www.plunk.org/~hatch/rightway.php
619
+ */
620
+ template <typename Packet>
621
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_expm1(const Packet& x) {
622
+ typedef typename unpacket_traits<Packet>::type ScalarType;
623
+ const Packet one = pset1<Packet>(ScalarType(1));
624
+ const Packet neg_one = pset1<Packet>(ScalarType(-1));
625
+ Packet u = pexp(x);
626
+ Packet one_mask = pcmp_eq(u, one);
627
+ Packet u_minus_one = psub(u, one);
628
+ Packet neg_one_mask = pcmp_eq(u_minus_one, neg_one);
629
+ Packet logu = plog(u);
630
+ // The following comparison is to catch the case where
631
+ // exp(x) = +inf. It is written in this way to avoid having
632
+ // to form the constant +inf, which depends on the packet
633
+ // type.
634
+ Packet pos_inf_mask = pcmp_eq(logu, u);
635
+ Packet expm1 = pmul(u_minus_one, pdiv(x, logu));
636
+ expm1 = pselect(pos_inf_mask, u, expm1);
637
+ return pselect(one_mask, x, pselect(neg_one_mask, neg_one, expm1));
638
+ }
639
+
640
+ // Exponential function. Works by writing "x = m*log(2) + r" where
641
+ // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
642
+ // "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
643
+ // exp(r) is computed using a 6th order minimax polynomial approximation.
644
+ template <typename Packet>
645
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Packet _x) {
646
+ const Packet cst_zero = pset1<Packet>(0.0f);
647
+ const Packet cst_one = pset1<Packet>(1.0f);
648
+ const Packet cst_half = pset1<Packet>(0.5f);
649
+ const Packet cst_exp_hi = pset1<Packet>(88.723f);
650
+ const Packet cst_exp_lo = pset1<Packet>(-104.f);
651
+ const Packet cst_pldexp_threshold = pset1<Packet>(87.0);
652
+
653
+ const Packet cst_cephes_LOG2EF = pset1<Packet>(1.44269504088896341f);
654
+ const Packet cst_p2 = pset1<Packet>(0.49999988079071044921875f);
655
+ const Packet cst_p3 = pset1<Packet>(0.16666518151760101318359375f);
656
+ const Packet cst_p4 = pset1<Packet>(4.166965186595916748046875e-2f);
657
+ const Packet cst_p5 = pset1<Packet>(8.36894474923610687255859375e-3f);
658
+ const Packet cst_p6 = pset1<Packet>(1.37449637986719608306884765625e-3f);
659
+
660
+ // Clamp x.
661
+ Packet zero_mask = pcmp_lt(_x, cst_exp_lo);
662
+ Packet x = pmin(_x, cst_exp_hi);
663
+
664
+ // Express exp(x) as exp(m*ln(2) + r), start by extracting
665
+ // m = floor(x/ln(2) + 0.5).
666
+ Packet m = pfloor(pmadd(x, cst_cephes_LOG2EF, cst_half));
667
+
668
+ // Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
669
+ // subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
670
+ // truncation errors.
671
+ const Packet cst_cephes_exp_C1 = pset1<Packet>(-0.693359375f);
672
+ const Packet cst_cephes_exp_C2 = pset1<Packet>(2.12194440e-4f);
673
+ Packet r = pmadd(m, cst_cephes_exp_C1, x);
674
+ r = pmadd(m, cst_cephes_exp_C2, r);
675
+
676
+ // Evaluate the 6th order polynomial approximation to exp(r)
677
+ // with r in the interval [-ln(2)/2;ln(2)/2].
678
+ const Packet r2 = pmul(r, r);
679
+ Packet p_even = pmadd(r2, cst_p6, cst_p4);
680
+ const Packet p_odd = pmadd(r2, cst_p5, cst_p3);
681
+ p_even = pmadd(r2, p_even, cst_p2);
682
+ const Packet p_low = padd(r, cst_one);
683
+ Packet y = pmadd(r, p_odd, p_even);
684
+ y = pmadd(r2, y, p_low);
685
+
686
+ // Return 2^m * exp(r).
687
+ const Packet fast_pldexp_unsafe = pcmp_lt(cst_pldexp_threshold, pabs(x));
688
+ if (!predux_any(fast_pldexp_unsafe)) {
689
+ // For |x| <= 87, we know the result is not zero or inf, and we can safely use
690
+ // the fast version of pldexp.
691
+ return pmax(pldexp_fast(y, m), _x);
692
+ }
693
+ return pselect(zero_mask, cst_zero, pmax(pldexp(y, m), _x));
694
+ }
695
+
696
+ template <typename Packet>
697
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Packet _x) {
698
+ Packet x = _x;
699
+ const Packet cst_zero = pset1<Packet>(0.0);
700
+ const Packet cst_1 = pset1<Packet>(1.0);
701
+ const Packet cst_2 = pset1<Packet>(2.0);
702
+ const Packet cst_half = pset1<Packet>(0.5);
703
+
704
+ const Packet cst_exp_hi = pset1<Packet>(709.784);
705
+ const Packet cst_exp_lo = pset1<Packet>(-745.519);
706
+ const Packet cst_pldexp_threshold = pset1<Packet>(708.0);
707
+ const Packet cst_cephes_LOG2EF = pset1<Packet>(1.4426950408889634073599);
708
+ const Packet cst_cephes_exp_p0 = pset1<Packet>(1.26177193074810590878e-4);
709
+ const Packet cst_cephes_exp_p1 = pset1<Packet>(3.02994407707441961300e-2);
710
+ const Packet cst_cephes_exp_p2 = pset1<Packet>(9.99999999999999999910e-1);
711
+ const Packet cst_cephes_exp_q0 = pset1<Packet>(3.00198505138664455042e-6);
712
+ const Packet cst_cephes_exp_q1 = pset1<Packet>(2.52448340349684104192e-3);
713
+ const Packet cst_cephes_exp_q2 = pset1<Packet>(2.27265548208155028766e-1);
714
+ const Packet cst_cephes_exp_q3 = pset1<Packet>(2.00000000000000000009e0);
715
+ const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693145751953125);
716
+ const Packet cst_cephes_exp_C2 = pset1<Packet>(1.42860682030941723212e-6);
717
+
718
+ Packet tmp, fx;
719
+
720
+ // clamp x
721
+ Packet zero_mask = pcmp_lt(_x, cst_exp_lo);
722
+ x = pmin(x, cst_exp_hi);
723
+ // Express exp(x) as exp(g + n*log(2)).
724
+ fx = pmadd(cst_cephes_LOG2EF, x, cst_half);
725
+
726
+ // Get the integer modulus of log(2), i.e. the "n" described above.
727
+ fx = pfloor(fx);
728
+
729
+ // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
730
+ // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
731
+ // digits right.
732
+ tmp = pmul(fx, cst_cephes_exp_C1);
733
+ Packet z = pmul(fx, cst_cephes_exp_C2);
734
+ x = psub(x, tmp);
735
+ x = psub(x, z);
736
+
737
+ Packet x2 = pmul(x, x);
738
+
739
+ // Evaluate the numerator polynomial of the rational interpolant.
740
+ Packet px = cst_cephes_exp_p0;
741
+ px = pmadd(px, x2, cst_cephes_exp_p1);
742
+ px = pmadd(px, x2, cst_cephes_exp_p2);
743
+ px = pmul(px, x);
744
+
745
+ // Evaluate the denominator polynomial of the rational interpolant.
746
+ Packet qx = cst_cephes_exp_q0;
747
+ qx = pmadd(qx, x2, cst_cephes_exp_q1);
748
+ qx = pmadd(qx, x2, cst_cephes_exp_q2);
749
+ qx = pmadd(qx, x2, cst_cephes_exp_q3);
750
+
751
+ // I don't really get this bit, copied from the SSE2 routines, so...
752
+ // TODO(gonnet): Figure out what is going on here, perhaps find a better
753
+ // rational interpolant?
754
+ x = pdiv(px, psub(qx, px));
755
+ x = pmadd(cst_2, x, cst_1);
756
+
757
+ // Construct the result 2^n * exp(g) = e * x. The max is used to catch
758
+ // non-finite values in the input.
759
+ const Packet fast_pldexp_unsafe = pcmp_lt(cst_pldexp_threshold, pabs(_x));
760
+ if (!predux_any(fast_pldexp_unsafe)) {
761
+ // For |x| <= 708, we know the result is not zero or inf, and we can safely use
762
+ // the fast version of pldexp.
763
+ return pmax(pldexp_fast(x, fx), _x);
764
+ }
765
+ return pselect(zero_mask, cst_zero, pmax(pldexp(x, fx), _x));
766
+ }
767
+
768
+ // The following code is inspired by the following stack-overflow answer:
769
+ // https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
770
+ // It has been largely optimized:
771
+ // - By-pass calls to frexp.
772
+ // - Aligned loads of required 96 bits of 2/pi. This is accomplished by
773
+ // (1) balancing the mantissa and exponent to the required bits of 2/pi are
774
+ // aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi.
775
+ // - Avoid a branch in rounding and extraction of the remaining fractional part.
776
+ // Overall, I measured a speed up higher than x2 on x86-64.
777
+ inline float trig_reduce_huge(float xf, Eigen::numext::int32_t* quadrant) {
778
+ using Eigen::numext::int32_t;
779
+ using Eigen::numext::int64_t;
780
+ using Eigen::numext::uint32_t;
781
+ using Eigen::numext::uint64_t;
782
+
783
+ const double pio2_62 = 3.4061215800865545e-19; // pi/2 * 2^-62
784
+ const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point format
785
+
786
+ // 192 bits of 2/pi for Payne-Hanek reduction
787
+ // Bits are introduced by packet of 8 to enable aligned reads.
788
+ static const uint32_t two_over_pi[] = {
789
+ 0x00000028, 0x000028be, 0x0028be60, 0x28be60db, 0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a, 0x91054a7f,
790
+ 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4, 0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770, 0x4d377036, 0x377036d8,
791
+ 0x7036d8a5, 0x36d8a566, 0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410, 0x10e41000, 0xe4100000};
792
+
793
+ uint32_t xi = numext::bit_cast<uint32_t>(xf);
794
+ // Below, -118 = -126 + 8.
795
+ // -126 is to get the exponent,
796
+ // +8 is to enable alignment of 2/pi's bits on 8 bits.
797
+ // This is possible because the fractional part of x as only 24 meaningful bits.
798
+ uint32_t e = (xi >> 23) - 118;
799
+ // Extract the mantissa and shift it to align it wrt the exponent
800
+ xi = ((xi & 0x007fffffu) | 0x00800000u) << (e & 0x7);
801
+
802
+ uint32_t i = e >> 3;
803
+ uint32_t twoopi_1 = two_over_pi[i - 1];
804
+ uint32_t twoopi_2 = two_over_pi[i + 3];
805
+ uint32_t twoopi_3 = two_over_pi[i + 7];
806
+
807
+ // Compute x * 2/pi in 2.62-bit fixed-point format.
808
+ uint64_t p;
809
+ p = uint64_t(xi) * twoopi_3;
810
+ p = uint64_t(xi) * twoopi_2 + (p >> 32);
811
+ p = (uint64_t(xi * twoopi_1) << 32) + p;
812
+
813
+ // Round to nearest: add 0.5 and extract integral part.
814
+ uint64_t q = (p + zero_dot_five) >> 62;
815
+ *quadrant = int(q);
816
+ // Now it remains to compute "r = x - q*pi/2" with high accuracy,
817
+ // since we have p=x/(pi/2) with high accuracy, we can more efficiently compute r as:
818
+ // r = (p-q)*pi/2,
819
+ // where the product can be be carried out with sufficient accuracy using double precision.
820
+ p -= q << 62;
821
+ return float(double(int64_t(p)) * pio2_62);
822
+ }
823
+
824
+ template <bool ComputeSine, typename Packet, bool ComputeBoth = false>
825
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
826
+ #if EIGEN_COMP_GNUC_STRICT
827
+ __attribute__((optimize("-fno-unsafe-math-optimizations")))
828
+ #endif
829
+ Packet
830
+ psincos_float(const Packet& _x) {
831
+ typedef typename unpacket_traits<Packet>::integer_packet PacketI;
832
+
833
+ const Packet cst_2oPI = pset1<Packet>(0.636619746685028076171875f); // 2/PI
834
+ const Packet cst_rounding_magic = pset1<Packet>(12582912); // 2^23 for rounding
835
+ const PacketI csti_1 = pset1<PacketI>(1);
836
+ const Packet cst_sign_mask = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x80000000u));
837
+
838
+ Packet x = pabs(_x);
839
+
840
+ // Scale x by 2/Pi to find x's octant.
841
+ Packet y = pmul(x, cst_2oPI);
842
+
843
+ // Rounding trick to find nearest integer:
844
+ Packet y_round = padd(y, cst_rounding_magic);
845
+ EIGEN_OPTIMIZATION_BARRIER(y_round)
846
+ PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
847
+ y = psub(y_round, cst_rounding_magic); // nearest integer to x * (2/pi)
848
+
849
+ // Subtract y * Pi/2 to reduce x to the interval -Pi/4 <= x <= +Pi/4
850
+ // using "Extended precision modular arithmetic"
851
+ #if defined(EIGEN_VECTORIZE_FMA)
852
+ // This version requires true FMA for high accuracy.
853
+ // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
854
+ const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
855
+ x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
856
+ x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
857
+ x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
858
+ #else
859
+ // Without true FMA, the previous set of coefficients maintain 1ULP accuracy
860
+ // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7.
861
+ // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs.
862
+
863
+ // The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively.
864
+ // and 2 ULP up to:
865
+ const float huge_th = ComputeSine ? 25966.f : 18838.f;
866
+ x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
867
+ EIGEN_OPTIMIZATION_BARRIER(x)
868
+ x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
869
+ EIGEN_OPTIMIZATION_BARRIER(x)
870
+ x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
871
+ x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee
872
+
873
+ // For the record, the following set of coefficients maintain 2ULP up
874
+ // to a slightly larger range:
875
+ // const float huge_th = ComputeSine ? 51981.f : 39086.125f;
876
+ // but it slightly fails to maintain 1ULP for two values of sin below pi.
877
+ // x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
878
+ // x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
879
+ // x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
880
+ // x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
881
+
882
+ // For the record, with only 3 iterations it is possible to maintain
883
+ // 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
884
+ // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
885
+ #endif
886
+
887
+ if (predux_any(pcmp_le(pset1<Packet>(huge_th), pabs(_x)))) {
888
+ const int PacketSize = unpacket_traits<Packet>::size;
889
+ EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize];
890
+ EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize];
891
+ EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Eigen::numext::int32_t y_int2[PacketSize];
892
+ pstoreu(vals, pabs(_x));
893
+ pstoreu(x_cpy, x);
894
+ pstoreu(y_int2, y_int);
895
+ for (int k = 0; k < PacketSize; ++k) {
896
+ float val = vals[k];
897
+ if (val >= huge_th && (numext::isfinite)(val)) x_cpy[k] = trig_reduce_huge(val, &y_int2[k]);
898
+ }
899
+ x = ploadu<Packet>(x_cpy);
900
+ y_int = ploadu<PacketI>(y_int2);
901
+ }
902
+
903
+ // Compute the sign to apply to the polynomial.
904
+ // sin: sign = second_bit(y_int) xor signbit(_x)
905
+ // cos: sign = second_bit(y_int+1)
906
+ Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)))
907
+ : preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
908
+ sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
909
+
910
+ // Get the polynomial selection mask from the second bit of y_int
911
+ // We'll calculate both (sin and cos) polynomials and then select from the two.
912
+ Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
913
+
914
+ Packet x2 = pmul(x, x);
915
+
916
+ // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4)
917
+ Packet y1 = pset1<Packet>(2.4372266125283204019069671630859375e-05f);
918
+ y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f));
919
+ y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f));
920
+ y1 = pmadd(y1, x2, pset1<Packet>(-0.5f));
921
+ y1 = pmadd(y1, x2, pset1<Packet>(1.f));
922
+
923
+ // Evaluate the sin(x) polynomial. (Pi/4 <= x <= Pi/4)
924
+ // octave/matlab code to compute those coefficients:
925
+ // x = (0:0.0001:pi/4)';
926
+ // A = [x.^3 x.^5 x.^7];
927
+ // w = ((1.-(x/(pi/4)).^2).^5)*2000+1; # weights trading relative accuracy
928
+ // c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1
929
+ // printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1))
930
+ //
931
+ Packet y2 = pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
932
+ y2 = pmadd(y2, x2, pset1<Packet>(0.0083326873655616851693794799871284340042620897293090820312500000f));
933
+ y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f));
934
+ y2 = pmul(y2, x2);
935
+ y2 = pmadd(y2, x, x);
936
+
937
+ // Select the correct result from the two polynomials.
938
+ if (ComputeBoth) {
939
+ Packet peven = peven_mask(x);
940
+ Packet ysin = pselect(poly_mask, y2, y1);
941
+ Packet ycos = pselect(poly_mask, y1, y2);
942
+ Packet sign_bit_sin = pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)));
943
+ Packet sign_bit_cos = preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
944
+ sign_bit_sin = pand(sign_bit_sin, cst_sign_mask); // clear all but left most bit
945
+ sign_bit_cos = pand(sign_bit_cos, cst_sign_mask); // clear all but left most bit
946
+ y = pselect(peven, pxor(ysin, sign_bit_sin), pxor(ycos, sign_bit_cos));
947
+ } else {
948
+ y = ComputeSine ? pselect(poly_mask, y2, y1) : pselect(poly_mask, y1, y2);
949
+ y = pxor(y, sign_bit);
950
+ }
951
+ // Update the sign and filter huge inputs
952
+ return y;
953
+ }
954
+
955
+ template <typename Packet>
956
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_float(const Packet& x) {
957
+ return psincos_float<true>(x);
958
+ }
959
+
960
+ template <typename Packet>
961
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x) {
962
+ return psincos_float<false>(x);
963
+ }
964
+
965
+ // Trigonometric argument reduction for double for inputs smaller than 15.
966
+ // Reduces trigonometric arguments for double inputs where x < 15. Given an argument x and its corresponding quadrant
967
+ // count n, the function computes and returns the reduced argument t such that x = n * pi/2 + t.
968
+ template <typename Packet>
969
+ Packet trig_reduce_small_double(const Packet& x, const Packet& q) {
970
+ // Pi/2 split into 2 values
971
+ const Packet cst_pio2_a = pset1<Packet>(-1.570796325802803);
972
+ const Packet cst_pio2_b = pset1<Packet>(-9.920935184482005e-10);
973
+
974
+ Packet t;
975
+ t = pmadd(cst_pio2_a, q, x);
976
+ t = pmadd(cst_pio2_b, q, t);
977
+ return t;
978
+ }
979
+
980
+ // Trigonometric argument reduction for double for inputs smaller than 1e14.
981
+ // Reduces trigonometric arguments for double inputs where x < 1e14. Given an argument x and its corresponding quadrant
982
+ // count n, the function computes and returns the reduced argument t such that x = n * pi/2 + t.
983
+ template <typename Packet>
984
+ Packet trig_reduce_medium_double(const Packet& x, const Packet& q_high, const Packet& q_low) {
985
+ // Pi/2 split into 4 values
986
+ const Packet cst_pio2_a = pset1<Packet>(-1.570796325802803);
987
+ const Packet cst_pio2_b = pset1<Packet>(-9.920935184482005e-10);
988
+ const Packet cst_pio2_c = pset1<Packet>(-6.123234014771656e-17);
989
+ const Packet cst_pio2_d = pset1<Packet>(1.903488962019325e-25);
990
+
991
+ Packet t;
992
+ t = pmadd(cst_pio2_a, q_high, x);
993
+ t = pmadd(cst_pio2_a, q_low, t);
994
+ t = pmadd(cst_pio2_b, q_high, t);
995
+ t = pmadd(cst_pio2_b, q_low, t);
996
+ t = pmadd(cst_pio2_c, q_high, t);
997
+ t = pmadd(cst_pio2_c, q_low, t);
998
+ t = pmadd(cst_pio2_d, padd(q_low, q_high), t);
999
+ return t;
1000
+ }
1001
+
1002
+ template <bool ComputeSine, typename Packet, bool ComputeBoth = false>
1003
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
1004
+ #if EIGEN_COMP_GNUC_STRICT
1005
+ __attribute__((optimize("-fno-unsafe-math-optimizations")))
1006
+ #endif
1007
+ Packet
1008
+ psincos_double(const Packet& x) {
1009
+ typedef typename unpacket_traits<Packet>::integer_packet PacketI;
1010
+ typedef typename unpacket_traits<PacketI>::type ScalarI;
1011
+
1012
+ const Packet cst_sign_mask = pset1frombits<Packet>(static_cast<Eigen::numext::uint64_t>(0x8000000000000000u));
1013
+
1014
+ // If the argument is smaller than this value, use a simpler argument reduction
1015
+ const double small_th = 15;
1016
+ // If the argument is bigger than this value, use the non-vectorized std version
1017
+ const double huge_th = 1e14;
1018
+
1019
+ const Packet cst_2oPI = pset1<Packet>(0.63661977236758134307553505349006); // 2/PI
1020
+ // Integer Packet constants
1021
+ const PacketI cst_one = pset1<PacketI>(ScalarI(1));
1022
+ // Constant for splitting
1023
+ const Packet cst_split = pset1<Packet>(1 << 24);
1024
+
1025
+ Packet x_abs = pabs(x);
1026
+
1027
+ // Scale x by 2/Pi
1028
+ PacketI q_int;
1029
+ Packet s;
1030
+
1031
+ // TODO Implement huge angle argument reduction
1032
+ if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(small_th), x_abs)))) {
1033
+ Packet q_high = pmul(pfloor(pmul(x_abs, pdiv(cst_2oPI, cst_split))), cst_split);
1034
+ Packet q_low_noround = psub(pmul(x_abs, cst_2oPI), q_high);
1035
+ q_int = pcast<Packet, PacketI>(padd(q_low_noround, pset1<Packet>(0.5)));
1036
+ Packet q_low = pcast<PacketI, Packet>(q_int);
1037
+ s = trig_reduce_medium_double(x_abs, q_high, q_low);
1038
+ } else {
1039
+ Packet qval_noround = pmul(x_abs, cst_2oPI);
1040
+ q_int = pcast<Packet, PacketI>(padd(qval_noround, pset1<Packet>(0.5)));
1041
+ Packet q = pcast<PacketI, Packet>(q_int);
1042
+ s = trig_reduce_small_double(x_abs, q);
1043
+ }
1044
+
1045
+ // All the upcoming approximating polynomials have even exponents
1046
+ Packet ss = pmul(s, s);
1047
+
1048
+ // Padé approximant of cos(x)
1049
+ // Assuring < 1 ULP error on the interval [-pi/4, pi/4]
1050
+ // cos(x) ~= (80737373*x^8 - 13853547000*x^6 + 727718024880*x^4 - 11275015752000*x^2 + 23594700729600)/(147173*x^8 +
1051
+ // 39328920*x^6 + 5772800880*x^4 + 522334612800*x^2 + 23594700729600)
1052
+ // MATLAB code to compute those coefficients:
1053
+ // syms x;
1054
+ // cosf = @(x) cos(x);
1055
+ // pade_cosf = pade(cosf(x), x, 0, 'Order', 8)
1056
+ Packet sc1_num = pmadd(ss, pset1<Packet>(80737373), pset1<Packet>(-13853547000));
1057
+ Packet sc2_num = pmadd(sc1_num, ss, pset1<Packet>(727718024880));
1058
+ Packet sc3_num = pmadd(sc2_num, ss, pset1<Packet>(-11275015752000));
1059
+ Packet sc4_num = pmadd(sc3_num, ss, pset1<Packet>(23594700729600));
1060
+ Packet sc1_denum = pmadd(ss, pset1<Packet>(147173), pset1<Packet>(39328920));
1061
+ Packet sc2_denum = pmadd(sc1_denum, ss, pset1<Packet>(5772800880));
1062
+ Packet sc3_denum = pmadd(sc2_denum, ss, pset1<Packet>(522334612800));
1063
+ Packet sc4_denum = pmadd(sc3_denum, ss, pset1<Packet>(23594700729600));
1064
+ Packet scos = pdiv(sc4_num, sc4_denum);
1065
+
1066
+ // Padé approximant of sin(x)
1067
+ // Assuring < 1 ULP error on the interval [-pi/4, pi/4]
1068
+ // sin(x) ~= (x*(4585922449*x^8 - 1066023933480*x^6 + 83284044283440*x^4 - 2303682236856000*x^2 +
1069
+ // 15605159573203200))/(45*(1029037*x^8 + 345207016*x^6 + 61570292784*x^4 + 6603948711360*x^2 + 346781323848960))
1070
+ // MATLAB code to compute those coefficients:
1071
+ // syms x;
1072
+ // sinf = @(x) sin(x);
1073
+ // pade_sinf = pade(sinf(x), x, 0, 'Order', 8, 'OrderMode', 'relative')
1074
+ Packet ss1_num = pmadd(ss, pset1<Packet>(4585922449), pset1<Packet>(-1066023933480));
1075
+ Packet ss2_num = pmadd(ss1_num, ss, pset1<Packet>(83284044283440));
1076
+ Packet ss3_num = pmadd(ss2_num, ss, pset1<Packet>(-2303682236856000));
1077
+ Packet ss4_num = pmadd(ss3_num, ss, pset1<Packet>(15605159573203200));
1078
+ Packet ss1_denum = pmadd(ss, pset1<Packet>(1029037), pset1<Packet>(345207016));
1079
+ Packet ss2_denum = pmadd(ss1_denum, ss, pset1<Packet>(61570292784));
1080
+ Packet ss3_denum = pmadd(ss2_denum, ss, pset1<Packet>(6603948711360));
1081
+ Packet ss4_denum = pmadd(ss3_denum, ss, pset1<Packet>(346781323848960));
1082
+ Packet ssin = pdiv(pmul(s, ss4_num), pmul(pset1<Packet>(45), ss4_denum));
1083
+
1084
+ Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(q_int, cst_one), pzero(q_int)));
1085
+
1086
+ Packet sign_sin = pxor(x, preinterpret<Packet>(plogical_shift_left<62>(q_int)));
1087
+ Packet sign_cos = preinterpret<Packet>(plogical_shift_left<62>(padd(q_int, cst_one)));
1088
+ Packet sign_bit, sFinalRes;
1089
+ if (ComputeBoth) {
1090
+ Packet peven = peven_mask(x);
1091
+ sign_bit = pselect((s), sign_sin, sign_cos);
1092
+ sFinalRes = pselect(pxor(peven, poly_mask), ssin, scos);
1093
+ } else {
1094
+ sign_bit = ComputeSine ? sign_sin : sign_cos;
1095
+ sFinalRes = ComputeSine ? pselect(poly_mask, ssin, scos) : pselect(poly_mask, scos, ssin);
1096
+ }
1097
+ sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
1098
+ sFinalRes = pxor(sFinalRes, sign_bit);
1099
+
1100
+ // If the inputs values are higher than that a value that the argument reduction can currently address, compute them
1101
+ // using std::sin and std::cos
1102
+ // TODO Remove it when huge angle argument reduction is implemented
1103
+ if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(huge_th), x_abs)))) {
1104
+ const int PacketSize = unpacket_traits<Packet>::size;
1105
+ EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) double sincos_vals[PacketSize];
1106
+ EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) double x_cpy[PacketSize];
1107
+ pstoreu(x_cpy, x);
1108
+ pstoreu(sincos_vals, sFinalRes);
1109
+ for (int k = 0; k < PacketSize; ++k) {
1110
+ double val = x_cpy[k];
1111
+ if (std::abs(val) > huge_th && (numext::isfinite)(val)) {
1112
+ if (ComputeBoth)
1113
+ sincos_vals[k] = k % 2 == 0 ? std::sin(val) : std::cos(val);
1114
+ else
1115
+ sincos_vals[k] = ComputeSine ? std::sin(val) : std::cos(val);
1116
+ }
1117
+ }
1118
+ sFinalRes = ploadu<Packet>(sincos_vals);
1119
+ }
1120
+ return sFinalRes;
1121
+ }
1122
+
1123
+ template <typename Packet>
1124
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Packet& x) {
1125
+ return psincos_double<true>(x);
1126
+ }
1127
+
1128
+ template <typename Packet>
1129
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_double(const Packet& x) {
1130
+ return psincos_double<false>(x);
1131
+ }
1132
+
1133
+ // Generic implementation of acos(x).
1134
+ template <typename Packet>
1135
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos_float(const Packet& x_in) {
1136
+ typedef typename unpacket_traits<Packet>::type Scalar;
1137
+ static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
1138
+
1139
+ const Packet cst_one = pset1<Packet>(Scalar(1));
1140
+ const Packet cst_pi = pset1<Packet>(Scalar(EIGEN_PI));
1141
+ const Packet p6 = pset1<Packet>(Scalar(2.36423197202384471893310546875e-3));
1142
+ const Packet p5 = pset1<Packet>(Scalar(-1.1368644423782825469970703125e-2));
1143
+ const Packet p4 = pset1<Packet>(Scalar(2.717843465507030487060546875e-2));
1144
+ const Packet p3 = pset1<Packet>(Scalar(-4.8969544470310211181640625e-2));
1145
+ const Packet p2 = pset1<Packet>(Scalar(8.8804088532924652099609375e-2));
1146
+ const Packet p1 = pset1<Packet>(Scalar(-0.214591205120086669921875));
1147
+ const Packet p0 = pset1<Packet>(Scalar(1.57079637050628662109375));
1148
+
1149
+ // For x in [0:1], we approximate acos(x)/sqrt(1-x), which is a smooth
1150
+ // function, by a 6'th order polynomial.
1151
+ // For x in [-1:0) we use that acos(-x) = pi - acos(x).
1152
+ const Packet neg_mask = psignbit(x_in);
1153
+ const Packet abs_x = pabs(x_in);
1154
+
1155
+ // Evaluate the polynomial using Horner's rule:
1156
+ // P(x) = p0 + x * (p1 + x * (p2 + ... (p5 + x * p6)) ... ) .
1157
+ // We evaluate even and odd terms independently to increase
1158
+ // instruction level parallelism.
1159
+ Packet x2 = pmul(x_in, x_in);
1160
+ Packet p_even = pmadd(p6, x2, p4);
1161
+ Packet p_odd = pmadd(p5, x2, p3);
1162
+ p_even = pmadd(p_even, x2, p2);
1163
+ p_odd = pmadd(p_odd, x2, p1);
1164
+ p_even = pmadd(p_even, x2, p0);
1165
+ Packet p = pmadd(p_odd, abs_x, p_even);
1166
+
1167
+ // The polynomial approximates acos(x)/sqrt(1-x), so
1168
+ // multiply by sqrt(1-x) to get acos(x).
1169
+ // Conveniently returns NaN for arguments outside [-1:1].
1170
+ Packet denom = psqrt(psub(cst_one, abs_x));
1171
+ Packet result = pmul(denom, p);
1172
+ // Undo mapping for negative arguments.
1173
+ return pselect(neg_mask, psub(cst_pi, result), result);
1174
+ }
1175
+
1176
+ // Generic implementation of asin(x).
1177
+ template <typename Packet>
1178
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Packet& x_in) {
1179
+ typedef typename unpacket_traits<Packet>::type Scalar;
1180
+ static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
1181
+
1182
+ constexpr float kPiOverTwo = static_cast<float>(EIGEN_PI / 2);
1183
+
1184
+ const Packet cst_half = pset1<Packet>(0.5f);
1185
+ const Packet cst_one = pset1<Packet>(1.0f);
1186
+ const Packet cst_two = pset1<Packet>(2.0f);
1187
+ const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
1188
+
1189
+ const Packet abs_x = pabs(x_in);
1190
+ const Packet sign_mask = pandnot(x_in, abs_x);
1191
+ const Packet invalid_mask = pcmp_lt(cst_one, abs_x);
1192
+
1193
+ // For arguments |x| > 0.5, we map x back to [0:0.5] using
1194
+ // the transformation x_large = sqrt(0.5*(1-x)), and use the
1195
+ // identity
1196
+ // asin(x) = pi/2 - 2 * asin( sqrt( 0.5 * (1 - x)))
1197
+
1198
+ const Packet x_large = psqrt(pnmadd(cst_half, abs_x, cst_half));
1199
+ const Packet large_mask = pcmp_lt(cst_half, abs_x);
1200
+ const Packet x = pselect(large_mask, x_large, abs_x);
1201
+ const Packet x2 = pmul(x, x);
1202
+
1203
+ // For |x| < 0.5 approximate asin(x)/x by an 8th order polynomial with
1204
+ // even terms only.
1205
+ constexpr float alpha[] = {5.08838854730129241943359375e-2f, 3.95139865577220916748046875e-2f,
1206
+ 7.550220191478729248046875e-2f, 0.16664917767047882080078125f, 1.00000011920928955078125f};
1207
+ Packet p = ppolevl<Packet, 4>::run(x2, alpha);
1208
+ p = pmul(p, x);
1209
+
1210
+ const Packet p_large = pnmadd(cst_two, p, cst_pi_over_two);
1211
+ p = pselect(large_mask, p_large, p);
1212
+ // Flip the sign for negative arguments.
1213
+ p = pxor(p, sign_mask);
1214
+ // Return NaN for arguments outside [-1:1].
1215
+ return por(invalid_mask, p);
1216
+ }
1217
+
1218
+ template <typename Scalar>
1219
+ struct patan_reduced {
1220
+ template <typename Packet>
1221
+ static EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet run(const Packet& x);
1222
+ };
1223
+
1224
+ template <>
1225
+ template <typename Packet>
1226
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced<double>::run(const Packet& x) {
1227
+ constexpr double alpha[] = {2.6667153866462208e-05, 3.0917513112462781e-03, 5.2574296781008604e-02,
1228
+ 3.0409318473444424e-01, 7.5365702534987022e-01, 8.2704055405494614e-01,
1229
+ 3.3004361289279920e-01};
1230
+
1231
+ constexpr double beta[] = {
1232
+ 2.7311202462436667e-04, 1.0899150928962708e-02, 1.1548932646420353e-01, 4.9716458728465573e-01, 1.0,
1233
+ 9.3705509168587852e-01, 3.3004361289279920e-01};
1234
+
1235
+ Packet x2 = pmul(x, x);
1236
+ Packet p = ppolevl<Packet, 6>::run(x2, alpha);
1237
+ Packet q = ppolevl<Packet, 6>::run(x2, beta);
1238
+ return pmul(x, pdiv(p, q));
1239
+ }
1240
+
1241
+ // Computes elementwise atan(x) for x in [-1:1] with 2 ulp accuracy.
1242
+ template <>
1243
+ template <typename Packet>
1244
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced<float>::run(const Packet& x) {
1245
+ constexpr float alpha[] = {1.12026982009410858154296875e-01f, 7.296695709228515625e-01f, 8.109951019287109375e-01f};
1246
+
1247
+ constexpr float beta[] = {1.00917108356952667236328125e-02f, 2.8318560123443603515625e-01f, 1.0f,
1248
+ 8.109951019287109375e-01f};
1249
+
1250
+ Packet x2 = pmul(x, x);
1251
+ Packet p = ppolevl<Packet, 2>::run(x2, alpha);
1252
+ Packet q = ppolevl<Packet, 3>::run(x2, beta);
1253
+ return pmul(x, pdiv(p, q));
1254
+ }
1255
+
1256
+ template <typename Packet>
1257
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_atan(const Packet& x_in) {
1258
+ typedef typename unpacket_traits<Packet>::type Scalar;
1259
+
1260
+ constexpr Scalar kPiOverTwo = static_cast<Scalar>(EIGEN_PI / 2);
1261
+
1262
+ const Packet cst_signmask = pset1<Packet>(Scalar(-0.0));
1263
+ const Packet cst_one = pset1<Packet>(Scalar(1));
1264
+ const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
1265
+
1266
+ // "Large": For |x| > 1, use atan(1/x) = sign(x)*pi/2 - atan(x).
1267
+ // "Small": For |x| <= 1, approximate atan(x) directly by a polynomial
1268
+ // calculated using Rminimax.
1269
+
1270
+ const Packet abs_x = pabs(x_in);
1271
+ const Packet x_signmask = pand(x_in, cst_signmask);
1272
+ const Packet large_mask = pcmp_lt(cst_one, abs_x);
1273
+ const Packet x = pselect(large_mask, preciprocal(abs_x), abs_x);
1274
+ const Packet p = patan_reduced<Scalar>::run(x);
1275
+ // Apply transformations according to the range reduction masks.
1276
+ Packet result = pselect(large_mask, psub(cst_pi_over_two, p), p);
1277
+ // Return correct sign
1278
+ return pxor(result, x_signmask);
1279
+ }
1280
+
1281
+ /** \internal \returns the hyperbolic tan of \a a (coeff-wise)
1282
+ Doesn't do anything fancy, just a 9/8-degree rational interpolant which
1283
+ is accurate up to a couple of ulps in the (approximate) range [-8, 8],
1284
+ outside of which tanh(x) = +/-1 in single precision. The input is clamped
1285
+ to the range [-c, c]. The value c is chosen as the smallest value where
1286
+ the approximation evaluates to exactly 1.
1287
+
1288
+ This implementation works on both scalars and packets.
1289
+ */
1290
+ template <typename T>
1291
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_float(const T& a_x) {
1292
+ // Clamp the inputs to the range [-c, c] and set everything
1293
+ // outside that range to 1.0. The value c is chosen as the smallest
1294
+ // floating point argument such that the approximation is exactly 1.
1295
+ // This saves clamping the value at the end.
1296
+ #ifdef EIGEN_VECTORIZE_FMA
1297
+ const T plus_clamp = pset1<T>(8.01773357391357422f);
1298
+ const T minus_clamp = pset1<T>(-8.01773357391357422f);
1299
+ #else
1300
+ const T plus_clamp = pset1<T>(7.90738964080810547f);
1301
+ const T minus_clamp = pset1<T>(-7.90738964080810547f);
1302
+ #endif
1303
+ const T x = pmax(pmin(a_x, plus_clamp), minus_clamp);
1304
+
1305
+ // The following rational approximation was generated by rminimax
1306
+ // (https://gitlab.inria.fr/sfilip/rminimax) using the following
1307
+ // command:
1308
+ // $ ratapprox --function="tanh(x)" --dom='[-8.67,8.67]' --num="odd"
1309
+ // --den="even" --type="[9,8]" --numF="[SG]" --denF="[SG]" --log
1310
+ // --output=tanhf.sollya --dispCoeff="dec"
1311
+
1312
+ // The monomial coefficients of the numerator polynomial (odd).
1313
+ constexpr float alpha[] = {1.394553628e-8f, 2.102733560e-5f, 3.520756727e-3f, 1.340216100e-1f};
1314
+
1315
+ // The monomial coefficients of the denominator polynomial (even).
1316
+ constexpr float beta[] = {8.015776984e-7f, 3.326951409e-4f, 2.597254514e-2f, 4.673548340e-1f, 1.0f};
1317
+
1318
+ // Since the polynomials are odd/even, we need x^2.
1319
+ const T x2 = pmul(x, x);
1320
+ const T x3 = pmul(x2, x);
1321
+
1322
+ T p = ppolevl<T, 3>::run(x2, alpha);
1323
+ T q = ppolevl<T, 4>::run(x2, beta);
1324
+ // Take advantage of the fact that the constant term in p is 1 to compute
1325
+ // x*(x^2*p + 1) = x^3 * p + x.
1326
+ p = pmadd(x3, p, x);
1327
+
1328
+ // Divide the numerator by the denominator.
1329
+ return pdiv(p, q);
1330
+ }
1331
+
1332
+ /** \internal \returns the hyperbolic tan of \a a (coeff-wise)
1333
+ This uses a 19/18-degree rational interpolant which
1334
+ is accurate up to a couple of ulps in the (approximate) range [-18.7, 18.7],
1335
+ outside of which tanh(x) = +/-1 in single precision. The input is clamped
1336
+ to the range [-c, c]. The value c is chosen as the smallest value where
1337
+ the approximation evaluates to exactly 1.
1338
+
1339
+ This implementation works on both scalars and packets.
1340
+ */
1341
+ template <typename T>
1342
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_double(const T& a_x) {
1343
+ // Clamp the inputs to the range [-c, c] and set everything
1344
+ // outside that range to 1.0. The value c is chosen as the smallest
1345
+ // floating point argument such that the approximation is exactly 1.
1346
+ // This saves clamping the value at the end.
1347
+ #ifdef EIGEN_VECTORIZE_FMA
1348
+ const T plus_clamp = pset1<T>(17.6610191624600077);
1349
+ const T minus_clamp = pset1<T>(-17.6610191624600077);
1350
+ #else
1351
+ const T plus_clamp = pset1<T>(17.714196154005176);
1352
+ const T minus_clamp = pset1<T>(-17.714196154005176);
1353
+ #endif
1354
+ const T x = pmax(pmin(a_x, plus_clamp), minus_clamp);
1355
+
1356
+ // The following rational approximation was generated by rminimax
1357
+ // (https://gitlab.inria.fr/sfilip/rminimax) using the following
1358
+ // command:
1359
+ // $ ./ratapprox --function="tanh(x)" --dom='[-18.72,18.72]'
1360
+ // --num="odd" --den="even" --type="[19,18]" --numF="[D]"
1361
+ // --denF="[D]" --log --output=tanh.sollya --dispCoeff="dec"
1362
+
1363
+ // The monomial coefficients of the numerator polynomial (odd).
1364
+ constexpr double alpha[] = {2.6158007860482230e-23, 7.6534862268749319e-19, 3.1309488231386680e-15,
1365
+ 4.2303918148209176e-12, 2.4618379131293676e-09, 6.8644367682497074e-07,
1366
+ 9.3839087674268880e-05, 5.9809711724441161e-03, 1.5184719640284322e-01};
1367
+
1368
+ // The monomial coefficients of the denominator polynomial (even).
1369
+ constexpr double beta[] = {6.463747022670968018e-21, 5.782506856739003571e-17,
1370
+ 1.293019623712687916e-13, 1.123643448069621992e-10,
1371
+ 4.492975677839633985e-08, 8.785185266237658698e-06,
1372
+ 8.295161192716231542e-04, 3.437448108450402717e-02,
1373
+ 4.851805297361760360e-01, 1.0};
1374
+
1375
+ // Since the polynomials are odd/even, we need x^2.
1376
+ const T x2 = pmul(x, x);
1377
+ const T x3 = pmul(x2, x);
1378
+
1379
+ // Interleave the evaluation of the numerator polynomial p and
1380
+ // denominator polynomial q.
1381
+ T p = ppolevl<T, 8>::run(x2, alpha);
1382
+ T q = ppolevl<T, 9>::run(x2, beta);
1383
+ // Take advantage of the fact that the constant term in p is 1 to compute
1384
+ // x*(x^2*p + 1) = x^3 * p + x.
1385
+ p = pmadd(x3, p, x);
1386
+
1387
+ // Divide the numerator by the denominator.
1388
+ return pdiv(p, q);
1389
+ }
1390
+
1391
+ template <typename Packet>
1392
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_float(const Packet& x) {
1393
+ typedef typename unpacket_traits<Packet>::type Scalar;
1394
+ static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
1395
+
1396
+ // For |x| in [0:0.5] we use a polynomial approximation of the form
1397
+ // P(x) = x + x^3*(alpha[4] + x^2 * (alpha[3] + x^2 * (... x^2 * alpha[0]) ... )).
1398
+ constexpr float alpha[] = {0.1819281280040740966796875f, 8.2311116158962249755859375e-2f,
1399
+ 0.14672131836414337158203125f, 0.1997792422771453857421875f, 0.3333373963832855224609375f};
1400
+ const Packet x2 = pmul(x, x);
1401
+ const Packet x3 = pmul(x, x2);
1402
+ Packet p = ppolevl<Packet, 4>::run(x2, alpha);
1403
+ p = pmadd(x3, p, x);
1404
+
1405
+ // For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x));
1406
+ const Packet half = pset1<Packet>(0.5f);
1407
+ const Packet one = pset1<Packet>(1.0f);
1408
+ Packet r = pdiv(padd(one, x), psub(one, x));
1409
+ r = pmul(half, plog(r));
1410
+
1411
+ const Packet x_gt_half = pcmp_le(half, pabs(x));
1412
+ const Packet x_eq_one = pcmp_eq(one, pabs(x));
1413
+ const Packet x_gt_one = pcmp_lt(one, pabs(x));
1414
+ const Packet sign_mask = pset1<Packet>(-0.0f);
1415
+ const Packet x_sign = pand(sign_mask, x);
1416
+ const Packet inf = pset1<Packet>(std::numeric_limits<float>::infinity());
1417
+ return por(x_gt_one, pselect(x_eq_one, por(x_sign, inf), pselect(x_gt_half, r, p)));
1418
+ }
1419
+
1420
+ template <typename Packet>
1421
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_double(const Packet& x) {
1422
+ typedef typename unpacket_traits<Packet>::type Scalar;
1423
+ static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
1424
+ // For x in [-0.5:0.5] we use a rational approximation of the form
1425
+ // R(x) = x + x^3*P(x^2)/Q(x^2), where P is or order 4 and Q is of order 5.
1426
+ constexpr double alpha[] = {3.3071338469301391e-03, -4.7129526768798737e-02, 1.8185306179826699e-01,
1427
+ -2.5949536095445679e-01, 1.2306328729812676e-01};
1428
+
1429
+ constexpr double beta[] = {-3.8679974580640881e-03, 7.6391885763341910e-02, -4.2828141436397615e-01,
1430
+ 9.8733495886883648e-01, -1.0000000000000000e+00, 3.6918986189438030e-01};
1431
+
1432
+ const Packet x2 = pmul(x, x);
1433
+ const Packet x3 = pmul(x, x2);
1434
+ Packet p = ppolevl<Packet, 4>::run(x2, alpha);
1435
+ Packet q = ppolevl<Packet, 5>::run(x2, beta);
1436
+ Packet y_small = pmadd(x3, pdiv(p, q), x);
1437
+
1438
+ // For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x));
1439
+ const Packet half = pset1<Packet>(0.5);
1440
+ const Packet one = pset1<Packet>(1.0);
1441
+ Packet y_large = pdiv(padd(one, x), psub(one, x));
1442
+ y_large = pmul(half, plog(y_large));
1443
+
1444
+ const Packet x_gt_half = pcmp_le(half, pabs(x));
1445
+ const Packet x_eq_one = pcmp_eq(one, pabs(x));
1446
+ const Packet x_gt_one = pcmp_lt(one, pabs(x));
1447
+ const Packet sign_mask = pset1<Packet>(-0.0);
1448
+ const Packet x_sign = pand(sign_mask, x);
1449
+ const Packet inf = pset1<Packet>(std::numeric_limits<double>::infinity());
1450
+ return por(x_gt_one, pselect(x_eq_one, por(x_sign, inf), pselect(x_gt_half, y_large, y_small)));
1451
+ }
1452
+
1453
+ template <typename Packet>
1454
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pdiv_complex(const Packet& x, const Packet& y) {
1455
+ typedef typename unpacket_traits<Packet>::as_real RealPacket;
1456
+ // In the following we annotate the code for the case where the inputs
1457
+ // are a pair length-2 SIMD vectors representing a single pair of complex
1458
+ // numbers x = a + i*b, y = c + i*d.
1459
+ const RealPacket y_abs = pabs(y.v); // |c|, |d|
1460
+ const RealPacket y_abs_flip = pcplxflip(Packet(y_abs)).v; // |d|, |c|
1461
+ const RealPacket y_max = pmax(y_abs, y_abs_flip); // max(|c|, |d|), max(|c|, |d|)
1462
+ const RealPacket y_scaled = pdiv(y.v, y_max); // c / max(|c|, |d|), d / max(|c|, |d|)
1463
+ // Compute scaled denominator.
1464
+ const RealPacket y_scaled_sq = pmul(y_scaled, y_scaled); // c'**2, d'**2
1465
+ const RealPacket denom = padd(y_scaled_sq, pcplxflip(Packet(y_scaled_sq)).v);
1466
+ Packet result_scaled = pmul(x, pconj(Packet(y_scaled))); // a * c' + b * d', -a * d + b * c
1467
+ // Divide elementwise by denom.
1468
+ result_scaled = Packet(pdiv(result_scaled.v, denom));
1469
+ // Rescale result
1470
+ return Packet(pdiv(result_scaled.v, y_max));
1471
+ }
1472
+
1473
+ template <typename Packet>
1474
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_complex(const Packet& x) {
1475
+ typedef typename unpacket_traits<Packet>::type Scalar;
1476
+ typedef typename Scalar::value_type RealScalar;
1477
+ typedef typename unpacket_traits<Packet>::as_real RealPacket;
1478
+
1479
+ RealPacket real_mask_rp = peven_mask(x.v);
1480
+ Packet real_mask(real_mask_rp);
1481
+
1482
+ // Real part
1483
+ RealPacket x_flip = pcplxflip(x).v; // b, a
1484
+ Packet x_norm = phypot_complex(x); // sqrt(a^2 + b^2), sqrt(a^2 + b^2)
1485
+ RealPacket xlogr = plog(x_norm.v); // log(sqrt(a^2 + b^2)), log(sqrt(a^2 + b^2))
1486
+
1487
+ // Imag part
1488
+ RealPacket ximg = patan2(x.v, x_flip); // atan2(a, b), atan2(b, a)
1489
+
1490
+ const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
1491
+ RealPacket x_abs = pabs(x.v);
1492
+ RealPacket is_x_pos_inf = pcmp_eq(x_abs, cst_pos_inf);
1493
+ RealPacket is_y_pos_inf = pcplxflip(Packet(is_x_pos_inf)).v;
1494
+ RealPacket is_any_inf = por(is_x_pos_inf, is_y_pos_inf);
1495
+ RealPacket xreal = pselect(is_any_inf, cst_pos_inf, xlogr);
1496
+
1497
+ Packet xres = pselect(real_mask, Packet(xreal), Packet(ximg)); // log(sqrt(a^2 + b^2)), atan2(b, a)
1498
+ return xres;
1499
+ }
1500
+
1501
+ template <typename Packet>
1502
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_complex(const Packet& a) {
1503
+ typedef typename unpacket_traits<Packet>::as_real RealPacket;
1504
+ typedef typename unpacket_traits<Packet>::type Scalar;
1505
+ typedef typename Scalar::value_type RealScalar;
1506
+ const RealPacket even_mask = peven_mask(a.v);
1507
+ const RealPacket odd_mask = pcplxflip(Packet(even_mask)).v;
1508
+
1509
+ // Let a = x + iy.
1510
+ // exp(a) = exp(x) * cis(y), plus some special edge-case handling.
1511
+
1512
+ // exp(x):
1513
+ RealPacket x = pand(a.v, even_mask);
1514
+ x = por(x, pcplxflip(Packet(x)).v);
1515
+ RealPacket expx = pexp(x); // exp(x);
1516
+
1517
+ // cis(y):
1518
+ RealPacket y = pand(odd_mask, a.v);
1519
+ y = por(y, pcplxflip(Packet(y)).v);
1520
+ RealPacket cisy = psincos_float<false, RealPacket, true>(y);
1521
+ cisy = pcplxflip(Packet(cisy)).v; // cos(y) + i * sin(y)
1522
+
1523
+ const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
1524
+ const RealPacket cst_neg_inf = pset1<RealPacket>(-NumTraits<RealScalar>::infinity());
1525
+
1526
+ // If x is -inf, we know that cossin(y) is bounded,
1527
+ // so the result is (0, +/-0), where the sign of the imaginary part comes
1528
+ // from the sign of cossin(y).
1529
+ RealPacket cisy_sign = por(pandnot(cisy, pabs(cisy)), pset1<RealPacket>(RealScalar(1)));
1530
+ cisy = pselect(pcmp_eq(x, cst_neg_inf), cisy_sign, cisy);
1531
+
1532
+ // If x is inf, and cos(y) has unknown sign (y is inf or NaN), the result
1533
+ // is (+/-inf, NaN), where the signs are undetermined (take the sign of y).
1534
+ RealPacket y_sign = por(pandnot(y, pabs(y)), pset1<RealPacket>(RealScalar(1)));
1535
+ cisy = pselect(pand(pcmp_eq(x, cst_pos_inf), pisnan(cisy)), pand(y_sign, even_mask), cisy);
1536
+ Packet result = Packet(pmul(expx, cisy));
1537
+
1538
+ // If y is +/- 0, the input is real, so take the real result for consistency.
1539
+ result = pselect(Packet(pcmp_eq(y, pzero(y))), Packet(por(pand(expx, even_mask), pand(y, odd_mask))), result);
1540
+
1541
+ return result;
1542
+ }
1543
+
1544
+ template <typename Packet>
1545
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psqrt_complex(const Packet& a) {
1546
+ typedef typename unpacket_traits<Packet>::type Scalar;
1547
+ typedef typename Scalar::value_type RealScalar;
1548
+ typedef typename unpacket_traits<Packet>::as_real RealPacket;
1549
+
1550
+ // Computes the principal sqrt of the complex numbers in the input.
1551
+ //
1552
+ // For example, for packets containing 2 complex numbers stored in interleaved format
1553
+ // a = [a0, a1] = [x0, y0, x1, y1],
1554
+ // where x0 = real(a0), y0 = imag(a0) etc., this function returns
1555
+ // b = [b0, b1] = [u0, v0, u1, v1],
1556
+ // such that b0^2 = a0, b1^2 = a1.
1557
+ //
1558
+ // To derive the formula for the complex square roots, let's consider the equation for
1559
+ // a single complex square root of the number x + i*y. We want to find real numbers
1560
+ // u and v such that
1561
+ // (u + i*v)^2 = x + i*y <=>
1562
+ // u^2 - v^2 + i*2*u*v = x + i*v.
1563
+ // By equating the real and imaginary parts we get:
1564
+ // u^2 - v^2 = x
1565
+ // 2*u*v = y.
1566
+ //
1567
+ // For x >= 0, this has the numerically stable solution
1568
+ // u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
1569
+ // v = 0.5 * (y / u)
1570
+ // and for x < 0,
1571
+ // v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
1572
+ // u = 0.5 * (y / v)
1573
+ //
1574
+ // To avoid unnecessary over- and underflow, we compute sqrt(x^2 + y^2) as
1575
+ // l = max(|x|, |y|) * sqrt(1 + (min(|x|, |y|) / max(|x|, |y|))^2) ,
1576
+
1577
+ // In the following, without lack of generality, we have annotated the code, assuming
1578
+ // that the input is a packet of 2 complex numbers.
1579
+ //
1580
+ // Step 1. Compute l = [l0, l0, l1, l1], where
1581
+ // l0 = sqrt(x0^2 + y0^2), l1 = sqrt(x1^2 + y1^2)
1582
+ // To avoid over- and underflow, we use the stable formula for each hypotenuse
1583
+ // l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)),
1584
+ // where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1.
1585
+
1586
+ RealPacket a_abs = pabs(a.v); // [|x0|, |y0|, |x1|, |y1|]
1587
+ RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v; // [|y0|, |x0|, |y1|, |x1|]
1588
+ RealPacket a_max = pmax(a_abs, a_abs_flip);
1589
+ RealPacket a_min = pmin(a_abs, a_abs_flip);
1590
+ RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min));
1591
+ RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max));
1592
+ RealPacket r = pdiv(a_min, a_max);
1593
+ const RealPacket cst_one = pset1<RealPacket>(RealScalar(1));
1594
+ RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r)))); // [l0, l0, l1, l1]
1595
+ // Set l to a_max if a_min is zero.
1596
+ l = pselect(a_min_zero_mask, a_max, l);
1597
+
1598
+ // Step 2. Compute [rho0, *, rho1, *], where
1599
+ // rho0 = sqrt(0.5 * (l0 + |x0|)), rho1 = sqrt(0.5 * (l1 + |x1|))
1600
+ // We don't care about the imaginary parts computed here. They will be overwritten later.
1601
+ const RealPacket cst_half = pset1<RealPacket>(RealScalar(0.5));
1602
+ Packet rho;
1603
+ rho.v = psqrt(pmul(cst_half, padd(a_abs, l)));
1604
+
1605
+ // Step 3. Compute [rho0, eta0, rho1, eta1], where
1606
+ // eta0 = (y0 / l0) / 2, and eta1 = (y1 / l1) / 2.
1607
+ // set eta = 0 of input is 0 + i0.
1608
+ RealPacket eta = pandnot(pmul(cst_half, pdiv(a.v, pcplxflip(rho).v)), a_max_zero_mask);
1609
+ RealPacket real_mask = peven_mask(a.v);
1610
+ Packet positive_real_result;
1611
+ // Compute result for inputs with positive real part.
1612
+ positive_real_result.v = pselect(real_mask, rho.v, eta);
1613
+
1614
+ // Step 4. Compute solution for inputs with negative real part:
1615
+ // [|eta0|, sign(y0)*rho0, |eta1|, sign(y1)*rho1]
1616
+ const RealPacket cst_imag_sign_mask = pset1<Packet>(Scalar(RealScalar(0.0), RealScalar(-0.0))).v;
1617
+ RealPacket imag_signs = pand(a.v, cst_imag_sign_mask);
1618
+ Packet negative_real_result;
1619
+ // Notice that rho is positive, so taking it's absolute value is a noop.
1620
+ negative_real_result.v = por(pabs(pcplxflip(positive_real_result).v), imag_signs);
1621
+
1622
+ // Step 5. Select solution branch based on the sign of the real parts.
1623
+ Packet negative_real_mask;
1624
+ negative_real_mask.v = pcmp_lt(pand(real_mask, a.v), pzero(a.v));
1625
+ negative_real_mask.v = por(negative_real_mask.v, pcplxflip(negative_real_mask).v);
1626
+ Packet result = pselect(negative_real_mask, negative_real_result, positive_real_result);
1627
+
1628
+ // Step 6. Handle special cases for infinities:
1629
+ // * If z is (x,+∞), the result is (+∞,+∞) even if x is NaN
1630
+ // * If z is (x,-∞), the result is (+∞,-∞) even if x is NaN
1631
+ // * If z is (-∞,y), the result is (0*|y|,+∞) for finite or NaN y
1632
+ // * If z is (+∞,y), the result is (+∞,0*|y|) for finite or NaN y
1633
+ const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
1634
+ Packet is_inf;
1635
+ is_inf.v = pcmp_eq(a_abs, cst_pos_inf);
1636
+ Packet is_real_inf;
1637
+ is_real_inf.v = pand(is_inf.v, real_mask);
1638
+ is_real_inf = por(is_real_inf, pcplxflip(is_real_inf));
1639
+ // prepare packet of (+∞,0*|y|) or (0*|y|,+∞), depending on the sign of the infinite real part.
1640
+ Packet real_inf_result;
1641
+ real_inf_result.v = pmul(a_abs, pset1<Packet>(Scalar(RealScalar(1.0), RealScalar(0.0))).v);
1642
+ real_inf_result.v = pselect(negative_real_mask.v, pcplxflip(real_inf_result).v, real_inf_result.v);
1643
+ // prepare packet of (+∞,+∞) or (+∞,-∞), depending on the sign of the infinite imaginary part.
1644
+ Packet is_imag_inf;
1645
+ is_imag_inf.v = pandnot(is_inf.v, real_mask);
1646
+ is_imag_inf = por(is_imag_inf, pcplxflip(is_imag_inf));
1647
+ Packet imag_inf_result;
1648
+ imag_inf_result.v = por(pand(cst_pos_inf, real_mask), pandnot(a.v, real_mask));
1649
+ // unless otherwise specified, if either the real or imaginary component is nan, the entire result is nan
1650
+ Packet result_is_nan = pisnan(result);
1651
+ result = por(result_is_nan, result);
1652
+
1653
+ return pselect(is_imag_inf, imag_inf_result, pselect(is_real_inf, real_inf_result, result));
1654
+ }
1655
+
1656
+ // \internal \returns the norm of a complex number z = x + i*y, defined as sqrt(x^2 + y^2).
1657
+ // Implemented using the hypot(a,b) algorithm from https://doi.org/10.48550/arXiv.1904.09481
1658
+ template <typename Packet>
1659
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet phypot_complex(const Packet& a) {
1660
+ typedef typename unpacket_traits<Packet>::type Scalar;
1661
+ typedef typename Scalar::value_type RealScalar;
1662
+ typedef typename unpacket_traits<Packet>::as_real RealPacket;
1663
+
1664
+ const RealPacket cst_zero_rp = pset1<RealPacket>(static_cast<RealScalar>(0.0));
1665
+ const RealPacket cst_minus_one_rp = pset1<RealPacket>(static_cast<RealScalar>(-1.0));
1666
+ const RealPacket cst_two_rp = pset1<RealPacket>(static_cast<RealScalar>(2.0));
1667
+ const RealPacket evenmask = peven_mask(a.v);
1668
+
1669
+ RealPacket a_abs = pabs(a.v);
1670
+ RealPacket a_flip = pcplxflip(Packet(a_abs)).v; // |b|, |a|
1671
+ RealPacket a_all = pselect(evenmask, a_abs, a_flip); // |a|, |a|
1672
+ RealPacket b_all = pselect(evenmask, a_flip, a_abs); // |b|, |b|
1673
+
1674
+ RealPacket a2 = pmul(a.v, a.v); // |a^2, b^2|
1675
+ RealPacket a2_flip = pcplxflip(Packet(a2)).v; // |b^2, a^2|
1676
+ RealPacket h = psqrt(padd(a2, a2_flip)); // |sqrt(a^2 + b^2), sqrt(a^2 + b^2)|
1677
+ RealPacket h_sq = pmul(h, h); // |a^2 + b^2, a^2 + b^2|
1678
+ RealPacket a_sq = pselect(evenmask, a2, a2_flip); // |a^2, a^2|
1679
+ RealPacket m_h_sq = pmul(h_sq, cst_minus_one_rp);
1680
+ RealPacket m_a_sq = pmul(a_sq, cst_minus_one_rp);
1681
+ RealPacket x = psub(psub(pmadd(h, h, m_h_sq), pmadd(b_all, b_all, psub(a_sq, h_sq))), pmadd(a_all, a_all, m_a_sq));
1682
+ h = psub(h, pdiv(x, pmul(cst_two_rp, h))); // |h - x/(2*h), h - x/(2*h)|
1683
+
1684
+ // handle zero-case
1685
+ RealPacket iszero = pcmp_eq(por(a_abs, a_flip), cst_zero_rp);
1686
+
1687
+ h = pandnot(h, iszero); // |sqrt(a^2+b^2), sqrt(a^2+b^2)|
1688
+ return Packet(h); // |sqrt(a^2+b^2), sqrt(a^2+b^2)|
1689
+ }
1690
+
1691
+ template <typename Packet>
1692
+ struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
1693
+ !NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
1694
+ !NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
1695
+ static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
1696
+ using Scalar = typename unpacket_traits<Packet>::type;
1697
+ const Packet cst_one = pset1<Packet>(Scalar(1));
1698
+ const Packet cst_zero = pzero(a);
1699
+
1700
+ const Packet abs_a = pabs(a);
1701
+ const Packet sign_mask = pandnot(a, abs_a);
1702
+ const Packet nonzero_mask = pcmp_lt(cst_zero, abs_a);
1703
+
1704
+ return pselect(nonzero_mask, por(sign_mask, cst_one), abs_a);
1705
+ }
1706
+ };
1707
+
1708
+ template <typename Packet>
1709
+ struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
1710
+ !NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
1711
+ NumTraits<typename unpacket_traits<Packet>::type>::IsSigned &&
1712
+ NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
1713
+ static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
1714
+ using Scalar = typename unpacket_traits<Packet>::type;
1715
+ const Packet cst_one = pset1<Packet>(Scalar(1));
1716
+ const Packet cst_minus_one = pset1<Packet>(Scalar(-1));
1717
+ const Packet cst_zero = pzero(a);
1718
+
1719
+ const Packet positive_mask = pcmp_lt(cst_zero, a);
1720
+ const Packet positive = pand(positive_mask, cst_one);
1721
+ const Packet negative_mask = pcmp_lt(a, cst_zero);
1722
+ const Packet negative = pand(negative_mask, cst_minus_one);
1723
+
1724
+ return por(positive, negative);
1725
+ }
1726
+ };
1727
+
1728
+ template <typename Packet>
1729
+ struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
1730
+ !NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
1731
+ !NumTraits<typename unpacket_traits<Packet>::type>::IsSigned &&
1732
+ NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
1733
+ static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
1734
+ using Scalar = typename unpacket_traits<Packet>::type;
1735
+ const Packet cst_one = pset1<Packet>(Scalar(1));
1736
+ const Packet cst_zero = pzero(a);
1737
+
1738
+ const Packet zero_mask = pcmp_eq(cst_zero, a);
1739
+ return pandnot(cst_one, zero_mask);
1740
+ }
1741
+ };
1742
+
1743
+ // \internal \returns the the sign of a complex number z, defined as z / abs(z).
1744
+ template <typename Packet>
1745
+ struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
1746
+ NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
1747
+ unpacket_traits<Packet>::vectorizable>> {
1748
+ static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
1749
+ typedef typename unpacket_traits<Packet>::type Scalar;
1750
+ typedef typename Scalar::value_type RealScalar;
1751
+ typedef typename unpacket_traits<Packet>::as_real RealPacket;
1752
+
1753
+ // Step 1. Compute (for each element z = x + i*y in a)
1754
+ // l = abs(z) = sqrt(x^2 + y^2).
1755
+ // To avoid over- and underflow, we use the stable formula for each hypotenuse
1756
+ // l = (zmin == 0 ? zmax : zmax * sqrt(1 + (zmin/zmax)**2)),
1757
+ // where zmax = max(|x|, |y|), zmin = min(|x|, |y|),
1758
+ RealPacket a_abs = pabs(a.v);
1759
+ RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v;
1760
+ RealPacket a_max = pmax(a_abs, a_abs_flip);
1761
+ RealPacket a_min = pmin(a_abs, a_abs_flip);
1762
+ RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min));
1763
+ RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max));
1764
+ RealPacket r = pdiv(a_min, a_max);
1765
+ const RealPacket cst_one = pset1<RealPacket>(RealScalar(1));
1766
+ RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r)))); // [l0, l0, l1, l1]
1767
+ // Set l to a_max if a_min is zero, since the roundtrip sqrt(a_max^2) may be
1768
+ // lossy.
1769
+ l = pselect(a_min_zero_mask, a_max, l);
1770
+ // Step 2 compute a / abs(a).
1771
+ RealPacket sign_as_real = pandnot(pdiv(a.v, l), a_max_zero_mask);
1772
+ Packet sign;
1773
+ sign.v = sign_as_real;
1774
+ return sign;
1775
+ }
1776
+ };
1777
+
1778
+ // TODO(rmlarsen): The following set of utilities for double word arithmetic
1779
+ // should perhaps be refactored as a separate file, since it would be generally
1780
+ // useful for special function implementation etc. Writing the algorithms in
1781
+ // terms if a double word type would also make the code more readable.
1782
+
1783
+ // This function splits x into the nearest integer n and fractional part r,
1784
+ // such that x = n + r holds exactly.
1785
+ template <typename Packet>
1786
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void absolute_split(const Packet& x, Packet& n, Packet& r) {
1787
+ n = pround(x);
1788
+ r = psub(x, n);
1789
+ }
1790
+
1791
+ // This function computes the sum {s, r}, such that x + y = s_hi + s_lo
1792
+ // holds exactly, and s_hi = fl(x+y), if |x| >= |y|.
1793
+ template <typename Packet>
1794
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) {
1795
+ s_hi = padd(x, y);
1796
+ const Packet t = psub(s_hi, x);
1797
+ s_lo = psub(y, t);
1798
+ }
1799
+
1800
+ #ifdef EIGEN_VECTORIZE_FMA
1801
+ // This function implements the extended precision product of
1802
+ // a pair of floating point numbers. Given {x, y}, it computes the pair
1803
+ // {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
1804
+ // p_hi = fl(x * y).
1805
+ template <typename Packet>
1806
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) {
1807
+ p_hi = pmul(x, y);
1808
+ p_lo = pmsub(x, y, p_hi);
1809
+ }
1810
+
1811
+ // A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that
1812
+ // x * y = xy + p_lo holds exactly.
1813
+ template <typename Packet>
1814
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) {
1815
+ return pmsub(x, y, xy);
1816
+ }
1817
+
1818
+ #else
1819
+
1820
+ // This function implements the Veltkamp splitting. Given a floating point
1821
+ // number x it returns the pair {x_hi, x_lo} such that x_hi + x_lo = x holds
1822
+ // exactly and that half of the significant of x fits in x_hi.
1823
+ // This is Algorithm 3 from Jean-Michel Muller, "Elementary Functions",
1824
+ // 3rd edition, Birkh\"auser, 2016.
1825
+ template <typename Packet>
1826
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
1827
+ typedef typename unpacket_traits<Packet>::type Scalar;
1828
+ constexpr int shift = (NumTraits<Scalar>::digits() + 1) / 2;
1829
+ const Scalar shift_scale = Scalar(uint64_t(1) << shift); // Scalar constructor not necessarily constexpr.
1830
+ const Packet gamma = pmul(pset1<Packet>(shift_scale + Scalar(1)), x);
1831
+ Packet rho = psub(x, gamma);
1832
+ x_hi = padd(rho, gamma);
1833
+ x_lo = psub(x, x_hi);
1834
+ }
1835
+
1836
+ // This function implements Dekker's algorithm for products x * y.
1837
+ // Given floating point numbers {x, y} computes the pair
1838
+ // {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
1839
+ // p_hi = fl(x * y).
1840
+ template <typename Packet>
1841
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) {
1842
+ Packet x_hi, x_lo, y_hi, y_lo;
1843
+ veltkamp_splitting(x, x_hi, x_lo);
1844
+ veltkamp_splitting(y, y_hi, y_lo);
1845
+
1846
+ p_hi = pmul(x, y);
1847
+ p_lo = pmadd(x_hi, y_hi, pnegate(p_hi));
1848
+ p_lo = pmadd(x_hi, y_lo, p_lo);
1849
+ p_lo = pmadd(x_lo, y_hi, p_lo);
1850
+ p_lo = pmadd(x_lo, y_lo, p_lo);
1851
+ }
1852
+
1853
+ // A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that
1854
+ // x * y = xy + p_lo holds exactly.
1855
+ template <typename Packet>
1856
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) {
1857
+ Packet x_hi, x_lo, y_hi, y_lo;
1858
+ veltkamp_splitting(x, x_hi, x_lo);
1859
+ veltkamp_splitting(y, y_hi, y_lo);
1860
+
1861
+ Packet p_lo = pmadd(x_hi, y_hi, pnegate(xy));
1862
+ p_lo = pmadd(x_hi, y_lo, p_lo);
1863
+ p_lo = pmadd(x_lo, y_hi, p_lo);
1864
+ p_lo = pmadd(x_lo, y_lo, p_lo);
1865
+ return p_lo;
1866
+ }
1867
+
1868
+ #endif // EIGEN_VECTORIZE_FMA
1869
+
1870
+ // This function implements Dekker's algorithm for the addition
1871
+ // of two double word numbers represented by {x_hi, x_lo} and {y_hi, y_lo}.
1872
+ // It returns the result as a pair {s_hi, s_lo} such that
1873
+ // x_hi + x_lo + y_hi + y_lo = s_hi + s_lo holds exactly.
1874
+ // This is Algorithm 5 from Jean-Michel Muller, "Elementary Functions",
1875
+ // 3rd edition, Birkh\"auser, 2016.
1876
+ template <typename Packet>
1877
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi,
1878
+ const Packet& y_lo, Packet& s_hi, Packet& s_lo) {
1879
+ const Packet x_greater_mask = pcmp_lt(pabs(y_hi), pabs(x_hi));
1880
+ Packet r_hi_1, r_lo_1;
1881
+ fast_twosum(x_hi, y_hi, r_hi_1, r_lo_1);
1882
+ Packet r_hi_2, r_lo_2;
1883
+ fast_twosum(y_hi, x_hi, r_hi_2, r_lo_2);
1884
+ const Packet r_hi = pselect(x_greater_mask, r_hi_1, r_hi_2);
1885
+
1886
+ const Packet s1 = padd(padd(y_lo, r_lo_1), x_lo);
1887
+ const Packet s2 = padd(padd(x_lo, r_lo_2), y_lo);
1888
+ const Packet s = pselect(x_greater_mask, s1, s2);
1889
+
1890
+ fast_twosum(r_hi, s, s_hi, s_lo);
1891
+ }
1892
+
1893
+ // This is a version of twosum for double word numbers,
1894
+ // which assumes that |x_hi| >= |y_hi|.
1895
+ template <typename Packet>
1896
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi,
1897
+ const Packet& y_lo, Packet& s_hi, Packet& s_lo) {
1898
+ Packet r_hi, r_lo;
1899
+ fast_twosum(x_hi, y_hi, r_hi, r_lo);
1900
+ const Packet s = padd(padd(y_lo, r_lo), x_lo);
1901
+ fast_twosum(r_hi, s, s_hi, s_lo);
1902
+ }
1903
+
1904
+ // This is a version of twosum for adding a floating point number x to
1905
+ // double word number {y_hi, y_lo} number, with the assumption
1906
+ // that |x| >= |y_hi|.
1907
+ template <typename Packet>
1908
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y_hi, const Packet& y_lo,
1909
+ Packet& s_hi, Packet& s_lo) {
1910
+ Packet r_hi, r_lo;
1911
+ fast_twosum(x, y_hi, r_hi, r_lo);
1912
+ const Packet s = padd(y_lo, r_lo);
1913
+ fast_twosum(r_hi, s, s_hi, s_lo);
1914
+ }
1915
+
1916
+ // This function implements the multiplication of a double word
1917
+ // number represented by {x_hi, x_lo} by a floating point number y.
1918
+ // It returns the result as a pair {p_hi, p_lo} such that
1919
+ // (x_hi + x_lo) * y = p_hi + p_lo hold with a relative error
1920
+ // of less than 2*2^{-2p}, where p is the number of significand bit
1921
+ // in the floating point type.
1922
+ // This is Algorithm 7 from Jean-Michel Muller, "Elementary Functions",
1923
+ // 3rd edition, Birkh\"auser, 2016.
1924
+ template <typename Packet>
1925
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y,
1926
+ Packet& p_hi, Packet& p_lo) {
1927
+ Packet c_hi, c_lo1;
1928
+ twoprod(x_hi, y, c_hi, c_lo1);
1929
+ const Packet c_lo2 = pmul(x_lo, y);
1930
+ Packet t_hi, t_lo1;
1931
+ fast_twosum(c_hi, c_lo2, t_hi, t_lo1);
1932
+ const Packet t_lo2 = padd(t_lo1, c_lo1);
1933
+ fast_twosum(t_hi, t_lo2, p_hi, p_lo);
1934
+ }
1935
+
1936
+ // This function implements the multiplication of two double word
1937
+ // numbers represented by {x_hi, x_lo} and {y_hi, y_lo}.
1938
+ // It returns the result as a pair {p_hi, p_lo} such that
1939
+ // (x_hi + x_lo) * (y_hi + y_lo) = p_hi + p_lo holds with a relative error
1940
+ // of less than 2*2^{-2p}, where p is the number of significand bit
1941
+ // in the floating point type.
1942
+ template <typename Packet>
1943
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi,
1944
+ const Packet& y_lo, Packet& p_hi, Packet& p_lo) {
1945
+ Packet p_hi_hi, p_hi_lo;
1946
+ twoprod(x_hi, x_lo, y_hi, p_hi_hi, p_hi_lo);
1947
+ Packet p_lo_hi, p_lo_lo;
1948
+ twoprod(x_hi, x_lo, y_lo, p_lo_hi, p_lo_lo);
1949
+ fast_twosum(p_hi_hi, p_hi_lo, p_lo_hi, p_lo_lo, p_hi, p_lo);
1950
+ }
1951
+
1952
+ // This function implements the division of double word {x_hi, x_lo}
1953
+ // by float y. This is Algorithm 15 from "Tight and rigorous error bounds
1954
+ // for basic building blocks of double-word arithmetic", Joldes, Muller, & Popescu,
1955
+ // 2017. https://hal.archives-ouvertes.fr/hal-01351529
1956
+ template <typename Packet>
1957
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void doubleword_div_fp(const Packet& x_hi, const Packet& x_lo, const Packet& y,
1958
+ Packet& z_hi, Packet& z_lo) {
1959
+ const Packet t_hi = pdiv(x_hi, y);
1960
+ Packet pi_hi, pi_lo;
1961
+ twoprod(t_hi, y, pi_hi, pi_lo);
1962
+ const Packet delta_hi = psub(x_hi, pi_hi);
1963
+ const Packet delta_t = psub(delta_hi, pi_lo);
1964
+ const Packet delta = padd(delta_t, x_lo);
1965
+ const Packet t_lo = pdiv(delta, y);
1966
+ fast_twosum(t_hi, t_lo, z_hi, z_lo);
1967
+ }
1968
+
1969
+ // This function computes log2(x) and returns the result as a double word.
1970
+ template <typename Scalar>
1971
+ struct accurate_log2 {
1972
+ template <typename Packet>
1973
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
1974
+ log2_x_hi = plog2(x);
1975
+ log2_x_lo = pzero(x);
1976
+ }
1977
+ };
1978
+
1979
+ // This specialization uses a more accurate algorithm to compute log2(x) for
1980
+ // floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~6.56508e-10.
1981
+ // This additional accuracy is needed to counter the error-magnification
1982
+ // inherent in multiplying by a potentially large exponent in pow(x,y).
1983
+ // The minimax polynomial used was calculated using the Rminimax tool,
1984
+ // see https://gitlab.inria.fr/sfilip/rminimax.
1985
+ // Command line:
1986
+ // $ ratapprox --function="log2(1+x)/x" --dom='[-0.2929,0.41422]'
1987
+ // --type=[10,0]
1988
+ // --numF="[D,D,SG]" --denF="[SG]" --log --dispCoeff="dec"
1989
+ //
1990
+ // The resulting implementation of pow(x,y) is accurate to 3 ulps.
1991
+ template <>
1992
+ struct accurate_log2<float> {
1993
+ template <typename Packet>
1994
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(const Packet& z, Packet& log2_x_hi, Packet& log2_x_lo) {
1995
+ // Split the two lowest order constant coefficient into double-word representation.
1996
+ constexpr double kC0 = 1.442695041742110273474963832995854318141937255859375e+00;
1997
+ constexpr float kC0_hi = static_cast<float>(kC0);
1998
+ constexpr float kC0_lo = static_cast<float>(kC0 - static_cast<double>(kC0_hi));
1999
+ const Packet c0_hi = pset1<Packet>(kC0_hi);
2000
+ const Packet c0_lo = pset1<Packet>(kC0_lo);
2001
+
2002
+ constexpr double kC1 = -7.2134751588268664068692714863573201000690460205078125e-01;
2003
+ constexpr float kC1_hi = static_cast<float>(kC1);
2004
+ constexpr float kC1_lo = static_cast<float>(kC1 - static_cast<double>(kC1_hi));
2005
+ const Packet c1_hi = pset1<Packet>(kC1_hi);
2006
+ const Packet c1_lo = pset1<Packet>(kC1_lo);
2007
+
2008
+ constexpr float c[] = {
2009
+ 9.7010828554630279541015625e-02, -1.6896486282348632812500000e-01, 1.7200836539268493652343750e-01,
2010
+ -1.7892272770404815673828125e-01, 2.0505344867706298828125000e-01, -2.4046677350997924804687500e-01,
2011
+ 2.8857553005218505859375000e-01, -3.6067414283752441406250000e-01, 4.8089790344238281250000000e-01};
2012
+
2013
+ // Evaluate the higher order terms in the polynomial using
2014
+ // standard arithmetic.
2015
+ const Packet one = pset1<Packet>(1.0f);
2016
+ const Packet x = psub(z, one);
2017
+ Packet p = ppolevl<Packet, 8>::run(x, c);
2018
+ // Evaluate the final two step in Horner's rule using double-word
2019
+ // arithmetic.
2020
+ Packet p_hi, p_lo;
2021
+ twoprod(x, p, p_hi, p_lo);
2022
+ fast_twosum(c1_hi, c1_lo, p_hi, p_lo, p_hi, p_lo);
2023
+ twoprod(p_hi, p_lo, x, p_hi, p_lo);
2024
+ fast_twosum(c0_hi, c0_lo, p_hi, p_lo, p_hi, p_lo);
2025
+ // Multiply by x to recover log2(z).
2026
+ twoprod(p_hi, p_lo, x, log2_x_hi, log2_x_lo);
2027
+ }
2028
+ };
2029
+
2030
+ // This specialization uses a more accurate algorithm to compute log2(x) for
2031
+ // floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~1.27e-18.
2032
+ // This additional accuracy is needed to counter the error-magnification
2033
+ // inherent in multiplying by a potentially large exponent in pow(x,y).
2034
+ // The minimax polynomial used was calculated using the Sollya tool.
2035
+ // See sollya.org.
2036
+
2037
+ template <>
2038
+ struct accurate_log2<double> {
2039
+ template <typename Packet>
2040
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
2041
+ // We use a transformation of variables:
2042
+ // r = c * (x-1) / (x+1),
2043
+ // such that
2044
+ // log2(x) = log2((1 + r/c) / (1 - r/c)) = f(r).
2045
+ // The function f(r) can be approximated well using an odd polynomial
2046
+ // of the form
2047
+ // P(r) = ((Q(r^2) * r^2 + C) * r^2 + 1) * r,
2048
+ // For the implementation of log2<double> here, Q is of degree 6 with
2049
+ // coefficient represented in working precision (double), while C is a
2050
+ // constant represented in extra precision as a double word to achieve
2051
+ // full accuracy.
2052
+ //
2053
+ // The polynomial coefficients were computed by the Sollya script:
2054
+ //
2055
+ // c = 2 / log(2);
2056
+ // trans = c * (x-1)/(x+1);
2057
+ // itrans = (1+x/c)/(1-x/c);
2058
+ // interval=[trans(sqrt(0.5)); trans(sqrt(2))];
2059
+ // print(interval);
2060
+ // f = log2(itrans(x));
2061
+ // p=fpminimax(f,[|1,3,5,7,9,11,13,15,17|],[|1,DD,double...|],interval,relative,floating);
2062
+ const Packet q12 = pset1<Packet>(2.87074255468000586e-9);
2063
+ const Packet q10 = pset1<Packet>(2.38957980901884082e-8);
2064
+ const Packet q8 = pset1<Packet>(2.31032094540014656e-7);
2065
+ const Packet q6 = pset1<Packet>(2.27279857398537278e-6);
2066
+ const Packet q4 = pset1<Packet>(2.31271023278625638e-5);
2067
+ const Packet q2 = pset1<Packet>(2.47556738444535513e-4);
2068
+ const Packet q0 = pset1<Packet>(2.88543873228900172e-3);
2069
+ const Packet C_hi = pset1<Packet>(0.0400377511598501157);
2070
+ const Packet C_lo = pset1<Packet>(-4.77726582251425391e-19);
2071
+ const Packet one = pset1<Packet>(1.0);
2072
+
2073
+ const Packet cst_2_log2e_hi = pset1<Packet>(2.88539008177792677);
2074
+ const Packet cst_2_log2e_lo = pset1<Packet>(4.07660016854549667e-17);
2075
+ // c * (x - 1)
2076
+ Packet t_hi, t_lo;
2077
+ // t = c * (x-1)
2078
+ twoprod(cst_2_log2e_hi, cst_2_log2e_lo, psub(x, one), t_hi, t_lo);
2079
+ // r = c * (x-1) / (x+1),
2080
+ Packet r_hi, r_lo;
2081
+ doubleword_div_fp(t_hi, t_lo, padd(x, one), r_hi, r_lo);
2082
+
2083
+ // r2 = r * r
2084
+ Packet r2_hi, r2_lo;
2085
+ twoprod(r_hi, r_lo, r_hi, r_lo, r2_hi, r2_lo);
2086
+ // r4 = r2 * r2
2087
+ Packet r4_hi, r4_lo;
2088
+ twoprod(r2_hi, r2_lo, r2_hi, r2_lo, r4_hi, r4_lo);
2089
+
2090
+ // Evaluate Q(r^2) in working precision. We evaluate it in two parts
2091
+ // (even and odd in r^2) to improve instruction level parallelism.
2092
+ Packet q_even = pmadd(q12, r4_hi, q8);
2093
+ Packet q_odd = pmadd(q10, r4_hi, q6);
2094
+ q_even = pmadd(q_even, r4_hi, q4);
2095
+ q_odd = pmadd(q_odd, r4_hi, q2);
2096
+ q_even = pmadd(q_even, r4_hi, q0);
2097
+ Packet q = pmadd(q_odd, r2_hi, q_even);
2098
+
2099
+ // Now evaluate the low order terms of P(x) in double word precision.
2100
+ // In the following, due to the increasing magnitude of the coefficients
2101
+ // and r being constrained to [-0.5, 0.5] we can use fast_twosum instead
2102
+ // of the slower twosum.
2103
+ // Q(r^2) * r^2
2104
+ Packet p_hi, p_lo;
2105
+ twoprod(r2_hi, r2_lo, q, p_hi, p_lo);
2106
+ // Q(r^2) * r^2 + C
2107
+ Packet p1_hi, p1_lo;
2108
+ fast_twosum(C_hi, C_lo, p_hi, p_lo, p1_hi, p1_lo);
2109
+ // (Q(r^2) * r^2 + C) * r^2
2110
+ Packet p2_hi, p2_lo;
2111
+ twoprod(r2_hi, r2_lo, p1_hi, p1_lo, p2_hi, p2_lo);
2112
+ // ((Q(r^2) * r^2 + C) * r^2 + 1)
2113
+ Packet p3_hi, p3_lo;
2114
+ fast_twosum(one, p2_hi, p2_lo, p3_hi, p3_lo);
2115
+
2116
+ // log(z) ~= ((Q(r^2) * r^2 + C) * r^2 + 1) * r
2117
+ twoprod(p3_hi, p3_lo, r_hi, r_lo, log2_x_hi, log2_x_lo);
2118
+ }
2119
+ };
2120
+
2121
+ // This function implements the non-trivial case of pow(x,y) where x is
2122
+ // positive and y is (possibly) non-integer.
2123
+ // Formally, pow(x,y) = exp2(y * log2(x)), where exp2(x) is shorthand for 2^x.
2124
+ // TODO(rmlarsen): We should probably add this as a packet up 'ppow', to make it
2125
+ // easier to specialize or turn off for specific types and/or backends.x
2126
+ template <typename Packet>
2127
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) {
2128
+ typedef typename unpacket_traits<Packet>::type Scalar;
2129
+ // Split x into exponent e_x and mantissa m_x.
2130
+ Packet e_x;
2131
+ Packet m_x = pfrexp(x, e_x);
2132
+
2133
+ // Adjust m_x to lie in [1/sqrt(2):sqrt(2)] to minimize absolute error in log2(m_x).
2134
+ constexpr Scalar sqrt_half = Scalar(0.70710678118654752440);
2135
+ const Packet m_x_scale_mask = pcmp_lt(m_x, pset1<Packet>(sqrt_half));
2136
+ m_x = pselect(m_x_scale_mask, pmul(pset1<Packet>(Scalar(2)), m_x), m_x);
2137
+ e_x = pselect(m_x_scale_mask, psub(e_x, pset1<Packet>(Scalar(1))), e_x);
2138
+
2139
+ // Compute log2(m_x) with 6 extra bits of accuracy.
2140
+ Packet rx_hi, rx_lo;
2141
+ accurate_log2<Scalar>()(m_x, rx_hi, rx_lo);
2142
+
2143
+ // Compute the two terms {y * e_x, y * r_x} in f = y * log2(x) with doubled
2144
+ // precision using double word arithmetic.
2145
+ Packet f1_hi, f1_lo, f2_hi, f2_lo;
2146
+ twoprod(e_x, y, f1_hi, f1_lo);
2147
+ twoprod(rx_hi, rx_lo, y, f2_hi, f2_lo);
2148
+ // Sum the two terms in f using double word arithmetic. We know
2149
+ // that |e_x| > |log2(m_x)|, except for the case where e_x==0.
2150
+ // This means that we can use fast_twosum(f1,f2).
2151
+ // In the case e_x == 0, e_x * y = f1 = 0, so we don't lose any
2152
+ // accuracy by violating the assumption of fast_twosum, because
2153
+ // it's a no-op.
2154
+ Packet f_hi, f_lo;
2155
+ fast_twosum(f1_hi, f1_lo, f2_hi, f2_lo, f_hi, f_lo);
2156
+
2157
+ // Split f into integer and fractional parts.
2158
+ Packet n_z, r_z;
2159
+ absolute_split(f_hi, n_z, r_z);
2160
+ r_z = padd(r_z, f_lo);
2161
+ Packet n_r;
2162
+ absolute_split(r_z, n_r, r_z);
2163
+ n_z = padd(n_z, n_r);
2164
+
2165
+ // We now have an accurate split of f = n_z + r_z and can compute
2166
+ // x^y = 2**{n_z + r_z) = exp2(r_z) * 2**{n_z}.
2167
+ // Multiplication by the second factor can be done exactly using pldexp(), since
2168
+ // it is an integer power of 2.
2169
+ const Packet e_r = generic_exp2(r_z);
2170
+
2171
+ // Since we know that e_r is in [1/sqrt(2); sqrt(2)], we can use the fast version
2172
+ // of pldexp to multiply by 2**{n_z} when |n_z| is sufficiently small.
2173
+ constexpr Scalar kPldExpThresh = std::numeric_limits<Scalar>::max_exponent - 2;
2174
+ const Packet pldexp_fast_unsafe = pcmp_lt(pset1<Packet>(kPldExpThresh), pabs(n_z));
2175
+ if (predux_any(pldexp_fast_unsafe)) {
2176
+ return pldexp(e_r, n_z);
2177
+ }
2178
+ return pldexp_fast(e_r, n_z);
2179
+ }
2180
+
2181
+ // Generic implementation of pow(x,y).
2182
+ template <typename Packet>
2183
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS std::enable_if_t<!is_scalar<Packet>::value, Packet> generic_pow(
2184
+ const Packet& x, const Packet& y) {
2185
+ typedef typename unpacket_traits<Packet>::type Scalar;
2186
+
2187
+ const Packet cst_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
2188
+ const Packet cst_zero = pset1<Packet>(Scalar(0));
2189
+ const Packet cst_one = pset1<Packet>(Scalar(1));
2190
+ const Packet cst_nan = pset1<Packet>(NumTraits<Scalar>::quiet_NaN());
2191
+
2192
+ const Packet x_abs = pabs(x);
2193
+ Packet pow = generic_pow_impl(x_abs, y);
2194
+
2195
+ // In the following we enforce the special case handling prescribed in
2196
+ // https://en.cppreference.com/w/cpp/numeric/math/pow.
2197
+
2198
+ // Predicates for sign and magnitude of x.
2199
+ const Packet x_is_negative = pcmp_lt(x, cst_zero);
2200
+ const Packet x_is_zero = pcmp_eq(x, cst_zero);
2201
+ const Packet x_is_one = pcmp_eq(x, cst_one);
2202
+ const Packet x_has_signbit = psignbit(x);
2203
+ const Packet x_abs_gt_one = pcmp_lt(cst_one, x_abs);
2204
+ const Packet x_abs_is_inf = pcmp_eq(x_abs, cst_inf);
2205
+
2206
+ // Predicates for sign and magnitude of y.
2207
+ const Packet y_abs = pabs(y);
2208
+ const Packet y_abs_is_inf = pcmp_eq(y_abs, cst_inf);
2209
+ const Packet y_is_negative = pcmp_lt(y, cst_zero);
2210
+ const Packet y_is_zero = pcmp_eq(y, cst_zero);
2211
+ const Packet y_is_one = pcmp_eq(y, cst_one);
2212
+ // Predicates for whether y is integer and odd/even.
2213
+ const Packet y_is_int = pandnot(pcmp_eq(pfloor(y), y), y_abs_is_inf);
2214
+ const Packet y_div_2 = pmul(y, pset1<Packet>(Scalar(0.5)));
2215
+ const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2);
2216
+ const Packet y_is_odd_int = pandnot(y_is_int, y_is_even);
2217
+ // Smallest exponent for which (1 + epsilon) overflows to infinity.
2218
+ constexpr Scalar huge_exponent =
2219
+ (NumTraits<Scalar>::max_exponent() * Scalar(EIGEN_LN2)) / NumTraits<Scalar>::epsilon();
2220
+ const Packet y_abs_is_huge = pcmp_le(pset1<Packet>(huge_exponent), y_abs);
2221
+
2222
+ // * pow(base, exp) returns NaN if base is finite and negative
2223
+ // and exp is finite and non-integer.
2224
+ pow = pselect(pandnot(x_is_negative, y_is_int), cst_nan, pow);
2225
+
2226
+ // * pow(±0, exp), where exp is negative, finite, and is an even integer or
2227
+ // a non-integer, returns +∞
2228
+ // * pow(±0, exp), where exp is positive non-integer or a positive even
2229
+ // integer, returns +0
2230
+ // * pow(+0, exp), where exp is a negative odd integer, returns +∞
2231
+ // * pow(-0, exp), where exp is a negative odd integer, returns -∞
2232
+ // * pow(+0, exp), where exp is a positive odd integer, returns +0
2233
+ // * pow(-0, exp), where exp is a positive odd integer, returns -0
2234
+ // Sign is flipped by the rule below.
2235
+ pow = pselect(x_is_zero, pselect(y_is_negative, cst_inf, cst_zero), pow);
2236
+
2237
+ // pow(base, exp) returns -pow(abs(base), exp) if base has the sign bit set,
2238
+ // and exp is an odd integer exponent.
2239
+ pow = pselect(pand(x_has_signbit, y_is_odd_int), pnegate(pow), pow);
2240
+
2241
+ // * pow(base, -∞) returns +∞ for any |base|<1
2242
+ // * pow(base, -∞) returns +0 for any |base|>1
2243
+ // * pow(base, +∞) returns +0 for any |base|<1
2244
+ // * pow(base, +∞) returns +∞ for any |base|>1
2245
+ // * pow(±0, -∞) returns +∞
2246
+ // * pow(-1, +-∞) = 1
2247
+ Packet inf_y_val = pselect(por(pand(y_is_negative, x_is_zero), pxor(y_is_negative, x_abs_gt_one)), cst_inf, cst_zero);
2248
+ inf_y_val = pselect(pcmp_eq(x, pset1<Packet>(Scalar(-1.0))), cst_one, inf_y_val);
2249
+ pow = pselect(y_abs_is_huge, inf_y_val, pow);
2250
+
2251
+ // * pow(+∞, exp) returns +0 for any negative exp
2252
+ // * pow(+∞, exp) returns +∞ for any positive exp
2253
+ // * pow(-∞, exp) returns -0 if exp is a negative odd integer.
2254
+ // * pow(-∞, exp) returns +0 if exp is a negative non-integer or negative
2255
+ // even integer.
2256
+ // * pow(-∞, exp) returns -∞ if exp is a positive odd integer.
2257
+ // * pow(-∞, exp) returns +∞ if exp is a positive non-integer or positive
2258
+ // even integer.
2259
+ auto x_pos_inf_value = pselect(y_is_negative, cst_zero, cst_inf);
2260
+ auto x_neg_inf_value = pselect(y_is_odd_int, pnegate(x_pos_inf_value), x_pos_inf_value);
2261
+ pow = pselect(x_abs_is_inf, pselect(x_is_negative, x_neg_inf_value, x_pos_inf_value), pow);
2262
+
2263
+ // All cases of NaN inputs return NaN, except the two below.
2264
+ pow = pselect(por(pisnan(x), pisnan(y)), cst_nan, pow);
2265
+
2266
+ // * pow(base, 1) returns base.
2267
+ // * pow(base, +/-0) returns 1, regardless of base, even NaN.
2268
+ // * pow(+1, exp) returns 1, regardless of exponent, even NaN.
2269
+ pow = pselect(y_is_one, x, pselect(por(x_is_one, y_is_zero), cst_one, pow));
2270
+
2271
+ return pow;
2272
+ }
2273
+
2274
+ template <typename Scalar>
2275
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS std::enable_if_t<is_scalar<Scalar>::value, Scalar> generic_pow(
2276
+ const Scalar& x, const Scalar& y) {
2277
+ return numext::pow(x, y);
2278
+ }
2279
+
2280
+ namespace unary_pow {
2281
+
2282
+ template <typename ScalarExponent, bool IsInteger = NumTraits<ScalarExponent>::IsInteger>
2283
+ struct exponent_helper {
2284
+ using safe_abs_type = ScalarExponent;
2285
+ static constexpr ScalarExponent one_half = ScalarExponent(0.5);
2286
+ // these routines assume that exp is an integer stored as a floating point type
2287
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarExponent safe_abs(const ScalarExponent& exp) {
2288
+ return numext::abs(exp);
2289
+ }
2290
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool is_odd(const ScalarExponent& exp) {
2291
+ eigen_assert(((numext::isfinite)(exp) && exp == numext::floor(exp)) && "exp must be an integer");
2292
+ ScalarExponent exp_div_2 = exp * one_half;
2293
+ ScalarExponent floor_exp_div_2 = numext::floor(exp_div_2);
2294
+ return exp_div_2 != floor_exp_div_2;
2295
+ }
2296
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarExponent floor_div_two(const ScalarExponent& exp) {
2297
+ ScalarExponent exp_div_2 = exp * one_half;
2298
+ return numext::floor(exp_div_2);
2299
+ }
2300
+ };
2301
+
2302
+ template <typename ScalarExponent>
2303
+ struct exponent_helper<ScalarExponent, true> {
2304
+ // if `exp` is a signed integer type, cast it to its unsigned counterpart to safely store its absolute value
2305
+ // consider the (rare) case where `exp` is an int32_t: abs(-2147483648) != 2147483648
2306
+ using safe_abs_type = typename numext::get_integer_by_size<sizeof(ScalarExponent)>::unsigned_type;
2307
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE safe_abs_type safe_abs(const ScalarExponent& exp) {
2308
+ ScalarExponent mask = numext::signbit(exp);
2309
+ safe_abs_type result = safe_abs_type(exp ^ mask);
2310
+ return result + safe_abs_type(ScalarExponent(1) & mask);
2311
+ }
2312
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool is_odd(const safe_abs_type& exp) {
2313
+ return exp % safe_abs_type(2) != safe_abs_type(0);
2314
+ }
2315
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE safe_abs_type floor_div_two(const safe_abs_type& exp) {
2316
+ return exp >> safe_abs_type(1);
2317
+ }
2318
+ };
2319
+
2320
+ template <typename Packet, typename ScalarExponent,
2321
+ bool ReciprocateIfExponentIsNegative =
2322
+ !NumTraits<typename unpacket_traits<Packet>::type>::IsInteger && NumTraits<ScalarExponent>::IsSigned>
2323
+ struct reciprocate {
2324
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
2325
+ using Scalar = typename unpacket_traits<Packet>::type;
2326
+ const Packet cst_pos_one = pset1<Packet>(Scalar(1));
2327
+ return exponent < 0 ? pdiv(cst_pos_one, x) : x;
2328
+ }
2329
+ };
2330
+
2331
+ template <typename Packet, typename ScalarExponent>
2332
+ struct reciprocate<Packet, ScalarExponent, false> {
2333
+ // pdiv not defined, nor necessary for integer base types
2334
+ // if the exponent is unsigned, then the exponent cannot be negative
2335
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent&) { return x; }
2336
+ };
2337
+
2338
+ template <typename Packet, typename ScalarExponent>
2339
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet int_pow(const Packet& x, const ScalarExponent& exponent) {
2340
+ using Scalar = typename unpacket_traits<Packet>::type;
2341
+ using ExponentHelper = exponent_helper<ScalarExponent>;
2342
+ using AbsExponentType = typename ExponentHelper::safe_abs_type;
2343
+ const Packet cst_pos_one = pset1<Packet>(Scalar(1));
2344
+ if (exponent == ScalarExponent(0)) return cst_pos_one;
2345
+
2346
+ Packet result = reciprocate<Packet, ScalarExponent>::run(x, exponent);
2347
+ Packet y = cst_pos_one;
2348
+ AbsExponentType m = ExponentHelper::safe_abs(exponent);
2349
+
2350
+ while (m > 1) {
2351
+ bool odd = ExponentHelper::is_odd(m);
2352
+ if (odd) y = pmul(y, result);
2353
+ result = pmul(result, result);
2354
+ m = ExponentHelper::floor_div_two(m);
2355
+ }
2356
+
2357
+ return pmul(y, result);
2358
+ }
2359
+
2360
+ template <typename Packet>
2361
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!is_scalar<Packet>::value, Packet> gen_pow(
2362
+ const Packet& x, const typename unpacket_traits<Packet>::type& exponent) {
2363
+ const Packet exponent_packet = pset1<Packet>(exponent);
2364
+ return generic_pow_impl(x, exponent_packet);
2365
+ }
2366
+
2367
+ template <typename Scalar>
2368
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<is_scalar<Scalar>::value, Scalar> gen_pow(
2369
+ const Scalar& x, const Scalar& exponent) {
2370
+ return numext::pow(x, exponent);
2371
+ }
2372
+
2373
+ template <typename Packet, typename ScalarExponent>
2374
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_nonint_nonint_errors(const Packet& x, const Packet& powx,
2375
+ const ScalarExponent& exponent) {
2376
+ using Scalar = typename unpacket_traits<Packet>::type;
2377
+
2378
+ // non-integer base and exponent case
2379
+ const Packet cst_pos_zero = pzero(x);
2380
+ const Packet cst_pos_one = pset1<Packet>(Scalar(1));
2381
+ const Packet cst_pos_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
2382
+ const Packet cst_true = ptrue<Packet>(x);
2383
+
2384
+ const bool exponent_is_not_fin = !(numext::isfinite)(exponent);
2385
+ const bool exponent_is_neg = exponent < ScalarExponent(0);
2386
+ const bool exponent_is_pos = exponent > ScalarExponent(0);
2387
+
2388
+ const Packet exp_is_not_fin = exponent_is_not_fin ? cst_true : cst_pos_zero;
2389
+ const Packet exp_is_neg = exponent_is_neg ? cst_true : cst_pos_zero;
2390
+ const Packet exp_is_pos = exponent_is_pos ? cst_true : cst_pos_zero;
2391
+ const Packet exp_is_inf = pand(exp_is_not_fin, por(exp_is_neg, exp_is_pos));
2392
+ const Packet exp_is_nan = pandnot(exp_is_not_fin, por(exp_is_neg, exp_is_pos));
2393
+
2394
+ const Packet x_is_le_zero = pcmp_le(x, cst_pos_zero);
2395
+ const Packet x_is_ge_zero = pcmp_le(cst_pos_zero, x);
2396
+ const Packet x_is_zero = pand(x_is_le_zero, x_is_ge_zero);
2397
+
2398
+ const Packet abs_x = pabs(x);
2399
+ const Packet abs_x_is_le_one = pcmp_le(abs_x, cst_pos_one);
2400
+ const Packet abs_x_is_ge_one = pcmp_le(cst_pos_one, abs_x);
2401
+ const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
2402
+ const Packet abs_x_is_one = pand(abs_x_is_le_one, abs_x_is_ge_one);
2403
+
2404
+ Packet pow_is_inf_if_exp_is_neg = por(x_is_zero, pand(abs_x_is_le_one, exp_is_inf));
2405
+ Packet pow_is_inf_if_exp_is_pos = por(abs_x_is_inf, pand(abs_x_is_ge_one, exp_is_inf));
2406
+ Packet pow_is_one = pand(abs_x_is_one, por(exp_is_inf, x_is_ge_zero));
2407
+
2408
+ Packet result = powx;
2409
+ result = por(x_is_le_zero, result);
2410
+ result = pselect(pow_is_inf_if_exp_is_neg, pand(cst_pos_inf, exp_is_neg), result);
2411
+ result = pselect(pow_is_inf_if_exp_is_pos, pand(cst_pos_inf, exp_is_pos), result);
2412
+ result = por(exp_is_nan, result);
2413
+ result = pselect(pow_is_one, cst_pos_one, result);
2414
+ return result;
2415
+ }
2416
+
2417
+ template <typename Packet, typename ScalarExponent,
2418
+ std::enable_if_t<NumTraits<typename unpacket_traits<Packet>::type>::IsSigned, bool> = true>
2419
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_negative_exponent(const Packet& x, const ScalarExponent& exponent) {
2420
+ using Scalar = typename unpacket_traits<Packet>::type;
2421
+
2422
+ // signed integer base, signed integer exponent case
2423
+
2424
+ // This routine handles negative exponents.
2425
+ // The return value is either 0, 1, or -1.
2426
+ const Packet cst_pos_one = pset1<Packet>(Scalar(1));
2427
+ const bool exponent_is_odd = exponent % ScalarExponent(2) != ScalarExponent(0);
2428
+ const Packet exp_is_odd = exponent_is_odd ? ptrue<Packet>(x) : pzero<Packet>(x);
2429
+
2430
+ const Packet abs_x = pabs(x);
2431
+ const Packet abs_x_is_one = pcmp_eq(abs_x, cst_pos_one);
2432
+
2433
+ Packet result = pselect(exp_is_odd, x, abs_x);
2434
+ result = pselect(abs_x_is_one, result, pzero<Packet>(x));
2435
+ return result;
2436
+ }
2437
+
2438
+ template <typename Packet, typename ScalarExponent,
2439
+ std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsSigned, bool> = true>
2440
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_negative_exponent(const Packet& x, const ScalarExponent&) {
2441
+ using Scalar = typename unpacket_traits<Packet>::type;
2442
+
2443
+ // unsigned integer base, signed integer exponent case
2444
+
2445
+ // This routine handles negative exponents.
2446
+ // The return value is either 0 or 1
2447
+
2448
+ const Scalar pos_one = Scalar(1);
2449
+
2450
+ const Packet cst_pos_one = pset1<Packet>(pos_one);
2451
+
2452
+ const Packet x_is_one = pcmp_eq(x, cst_pos_one);
2453
+
2454
+ return pand(x_is_one, x);
2455
+ }
2456
+
2457
+ } // end namespace unary_pow
2458
+
2459
+ template <typename Packet, typename ScalarExponent,
2460
+ bool BaseIsIntegerType = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger,
2461
+ bool ExponentIsIntegerType = NumTraits<ScalarExponent>::IsInteger,
2462
+ bool ExponentIsSigned = NumTraits<ScalarExponent>::IsSigned>
2463
+ struct unary_pow_impl;
2464
+
2465
+ template <typename Packet, typename ScalarExponent, bool ExponentIsSigned>
2466
+ struct unary_pow_impl<Packet, ScalarExponent, false, false, ExponentIsSigned> {
2467
+ typedef typename unpacket_traits<Packet>::type Scalar;
2468
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
2469
+ const bool exponent_is_integer = (numext::isfinite)(exponent) && numext::round(exponent) == exponent;
2470
+ if (exponent_is_integer) {
2471
+ // The simple recursive doubling implementation is only accurate to 3 ulps
2472
+ // for integer exponents in [-3:7]. Since this is a common case, we
2473
+ // specialize it here.
2474
+ bool use_repeated_squaring =
2475
+ (exponent <= ScalarExponent(7) && (!ExponentIsSigned || exponent >= ScalarExponent(-3)));
2476
+ return use_repeated_squaring ? unary_pow::int_pow(x, exponent) : generic_pow(x, pset1<Packet>(exponent));
2477
+ } else {
2478
+ Packet result = unary_pow::gen_pow(x, exponent);
2479
+ result = unary_pow::handle_nonint_nonint_errors(x, result, exponent);
2480
+ return result;
2481
+ }
2482
+ }
2483
+ };
2484
+
2485
+ template <typename Packet, typename ScalarExponent, bool ExponentIsSigned>
2486
+ struct unary_pow_impl<Packet, ScalarExponent, false, true, ExponentIsSigned> {
2487
+ typedef typename unpacket_traits<Packet>::type Scalar;
2488
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
2489
+ return unary_pow::int_pow(x, exponent);
2490
+ }
2491
+ };
2492
+
2493
+ template <typename Packet, typename ScalarExponent>
2494
+ struct unary_pow_impl<Packet, ScalarExponent, true, true, true> {
2495
+ typedef typename unpacket_traits<Packet>::type Scalar;
2496
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
2497
+ if (exponent < ScalarExponent(0)) {
2498
+ return unary_pow::handle_negative_exponent(x, exponent);
2499
+ } else {
2500
+ return unary_pow::int_pow(x, exponent);
2501
+ }
2502
+ }
2503
+ };
2504
+
2505
+ template <typename Packet, typename ScalarExponent>
2506
+ struct unary_pow_impl<Packet, ScalarExponent, true, true, false> {
2507
+ typedef typename unpacket_traits<Packet>::type Scalar;
2508
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
2509
+ return unary_pow::int_pow(x, exponent);
2510
+ }
2511
+ };
2512
+
2513
+ // This function computes exp2(x) = exp(ln(2) * x).
2514
+ // To improve accuracy, the product ln(2)*x is computed using the twoprod
2515
+ // algorithm, such that ln(2) * x = p_hi + p_lo holds exactly. Then exp2(x) is
2516
+ // computed as exp2(x) = exp(p_hi) * exp(p_lo) ~= exp(p_hi) * (1 + p_lo). This
2517
+ // correction step this reduces the maximum absolute error as follows:
2518
+ //
2519
+ // type | max error (simple product) | max error (twoprod) |
2520
+ // -----------------------------------------------------------
2521
+ // float | 35 ulps | 4 ulps |
2522
+ // double | 363 ulps | 110 ulps |
2523
+ //
2524
+ template <typename Packet>
2525
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_exp2(const Packet& _x) {
2526
+ typedef typename unpacket_traits<Packet>::type Scalar;
2527
+ constexpr int max_exponent = std::numeric_limits<Scalar>::max_exponent;
2528
+ constexpr int digits = std::numeric_limits<Scalar>::digits;
2529
+ constexpr Scalar max_cap = Scalar(max_exponent + 1);
2530
+ constexpr Scalar min_cap = -Scalar(max_exponent + digits - 1);
2531
+ Packet x = pmax(pmin(_x, pset1<Packet>(max_cap)), pset1<Packet>(min_cap));
2532
+ Packet p_hi, p_lo;
2533
+ twoprod(pset1<Packet>(Scalar(EIGEN_LN2)), x, p_hi, p_lo);
2534
+ Packet exp2_hi = pexp(p_hi);
2535
+ Packet exp2_lo = padd(pset1<Packet>(Scalar(1)), p_lo);
2536
+ return pmul(exp2_hi, exp2_lo);
2537
+ }
2538
+
2539
+ template <typename Packet>
2540
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_rint(const Packet& a) {
2541
+ using Scalar = typename unpacket_traits<Packet>::type;
2542
+ using IntType = typename numext::get_integer_by_size<sizeof(Scalar)>::signed_type;
2543
+ // Adds and subtracts signum(a) * 2^kMantissaBits to force rounding.
2544
+ const IntType kLimit = IntType(1) << (NumTraits<Scalar>::digits() - 1);
2545
+ const Packet cst_limit = pset1<Packet>(static_cast<Scalar>(kLimit));
2546
+ Packet abs_a = pabs(a);
2547
+ Packet sign_a = pandnot(a, abs_a);
2548
+ Packet rint_a = padd(abs_a, cst_limit);
2549
+ // Don't compile-away addition and subtraction.
2550
+ EIGEN_OPTIMIZATION_BARRIER(rint_a);
2551
+ rint_a = psub(rint_a, cst_limit);
2552
+ rint_a = por(rint_a, sign_a);
2553
+ // If greater than limit (or NaN), simply return a.
2554
+ Packet mask = pcmp_lt(abs_a, cst_limit);
2555
+ Packet result = pselect(mask, rint_a, a);
2556
+ return result;
2557
+ }
2558
+
2559
+ template <typename Packet>
2560
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_floor(const Packet& a) {
2561
+ using Scalar = typename unpacket_traits<Packet>::type;
2562
+ const Packet cst_1 = pset1<Packet>(Scalar(1));
2563
+ Packet rint_a = generic_rint(a);
2564
+ // if a < rint(a), then rint(a) == ceil(a)
2565
+ Packet mask = pcmp_lt(a, rint_a);
2566
+ Packet offset = pand(cst_1, mask);
2567
+ Packet result = psub(rint_a, offset);
2568
+ return result;
2569
+ }
2570
+
2571
+ template <typename Packet>
2572
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_ceil(const Packet& a) {
2573
+ using Scalar = typename unpacket_traits<Packet>::type;
2574
+ const Packet cst_1 = pset1<Packet>(Scalar(1));
2575
+ const Packet sign_mask = pset1<Packet>(static_cast<Scalar>(-0.0));
2576
+ Packet rint_a = generic_rint(a);
2577
+ // if rint(a) < a, then rint(a) == floor(a)
2578
+ Packet mask = pcmp_lt(rint_a, a);
2579
+ Packet offset = pand(cst_1, mask);
2580
+ Packet result = padd(rint_a, offset);
2581
+ // Signed zero must remain signed (e.g. ceil(-0.02) == -0).
2582
+ result = por(result, pand(sign_mask, a));
2583
+ return result;
2584
+ }
2585
+
2586
+ template <typename Packet>
2587
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_trunc(const Packet& a) {
2588
+ Packet abs_a = pabs(a);
2589
+ Packet sign_a = pandnot(a, abs_a);
2590
+ Packet floor_abs_a = generic_floor(abs_a);
2591
+ Packet result = por(floor_abs_a, sign_a);
2592
+ return result;
2593
+ }
2594
+
2595
+ template <typename Packet>
2596
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_round(const Packet& a) {
2597
+ using Scalar = typename unpacket_traits<Packet>::type;
2598
+ const Packet cst_half = pset1<Packet>(Scalar(0.5));
2599
+ const Packet cst_1 = pset1<Packet>(Scalar(1));
2600
+ Packet abs_a = pabs(a);
2601
+ Packet sign_a = pandnot(a, abs_a);
2602
+ Packet floor_abs_a = generic_floor(abs_a);
2603
+ Packet diff = psub(abs_a, floor_abs_a);
2604
+ Packet mask = pcmp_le(cst_half, diff);
2605
+ Packet offset = pand(cst_1, mask);
2606
+ Packet result = padd(floor_abs_a, offset);
2607
+ result = por(result, sign_a);
2608
+ return result;
2609
+ }
2610
+
2611
+ template <typename Packet>
2612
+ struct nearest_integer_packetop_impl<Packet, /*IsScalar*/ false, /*IsInteger*/ false> {
2613
+ using Scalar = typename unpacket_traits<Packet>::type;
2614
+ static_assert(packet_traits<Scalar>::HasRound, "Generic nearest integer functions are disabled for this type.");
2615
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_floor(const Packet& x) { return generic_floor(x); }
2616
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_ceil(const Packet& x) { return generic_ceil(x); }
2617
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_rint(const Packet& x) { return generic_rint(x); }
2618
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_round(const Packet& x) { return generic_round(x); }
2619
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_trunc(const Packet& x) { return generic_trunc(x); }
2620
+ };
2621
+
2622
+ template <typename Packet>
2623
+ struct nearest_integer_packetop_impl<Packet, /*IsScalar*/ false, /*IsInteger*/ true> {
2624
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_floor(const Packet& x) { return x; }
2625
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_ceil(const Packet& x) { return x; }
2626
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_rint(const Packet& x) { return x; }
2627
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_round(const Packet& x) { return x; }
2628
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_trunc(const Packet& x) { return x; }
2629
+ };
2630
+
2631
+ } // end namespace internal
2632
+ } // end namespace Eigen
2633
+
2634
+ #endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H