@smake/eigen 1.1.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431) hide show
  1. package/README.md +1 -1
  2. package/eigen/Eigen/AccelerateSupport +52 -0
  3. package/eigen/Eigen/Cholesky +18 -20
  4. package/eigen/Eigen/CholmodSupport +28 -28
  5. package/eigen/Eigen/Core +187 -120
  6. package/eigen/Eigen/Eigenvalues +16 -13
  7. package/eigen/Eigen/Geometry +18 -18
  8. package/eigen/Eigen/Householder +9 -7
  9. package/eigen/Eigen/IterativeLinearSolvers +8 -4
  10. package/eigen/Eigen/Jacobi +14 -13
  11. package/eigen/Eigen/KLUSupport +23 -21
  12. package/eigen/Eigen/LU +15 -16
  13. package/eigen/Eigen/MetisSupport +12 -12
  14. package/eigen/Eigen/OrderingMethods +54 -51
  15. package/eigen/Eigen/PaStiXSupport +23 -21
  16. package/eigen/Eigen/PardisoSupport +17 -14
  17. package/eigen/Eigen/QR +18 -20
  18. package/eigen/Eigen/QtAlignedMalloc +5 -12
  19. package/eigen/Eigen/SPQRSupport +21 -14
  20. package/eigen/Eigen/SVD +23 -17
  21. package/eigen/Eigen/Sparse +1 -2
  22. package/eigen/Eigen/SparseCholesky +18 -15
  23. package/eigen/Eigen/SparseCore +18 -17
  24. package/eigen/Eigen/SparseLU +9 -9
  25. package/eigen/Eigen/SparseQR +16 -14
  26. package/eigen/Eigen/StdDeque +5 -2
  27. package/eigen/Eigen/StdList +5 -2
  28. package/eigen/Eigen/StdVector +5 -2
  29. package/eigen/Eigen/SuperLUSupport +30 -24
  30. package/eigen/Eigen/ThreadPool +80 -0
  31. package/eigen/Eigen/UmfPackSupport +19 -17
  32. package/eigen/Eigen/Version +14 -0
  33. package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
  34. package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
  35. package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
  36. package/eigen/Eigen/src/Cholesky/LDLT.h +366 -405
  37. package/eigen/Eigen/src/Cholesky/LLT.h +323 -367
  38. package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
  39. package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +585 -529
  40. package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
  41. package/eigen/Eigen/src/Core/ArithmeticSequence.h +143 -317
  42. package/eigen/Eigen/src/Core/Array.h +329 -370
  43. package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
  44. package/eigen/Eigen/src/Core/ArrayWrapper.h +126 -170
  45. package/eigen/Eigen/src/Core/Assign.h +30 -40
  46. package/eigen/Eigen/src/Core/AssignEvaluator.h +651 -604
  47. package/eigen/Eigen/src/Core/Assign_MKL.h +125 -120
  48. package/eigen/Eigen/src/Core/BandMatrix.h +267 -282
  49. package/eigen/Eigen/src/Core/Block.h +371 -390
  50. package/eigen/Eigen/src/Core/CommaInitializer.h +85 -100
  51. package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
  52. package/eigen/Eigen/src/Core/CoreEvaluators.h +1214 -937
  53. package/eigen/Eigen/src/Core/CoreIterators.h +72 -63
  54. package/eigen/Eigen/src/Core/CwiseBinaryOp.h +112 -129
  55. package/eigen/Eigen/src/Core/CwiseNullaryOp.h +676 -702
  56. package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
  57. package/eigen/Eigen/src/Core/CwiseUnaryOp.h +55 -67
  58. package/eigen/Eigen/src/Core/CwiseUnaryView.h +127 -92
  59. package/eigen/Eigen/src/Core/DenseBase.h +630 -658
  60. package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -628
  61. package/eigen/Eigen/src/Core/DenseStorage.h +511 -590
  62. package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
  63. package/eigen/Eigen/src/Core/Diagonal.h +168 -207
  64. package/eigen/Eigen/src/Core/DiagonalMatrix.h +346 -317
  65. package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
  66. package/eigen/Eigen/src/Core/Dot.h +167 -217
  67. package/eigen/Eigen/src/Core/EigenBase.h +74 -85
  68. package/eigen/Eigen/src/Core/Fill.h +138 -0
  69. package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
  70. package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -113
  71. package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
  72. package/eigen/Eigen/src/Core/GeneralProduct.h +315 -261
  73. package/eigen/Eigen/src/Core/GenericPacketMath.h +1182 -520
  74. package/eigen/Eigen/src/Core/GlobalFunctions.h +193 -157
  75. package/eigen/Eigen/src/Core/IO.h +131 -156
  76. package/eigen/Eigen/src/Core/IndexedView.h +209 -125
  77. package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
  78. package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
  79. package/eigen/Eigen/src/Core/Inverse.h +50 -59
  80. package/eigen/Eigen/src/Core/Map.h +123 -141
  81. package/eigen/Eigen/src/Core/MapBase.h +255 -282
  82. package/eigen/Eigen/src/Core/MathFunctions.h +1247 -1201
  83. package/eigen/Eigen/src/Core/MathFunctionsImpl.h +162 -99
  84. package/eigen/Eigen/src/Core/Matrix.h +463 -494
  85. package/eigen/Eigen/src/Core/MatrixBase.h +468 -470
  86. package/eigen/Eigen/src/Core/NestByValue.h +58 -52
  87. package/eigen/Eigen/src/Core/NoAlias.h +79 -86
  88. package/eigen/Eigen/src/Core/NumTraits.h +206 -206
  89. package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +163 -142
  90. package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
  91. package/eigen/Eigen/src/Core/PlainObjectBase.h +858 -972
  92. package/eigen/Eigen/src/Core/Product.h +246 -130
  93. package/eigen/Eigen/src/Core/ProductEvaluators.h +779 -671
  94. package/eigen/Eigen/src/Core/Random.h +153 -164
  95. package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
  96. package/eigen/Eigen/src/Core/RealView.h +250 -0
  97. package/eigen/Eigen/src/Core/Redux.h +334 -314
  98. package/eigen/Eigen/src/Core/Ref.h +259 -257
  99. package/eigen/Eigen/src/Core/Replicate.h +92 -104
  100. package/eigen/Eigen/src/Core/Reshaped.h +215 -271
  101. package/eigen/Eigen/src/Core/ReturnByValue.h +47 -55
  102. package/eigen/Eigen/src/Core/Reverse.h +133 -148
  103. package/eigen/Eigen/src/Core/Select.h +68 -140
  104. package/eigen/Eigen/src/Core/SelfAdjointView.h +254 -290
  105. package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
  106. package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
  107. package/eigen/Eigen/src/Core/Solve.h +88 -102
  108. package/eigen/Eigen/src/Core/SolveTriangular.h +126 -124
  109. package/eigen/Eigen/src/Core/SolverBase.h +132 -133
  110. package/eigen/Eigen/src/Core/StableNorm.h +113 -147
  111. package/eigen/Eigen/src/Core/StlIterators.h +404 -248
  112. package/eigen/Eigen/src/Core/Stride.h +90 -92
  113. package/eigen/Eigen/src/Core/Swap.h +70 -39
  114. package/eigen/Eigen/src/Core/Transpose.h +258 -295
  115. package/eigen/Eigen/src/Core/Transpositions.h +270 -333
  116. package/eigen/Eigen/src/Core/TriangularMatrix.h +642 -743
  117. package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
  118. package/eigen/Eigen/src/Core/VectorwiseOp.h +653 -704
  119. package/eigen/Eigen/src/Core/Visitor.h +464 -308
  120. package/eigen/Eigen/src/Core/arch/AVX/Complex.h +380 -187
  121. package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +65 -163
  122. package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2145 -638
  123. package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
  124. package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +253 -60
  125. package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +278 -228
  126. package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
  127. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +48 -269
  128. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
  129. package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1597 -754
  130. package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
  131. package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
  132. package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
  133. package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
  134. package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +229 -41
  135. package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
  136. package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +420 -184
  137. package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +40 -49
  138. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2962 -2213
  139. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +196 -212
  140. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +713 -441
  141. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
  142. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
  143. package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2380 -1362
  144. package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
  145. package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +390 -224
  146. package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +78 -67
  147. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1784 -799
  148. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +167 -50
  149. package/eigen/Eigen/src/Core/arch/Default/Half.h +528 -379
  150. package/eigen/Eigen/src/Core/arch/Default/Settings.h +10 -12
  151. package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
  152. package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +41 -40
  153. package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +550 -523
  154. package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
  155. package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +27 -30
  156. package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +8 -8
  157. package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
  158. package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
  159. package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
  160. package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
  161. package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
  162. package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
  163. package/eigen/Eigen/src/Core/arch/MSA/Complex.h +54 -82
  164. package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +84 -92
  165. package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +51 -47
  166. package/eigen/Eigen/src/Core/arch/NEON/Complex.h +454 -306
  167. package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +175 -115
  168. package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +23 -30
  169. package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4366 -2857
  170. package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +616 -393
  171. package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
  172. package/eigen/Eigen/src/Core/arch/SSE/Complex.h +350 -198
  173. package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +38 -149
  174. package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +1791 -912
  175. package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
  176. package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +128 -40
  177. package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +10 -6
  178. package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +156 -234
  179. package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +6 -3
  180. package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +27 -32
  181. package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +119 -117
  182. package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +325 -419
  183. package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +15 -17
  184. package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +325 -181
  185. package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +94 -83
  186. package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +811 -458
  187. package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +121 -124
  188. package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +576 -370
  189. package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +194 -109
  190. package/eigen/Eigen/src/Core/functors/StlFunctors.h +95 -112
  191. package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
  192. package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1038 -749
  193. package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1883 -1375
  194. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +312 -370
  195. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +189 -176
  196. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +84 -81
  197. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
  198. package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +292 -337
  199. package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
  200. package/eigen/Eigen/src/Core/products/Parallelizer.h +207 -105
  201. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +327 -388
  202. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
  203. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +138 -147
  204. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
  205. package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
  206. package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -47
  207. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
  208. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
  209. package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
  210. package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
  211. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -277
  212. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
  213. package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +68 -94
  214. package/eigen/Eigen/src/Core/util/Assert.h +158 -0
  215. package/eigen/Eigen/src/Core/util/BlasUtil.h +342 -303
  216. package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +348 -317
  217. package/eigen/Eigen/src/Core/util/Constants.h +297 -262
  218. package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -90
  219. package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
  220. package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +449 -247
  221. package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
  222. package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
  223. package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +417 -116
  224. package/eigen/Eigen/src/Core/util/IntegralConstant.h +211 -204
  225. package/eigen/Eigen/src/Core/util/MKL_support.h +39 -37
  226. package/eigen/Eigen/src/Core/util/Macros.h +655 -773
  227. package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
  228. package/eigen/Eigen/src/Core/util/Memory.h +970 -748
  229. package/eigen/Eigen/src/Core/util/Meta.h +581 -633
  230. package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
  231. package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
  232. package/eigen/Eigen/src/Core/util/ReshapedHelper.h +17 -17
  233. package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
  234. package/eigen/Eigen/src/Core/util/StaticAssert.h +50 -166
  235. package/eigen/Eigen/src/Core/util/SymbolicIndex.h +377 -225
  236. package/eigen/Eigen/src/Core/util/XprHelper.h +784 -547
  237. package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
  238. package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
  239. package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
  240. package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
  241. package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
  242. package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
  243. package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
  244. package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
  245. package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +89 -105
  246. package/eigen/Eigen/src/Eigenvalues/RealQZ.h +537 -607
  247. package/eigen/Eigen/src/Eigenvalues/RealSchur.h +342 -381
  248. package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
  249. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +541 -595
  250. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
  251. package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +430 -462
  252. package/eigen/Eigen/src/Geometry/AlignedBox.h +226 -227
  253. package/eigen/Eigen/src/Geometry/AngleAxis.h +131 -133
  254. package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
  255. package/eigen/Eigen/src/Geometry/Homogeneous.h +285 -333
  256. package/eigen/Eigen/src/Geometry/Hyperplane.h +151 -160
  257. package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
  258. package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -146
  259. package/eigen/Eigen/src/Geometry/ParametrizedLine.h +127 -127
  260. package/eigen/Eigen/src/Geometry/Quaternion.h +566 -506
  261. package/eigen/Eigen/src/Geometry/Rotation2D.h +107 -105
  262. package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
  263. package/eigen/Eigen/src/Geometry/Scaling.h +113 -106
  264. package/eigen/Eigen/src/Geometry/Transform.h +858 -936
  265. package/eigen/Eigen/src/Geometry/Translation.h +94 -92
  266. package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
  267. package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +90 -104
  268. package/eigen/Eigen/src/Householder/BlockHouseholder.h +51 -46
  269. package/eigen/Eigen/src/Householder/Householder.h +102 -124
  270. package/eigen/Eigen/src/Householder/HouseholderSequence.h +412 -453
  271. package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
  272. package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +149 -162
  273. package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +124 -119
  274. package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +92 -104
  275. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +251 -243
  276. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +224 -228
  277. package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
  278. package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +178 -227
  279. package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +79 -84
  280. package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +54 -60
  281. package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
  282. package/eigen/Eigen/src/Jacobi/Jacobi.h +252 -308
  283. package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
  284. package/eigen/Eigen/src/KLUSupport/KLUSupport.h +208 -227
  285. package/eigen/Eigen/src/LU/Determinant.h +50 -69
  286. package/eigen/Eigen/src/LU/FullPivLU.h +545 -596
  287. package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
  288. package/eigen/Eigen/src/LU/InverseImpl.h +206 -285
  289. package/eigen/Eigen/src/LU/PartialPivLU.h +390 -428
  290. package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
  291. package/eigen/Eigen/src/LU/arch/InverseSize4.h +72 -70
  292. package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
  293. package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
  294. package/eigen/Eigen/src/OrderingMethods/Amd.h +243 -265
  295. package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +831 -1004
  296. package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
  297. package/eigen/Eigen/src/OrderingMethods/Ordering.h +112 -119
  298. package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
  299. package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
  300. package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
  301. package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -430
  302. package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +479 -479
  303. package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
  304. package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +166 -153
  305. package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +495 -475
  306. package/eigen/Eigen/src/QR/HouseholderQR.h +394 -285
  307. package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
  308. package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
  309. package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
  310. package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +244 -264
  311. package/eigen/Eigen/src/SVD/BDCSVD.h +817 -713
  312. package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
  313. package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
  314. package/eigen/Eigen/src/SVD/JacobiSVD.h +577 -543
  315. package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
  316. package/eigen/Eigen/src/SVD/SVDBase.h +242 -182
  317. package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +200 -235
  318. package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
  319. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +765 -594
  320. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +308 -94
  321. package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
  322. package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -252
  323. package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +134 -178
  324. package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
  325. package/eigen/Eigen/src/SparseCore/SparseAssign.h +149 -140
  326. package/eigen/Eigen/src/SparseCore/SparseBlock.h +403 -440
  327. package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
  328. package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +525 -303
  329. package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +555 -339
  330. package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
  331. package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +169 -197
  332. package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
  333. package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
  334. package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
  335. package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
  336. package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1603 -1245
  337. package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -350
  338. package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
  339. package/eigen/Eigen/src/SparseCore/SparseProduct.h +94 -97
  340. package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
  341. package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
  342. package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +370 -416
  343. package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
  344. package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
  345. package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
  346. package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
  347. package/eigen/Eigen/src/SparseCore/SparseUtil.h +138 -115
  348. package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
  349. package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
  350. package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
  351. package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
  352. package/eigen/Eigen/src/SparseLU/SparseLU.h +756 -710
  353. package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
  354. package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
  355. package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
  356. package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +245 -301
  357. package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
  358. package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
  359. package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +89 -100
  360. package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
  361. package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
  362. package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
  363. package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +124 -132
  364. package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
  365. package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
  366. package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
  367. package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
  368. package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
  369. package/eigen/Eigen/src/SparseQR/SparseQR.h +450 -502
  370. package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -93
  371. package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
  372. package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
  373. package/eigen/Eigen/src/StlSupport/details.h +48 -50
  374. package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
  375. package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -730
  376. package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
  377. package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
  378. package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
  379. package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
  380. package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
  381. package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
  382. package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
  383. package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
  384. package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
  385. package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
  386. package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
  387. package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
  388. package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
  389. package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +428 -464
  390. package/eigen/Eigen/src/misc/Image.h +41 -43
  391. package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
  392. package/eigen/Eigen/src/misc/Kernel.h +39 -41
  393. package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
  394. package/eigen/Eigen/src/misc/blas.h +83 -426
  395. package/eigen/Eigen/src/misc/lapacke.h +9972 -16179
  396. package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
  397. package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
  398. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
  399. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
  400. package/eigen/Eigen/src/plugins/{BlockMethods.h → BlockMethods.inc} +434 -506
  401. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
  402. package/eigen/Eigen/src/plugins/{CommonCwiseUnaryOps.h → CommonCwiseUnaryOps.inc} +58 -68
  403. package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
  404. package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
  405. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
  406. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
  407. package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
  408. package/package.json +1 -1
  409. package/eigen/COPYING.APACHE +0 -203
  410. package/eigen/COPYING.BSD +0 -26
  411. package/eigen/COPYING.GPL +0 -674
  412. package/eigen/COPYING.LGPL +0 -502
  413. package/eigen/COPYING.MINPACK +0 -51
  414. package/eigen/COPYING.MPL2 +0 -373
  415. package/eigen/COPYING.README +0 -18
  416. package/eigen/Eigen/src/Core/BooleanRedux.h +0 -162
  417. package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -258
  418. package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +0 -120
  419. package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +0 -694
  420. package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
  421. package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
  422. package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
  423. package/eigen/Eigen/src/misc/lapack.h +0 -152
  424. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -358
  425. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -696
  426. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
  427. package/eigen/Eigen/src/plugins/IndexedViewMethods.h +0 -262
  428. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
  429. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -95
  430. package/eigen/Eigen/src/plugins/ReshapedMethods.h +0 -149
  431. package/eigen/README.md +0 -5
@@ -16,73 +16,214 @@
16
16
  #ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
17
17
  #define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
18
18
 
19
+ // IWYU pragma: private
20
+ #include "../../InternalHeaderCheck.h"
21
+
19
22
  namespace Eigen {
20
23
  namespace internal {
21
24
 
22
25
  // Creates a Scalar integer type with same bit-width.
23
- template<typename T> struct make_integer;
24
- template<> struct make_integer<float> { typedef numext::int32_t type; };
25
- template<> struct make_integer<double> { typedef numext::int64_t type; };
26
- template<> struct make_integer<half> { typedef numext::int16_t type; };
27
- template<> struct make_integer<bfloat16> { typedef numext::int16_t type; };
28
-
29
- template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
30
- Packet pfrexp_generic_get_biased_exponent(const Packet& a) {
26
+ template <typename T>
27
+ struct make_integer;
28
+ template <>
29
+ struct make_integer<float> {
30
+ typedef numext::int32_t type;
31
+ };
32
+ template <>
33
+ struct make_integer<double> {
34
+ typedef numext::int64_t type;
35
+ };
36
+ template <>
37
+ struct make_integer<half> {
38
+ typedef numext::int16_t type;
39
+ };
40
+ template <>
41
+ struct make_integer<bfloat16> {
42
+ typedef numext::int16_t type;
43
+ };
44
+
45
+ /* polevl (modified for Eigen)
46
+ *
47
+ * Evaluate polynomial
48
+ *
49
+ *
50
+ *
51
+ * SYNOPSIS:
52
+ *
53
+ * int N;
54
+ * Scalar x, y, coef[N+1];
55
+ *
56
+ * y = polevl<decltype(x), N>( x, coef);
57
+ *
58
+ *
59
+ *
60
+ * DESCRIPTION:
61
+ *
62
+ * Evaluates polynomial of degree N:
63
+ *
64
+ *
65
+ * y = C_0 + C_1 x + C_2 x^2 + ... + C_N x^N
66
+ *
67
+ *
68
+ * Coefficients are stored in reverse order:
69
+ *
70
+ * coef[0] = C_N, ..., coef[N] = C_0.
71
+ *
72
+ *
73
+ * The function p1evl() assumes that coef[N] = 1.0 and is
74
+ * omitted from the array. Its calling arguments are
75
+ * otherwise the same as polevl().
76
+ *
77
+ *
78
+ * The Eigen implementation is templatized. For best speed, store
79
+ * coef as a const array (constexpr), e.g.
80
+ *
81
+ * const double coef[] = {1.0, 2.0, 3.0, ...};
82
+ *
83
+ */
84
+ template <typename Packet, int N>
85
+ struct ppolevl {
86
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
87
+ const typename unpacket_traits<Packet>::type coeff[]) {
88
+ EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
89
+ return pmadd(ppolevl<Packet, N - 1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
90
+ }
91
+ };
92
+
93
+ template <typename Packet>
94
+ struct ppolevl<Packet, 0> {
95
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
96
+ const typename unpacket_traits<Packet>::type coeff[]) {
97
+ EIGEN_UNUSED_VARIABLE(x);
98
+ return pset1<Packet>(coeff[0]);
99
+ }
100
+ };
101
+
102
+ /* chbevl (modified for Eigen)
103
+ *
104
+ * Evaluate Chebyshev series
105
+ *
106
+ *
107
+ *
108
+ * SYNOPSIS:
109
+ *
110
+ * int N;
111
+ * Scalar x, y, coef[N], chebevl();
112
+ *
113
+ * y = chbevl( x, coef, N );
114
+ *
115
+ *
116
+ *
117
+ * DESCRIPTION:
118
+ *
119
+ * Evaluates the series
120
+ *
121
+ *
122
+ *
123
+ * y = \sum'_{i=0}^{N-1} coef[i] T_i(x/2)
124
+ *
125
+ *
126
+ *
127
+ * of Chebyshev polynomials Ti at argument x/2.
128
+ *
129
+ * Coefficients are stored in reverse order, i.e. the zero
130
+ * order term is last in the array. Note N is the number of
131
+ * coefficients, not the order.
132
+ *
133
+ * If coefficients are for the interval a to b, x must
134
+ * have been transformed to x -> 2(2x - b - a)/(b-a) before
135
+ * entering the routine. This maps x from (a, b) to (-1, 1),
136
+ * over which the Chebyshev polynomials are defined.
137
+ *
138
+ * If the coefficients are for the inverted interval, in
139
+ * which (a, b) is mapped to (1/b, 1/a), the transformation
140
+ * required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity,
141
+ * this becomes x -> 4a/x - 1.
142
+ *
143
+ *
144
+ *
145
+ * SPEED:
146
+ *
147
+ * Taking advantage of the recurrence properties of the
148
+ * Chebyshev polynomials, the routine requires one more
149
+ * addition per loop than evaluating a nested polynomial of
150
+ * the same degree.
151
+ *
152
+ */
153
+
154
+ template <typename Packet, int N>
155
+ struct pchebevl {
156
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(Packet x,
157
+ const typename unpacket_traits<Packet>::type coef[]) {
158
+ typedef typename unpacket_traits<Packet>::type Scalar;
159
+ Packet b0 = pset1<Packet>(coef[0]);
160
+ Packet b1 = pset1<Packet>(static_cast<Scalar>(0.f));
161
+ Packet b2;
162
+
163
+ for (int i = 1; i < N; i++) {
164
+ b2 = b1;
165
+ b1 = b0;
166
+ b0 = psub(pmadd(x, b1, pset1<Packet>(coef[i])), b2);
167
+ }
168
+
169
+ return pmul(pset1<Packet>(static_cast<Scalar>(0.5f)), psub(b0, b2));
170
+ }
171
+ };
172
+
173
+ template <typename Packet>
174
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic_get_biased_exponent(const Packet& a) {
31
175
  typedef typename unpacket_traits<Packet>::type Scalar;
32
176
  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
33
- enum { mantissa_bits = numext::numeric_limits<Scalar>::digits - 1};
177
+ static constexpr int mantissa_bits = numext::numeric_limits<Scalar>::digits - 1;
34
178
  return pcast<PacketI, Packet>(plogical_shift_right<mantissa_bits>(preinterpret<PacketI>(pabs(a))));
35
179
  }
36
180
 
37
181
  // Safely applies frexp, correctly handles denormals.
38
182
  // Assumes IEEE floating point format.
39
- template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
40
- Packet pfrexp_generic(const Packet& a, Packet& exponent) {
183
+ template <typename Packet>
184
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet& a, Packet& exponent) {
41
185
  typedef typename unpacket_traits<Packet>::type Scalar;
42
186
  typedef typename make_unsigned<typename make_integer<Scalar>::type>::type ScalarUI;
43
- enum {
44
- TotalBits = sizeof(Scalar) * CHAR_BIT,
45
- MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
46
- ExponentBits = int(TotalBits) - int(MantissaBits) - 1
47
- };
48
-
49
- EIGEN_CONSTEXPR ScalarUI scalar_sign_mantissa_mask =
50
- ~(((ScalarUI(1) << int(ExponentBits)) - ScalarUI(1)) << int(MantissaBits)); // ~0x7f800000
51
- const Packet sign_mantissa_mask = pset1frombits<Packet>(static_cast<ScalarUI>(scalar_sign_mantissa_mask));
187
+ static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
188
+ ExponentBits = TotalBits - MantissaBits - 1;
189
+
190
+ constexpr ScalarUI scalar_sign_mantissa_mask =
191
+ ~(((ScalarUI(1) << ExponentBits) - ScalarUI(1)) << MantissaBits); // ~0x7f800000
192
+ const Packet sign_mantissa_mask = pset1frombits<Packet>(static_cast<ScalarUI>(scalar_sign_mantissa_mask));
52
193
  const Packet half = pset1<Packet>(Scalar(0.5));
53
194
  const Packet zero = pzero(a);
54
- const Packet normal_min = pset1<Packet>((numext::numeric_limits<Scalar>::min)()); // Minimum normal value, 2^-126
55
-
195
+ const Packet normal_min = pset1<Packet>((numext::numeric_limits<Scalar>::min)()); // Minimum normal value, 2^-126
196
+
56
197
  // To handle denormals, normalize by multiplying by 2^(int(MantissaBits)+1).
57
198
  const Packet is_denormal = pcmp_lt(pabs(a), normal_min);
58
- EIGEN_CONSTEXPR ScalarUI scalar_normalization_offset = ScalarUI(int(MantissaBits) + 1); // 24
199
+ constexpr ScalarUI scalar_normalization_offset = ScalarUI(MantissaBits + 1); // 24
59
200
  // The following cannot be constexpr because bfloat16(uint16_t) is not constexpr.
60
- const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset)); // 2^24
61
- const Packet normalization_factor = pset1<Packet>(scalar_normalization_factor);
201
+ const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset)); // 2^24
202
+ const Packet normalization_factor = pset1<Packet>(scalar_normalization_factor);
62
203
  const Packet normalized_a = pselect(is_denormal, pmul(a, normalization_factor), a);
63
-
204
+
64
205
  // Determine exponent offset: -126 if normal, -126-24 if denormal
65
- const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1)<<(int(ExponentBits)-1)) - ScalarUI(2)); // -126
206
+ const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1) << (ExponentBits - 1)) - ScalarUI(2)); // -126
66
207
  Packet exponent_offset = pset1<Packet>(scalar_exponent_offset);
67
- const Packet normalization_offset = pset1<Packet>(-Scalar(scalar_normalization_offset)); // -24
208
+ const Packet normalization_offset = pset1<Packet>(-Scalar(scalar_normalization_offset)); // -24
68
209
  exponent_offset = pselect(is_denormal, padd(exponent_offset, normalization_offset), exponent_offset);
69
-
210
+
70
211
  // Determine exponent and mantissa from normalized_a.
71
212
  exponent = pfrexp_generic_get_biased_exponent(normalized_a);
72
213
  // Zero, Inf and NaN return 'a' unmodified, exponent is zero
73
214
  // (technically the exponent is unspecified for inf/NaN, but GCC/Clang set it to zero)
74
- const Scalar scalar_non_finite_exponent = Scalar((ScalarUI(1) << int(ExponentBits)) - ScalarUI(1)); // 255
215
+ const Scalar scalar_non_finite_exponent = Scalar((ScalarUI(1) << ExponentBits) - ScalarUI(1)); // 255
75
216
  const Packet non_finite_exponent = pset1<Packet>(scalar_non_finite_exponent);
76
217
  const Packet is_zero_or_not_finite = por(pcmp_eq(a, zero), pcmp_eq(exponent, non_finite_exponent));
77
218
  const Packet m = pselect(is_zero_or_not_finite, a, por(pand(normalized_a, sign_mantissa_mask), half));
78
- exponent = pselect(is_zero_or_not_finite, zero, padd(exponent, exponent_offset));
219
+ exponent = pselect(is_zero_or_not_finite, zero, padd(exponent, exponent_offset));
79
220
  return m;
80
221
  }
81
222
 
82
223
  // Safely applies ldexp, correctly handles overflows, underflows and denormals.
83
224
  // Assumes IEEE floating point format.
84
- template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
85
- Packet pldexp_generic(const Packet& a, const Packet& exponent) {
225
+ template <typename Packet>
226
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet& a, const Packet& exponent) {
86
227
  // We want to return a * 2^exponent, allowing for all possible integer
87
228
  // exponents without overflowing or underflowing in intermediate
88
229
  // computations.
@@ -91,7 +232,7 @@ Packet pldexp_generic(const Packet& a, const Packet& exponent) {
91
232
  // to consider for a float is:
92
233
  // -255-23 -> 255+23
93
234
  // Below -278 any finite float 'a' will become zero, and above +278 any
94
- // finite float will become inf, including when 'a' is the smallest possible
235
+ // finite float will become inf, including when 'a' is the smallest possible
95
236
  // denormal.
96
237
  //
97
238
  // Unfortunately, 2^(278) cannot be represented using either one or two
@@ -108,25 +249,22 @@ Packet pldexp_generic(const Packet& a, const Packet& exponent) {
108
249
  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
109
250
  typedef typename unpacket_traits<Packet>::type Scalar;
110
251
  typedef typename unpacket_traits<PacketI>::type ScalarI;
111
- enum {
112
- TotalBits = sizeof(Scalar) * CHAR_BIT,
113
- MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
114
- ExponentBits = int(TotalBits) - int(MantissaBits) - 1
115
- };
116
-
117
- const Packet max_exponent = pset1<Packet>(Scalar((ScalarI(1)<<int(ExponentBits)) + ScalarI(int(MantissaBits) - 1))); // 278
118
- const PacketI bias = pset1<PacketI>((ScalarI(1)<<(int(ExponentBits)-1)) - ScalarI(1)); // 127
252
+ static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
253
+ ExponentBits = TotalBits - MantissaBits - 1;
254
+
255
+ const Packet max_exponent = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) + ScalarI(MantissaBits - 1))); // 278
256
+ const PacketI bias = pset1<PacketI>((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1)); // 127
119
257
  const PacketI e = pcast<Packet, PacketI>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
120
- PacketI b = parithmetic_shift_right<2>(e); // floor(e/4);
121
- Packet c = preinterpret<Packet>(plogical_shift_left<int(MantissaBits)>(padd(b, bias))); // 2^b
122
- Packet out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
123
- b = psub(psub(psub(e, b), b), b); // e - 3b
124
- c = preinterpret<Packet>(plogical_shift_left<int(MantissaBits)>(padd(b, bias))); // 2^(e-3*b)
258
+ PacketI b = parithmetic_shift_right<2>(e); // floor(e/4);
259
+ Packet c = preinterpret<Packet>(plogical_shift_left<MantissaBits>(padd(b, bias))); // 2^b
260
+ Packet out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
261
+ b = pnmadd(pset1<PacketI>(3), b, e); // e - 3b
262
+ c = preinterpret<Packet>(plogical_shift_left<MantissaBits>(padd(b, bias))); // 2^(e-3*b)
125
263
  out = pmul(out, c);
126
264
  return out;
127
265
  }
128
266
 
129
- // Explicitly multiplies
267
+ // Explicitly multiplies
130
268
  // a * (2^e)
131
269
  // clamping e to the range
132
270
  // [NumTraits<Scalar>::min_exponent()-2, NumTraits<Scalar>::max_exponent()]
@@ -135,27 +273,157 @@ Packet pldexp_generic(const Packet& a, const Packet& exponent) {
135
273
  // if 2^e doesn't fit into a normal floating-point Scalar.
136
274
  //
137
275
  // Assumes IEEE floating point format
138
- template<typename Packet>
139
- struct pldexp_fast_impl {
276
+ template <typename Packet>
277
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_fast(const Packet& a, const Packet& exponent) {
140
278
  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
141
279
  typedef typename unpacket_traits<Packet>::type Scalar;
142
280
  typedef typename unpacket_traits<PacketI>::type ScalarI;
143
- enum {
144
- TotalBits = sizeof(Scalar) * CHAR_BIT,
145
- MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
146
- ExponentBits = int(TotalBits) - int(MantissaBits) - 1
147
- };
148
-
149
- static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
150
- Packet run(const Packet& a, const Packet& exponent) {
151
- const Packet bias = pset1<Packet>(Scalar((ScalarI(1)<<(int(ExponentBits)-1)) - ScalarI(1))); // 127
152
- const Packet limit = pset1<Packet>(Scalar((ScalarI(1)<<int(ExponentBits)) - ScalarI(1))); // 255
153
- // restrict biased exponent between 0 and 255 for float.
154
- const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit)); // exponent + 127
155
- // return a * (2^e)
156
- return pmul(a, preinterpret<Packet>(plogical_shift_left<int(MantissaBits)>(e)));
157
- }
158
- };
281
+ static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
282
+ ExponentBits = TotalBits - MantissaBits - 1;
283
+
284
+ const Packet bias = pset1<Packet>(Scalar((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1))); // 127
285
+ const Packet limit = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) - ScalarI(1))); // 255
286
+ // restrict biased exponent between 0 and 255 for float.
287
+ const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit)); // exponent + 127
288
+ // return a * (2^e)
289
+ return pmul(a, preinterpret<Packet>(plogical_shift_left<MantissaBits>(e)));
290
+ }
291
+
292
+ // This function implements a single step of Halley's iteration for
293
+ // computing x = y^(1/3):
294
+ // x_{k+1} = x_k - (x_k^3 - y) x_k / (2x_k^3 + y)
295
+ template <typename Packet>
296
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_halley_iteration_step(const Packet& x_k,
297
+ const Packet& y) {
298
+ typedef typename unpacket_traits<Packet>::type Scalar;
299
+ Packet x_k_cb = pmul(x_k, pmul(x_k, x_k));
300
+ Packet denom = pmadd(pset1<Packet>(Scalar(2)), x_k_cb, y);
301
+ Packet num = psub(x_k_cb, y);
302
+ Packet r = pdiv(num, denom);
303
+ return pnmadd(x_k, r, x_k);
304
+ }
305
+
306
+ // Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
307
+ // interval [0.125,1].
308
+ template <typename Packet>
309
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_decompose(const Packet& x, Packet& e_div3) {
310
+ typedef typename unpacket_traits<Packet>::type Scalar;
311
+ // Extract the significand s in the range [0.5,1) and exponent e, such that
312
+ // x = 2^e * s.
313
+ Packet e, s;
314
+ s = pfrexp(x, e);
315
+
316
+ // Split the exponent into a part divisible by 3 and the remainder.
317
+ // e = 3*e_div3 + e_mod3.
318
+ constexpr Scalar kOneThird = Scalar(1) / 3;
319
+ e_div3 = pceil(pmul(e, pset1<Packet>(kOneThird)));
320
+ Packet e_mod3 = pnmadd(pset1<Packet>(Scalar(3)), e_div3, e);
321
+
322
+ // Replace s by y = (s * 2^e_mod3).
323
+ return pldexp_fast(s, e_mod3);
324
+ }
325
+
326
+ template <typename Packet>
327
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_special_cases_and_sign(const Packet& x,
328
+ const Packet& abs_root) {
329
+ typedef typename unpacket_traits<Packet>::type Scalar;
330
+
331
+ // Set sign.
332
+ const Packet sign_mask = pset1<Packet>(Scalar(-0.0));
333
+ const Packet x_sign = pand(sign_mask, x);
334
+ Packet root = por(x_sign, abs_root);
335
+
336
+ // Pass non-finite and zero values of x straight through.
337
+ const Packet is_not_finite = por(pisinf(x), pisnan(x));
338
+ const Packet is_zero = pcmp_eq(pzero(x), x);
339
+ const Packet use_x = por(is_not_finite, is_zero);
340
+ return pselect(use_x, x, root);
341
+ }
342
+
343
+ // Generic implementation of cbrt(x) for float.
344
+ //
345
+ // The algorithm computes the cubic root of the input by first
346
+ // decomposing it into an exponent and significand
347
+ // x = s * 2^e.
348
+ //
349
+ // We can then write the cube root as
350
+ //
351
+ // x^(1/3) = 2^(e/3) * s^(1/3)
352
+ // = 2^((3*e_div3 + e_mod3)/3) * s^(1/3)
353
+ // = 2^(e_div3) * 2^(e_mod3/3) * s^(1/3)
354
+ // = 2^(e_div3) * (s * 2^e_mod3)^(1/3)
355
+ //
356
+ // where e_div3 = ceil(e/3) and e_mod3 = e - 3*e_div3.
357
+ //
358
+ // The cube root of the second term y = (s * 2^e_mod3)^(1/3) is coarsely
359
+ // approximated using a cubic polynomial and subsequently refined using a
360
+ // single step of Halley's iteration, and finally the two terms are combined
361
+ // using pldexp_fast.
362
+ //
363
+ // Note: Many alternatives exist for implementing cbrt. See, for example,
364
+ // the excellent discussion in Kahan's note:
365
+ // https://csclub.uwaterloo.ca/~pbarfuss/qbrt.pdf
366
+ // This particular implementation was found to be very fast and accurate
367
+ // among several alternatives tried, but is probably not "optimal" on all
368
+ // platforms.
369
+ //
370
+ // This is accurate to 2 ULP.
371
+ template <typename Packet>
372
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_float(const Packet& x) {
373
+ typedef typename unpacket_traits<Packet>::type Scalar;
374
+ static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
375
+
376
+ // Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
377
+ // interval [0.125,1].
378
+ Packet e_div3;
379
+ const Packet y = cbrt_decompose(pabs(x), e_div3);
380
+
381
+ // Compute initial approximation accurate to 5.22e-3.
382
+ // The polynomial was computed using Rminimax.
383
+ constexpr float alpha[] = {5.9220016002655029296875e-01f, -1.3859539031982421875e+00f, 1.4581282138824462890625e+00f,
384
+ 3.408401906490325927734375e-01f};
385
+ Packet r = ppolevl<Packet, 3>::run(y, alpha);
386
+
387
+ // Take one step of Halley's iteration.
388
+ r = cbrt_halley_iteration_step(r, y);
389
+
390
+ // Finally multiply by 2^(e_div3)
391
+ r = pldexp_fast(r, e_div3);
392
+
393
+ return cbrt_special_cases_and_sign(x, r);
394
+ }
395
+
396
+ // Generic implementation of cbrt(x) for double.
397
+ //
398
+ // The algorithm is identical to the one for float except that a different initial
399
+ // approximation is used for y^(1/3) and two Halley iteration steps are performed.
400
+ //
401
+ // This is accurate to 1 ULP.
402
+ template <typename Packet>
403
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_double(const Packet& x) {
404
+ typedef typename unpacket_traits<Packet>::type Scalar;
405
+ static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
406
+
407
+ // Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
408
+ // interval [0.125,1].
409
+ Packet e_div3;
410
+ const Packet y = cbrt_decompose(pabs(x), e_div3);
411
+
412
+ // Compute initial approximation accurate to 0.016.
413
+ // The polynomial was computed using Rminimax.
414
+ constexpr double alpha[] = {-4.69470621553356115551736138513660989701747894287109375e-01,
415
+ 1.072314636518546304699839311069808900356292724609375e+00,
416
+ 3.81249427609571867048288140722434036433696746826171875e-01};
417
+ Packet r = ppolevl<Packet, 2>::run(y, alpha);
418
+
419
+ // Take two steps of Halley's iteration.
420
+ r = cbrt_halley_iteration_step(r, y);
421
+ r = cbrt_halley_iteration_step(r, y);
422
+
423
+ // Finally multiply by 2^(e_div3).
424
+ r = pldexp_fast(r, e_div3);
425
+ return cbrt_special_cases_and_sign(x, r);
426
+ }
159
427
 
160
428
  // Natural or base 2 logarithm.
161
429
  // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
@@ -164,37 +432,15 @@ struct pldexp_fast_impl {
164
432
  // TODO(gonnet): Further reduce the interval allowing for lower-degree
165
433
  // polynomial interpolants -> ... -> profit!
166
434
  template <typename Packet, bool base2>
167
- EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
168
- EIGEN_UNUSED
169
- Packet plog_impl_float(const Packet _x)
170
- {
171
- Packet x = _x;
172
-
173
- const Packet cst_1 = pset1<Packet>(1.0f);
174
- const Packet cst_neg_half = pset1<Packet>(-0.5f);
175
- // The smallest non denormalized float number.
176
- const Packet cst_min_norm_pos = pset1frombits<Packet>( 0x00800000u);
177
- const Packet cst_minus_inf = pset1frombits<Packet>( 0xff800000u);
178
- const Packet cst_pos_inf = pset1frombits<Packet>( 0x7f800000u);
435
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_float(const Packet _x) {
436
+ const Packet cst_1 = pset1<Packet>(1.0f);
437
+ const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0xff800000u));
438
+ const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x7f800000u));
179
439
 
180
- // Polynomial coefficients.
181
440
  const Packet cst_cephes_SQRTHF = pset1<Packet>(0.707106781186547524f);
182
- const Packet cst_cephes_log_p0 = pset1<Packet>(7.0376836292E-2f);
183
- const Packet cst_cephes_log_p1 = pset1<Packet>(-1.1514610310E-1f);
184
- const Packet cst_cephes_log_p2 = pset1<Packet>(1.1676998740E-1f);
185
- const Packet cst_cephes_log_p3 = pset1<Packet>(-1.2420140846E-1f);
186
- const Packet cst_cephes_log_p4 = pset1<Packet>(+1.4249322787E-1f);
187
- const Packet cst_cephes_log_p5 = pset1<Packet>(-1.6668057665E-1f);
188
- const Packet cst_cephes_log_p6 = pset1<Packet>(+2.0000714765E-1f);
189
- const Packet cst_cephes_log_p7 = pset1<Packet>(-2.4999993993E-1f);
190
- const Packet cst_cephes_log_p8 = pset1<Packet>(+3.3333331174E-1f);
191
-
192
- // Truncate input values to the minimum positive normal.
193
- x = pmax(x, cst_min_norm_pos);
194
-
195
- Packet e;
441
+ Packet e, x;
196
442
  // extract significant in the range [0.5,1) and exponent
197
- x = pfrexp(x,e);
443
+ x = pfrexp(_x, e);
198
444
 
199
445
  // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
200
446
  // and shift by -1. The values are then centered around 0, which improves
@@ -209,24 +455,15 @@ Packet plog_impl_float(const Packet _x)
209
455
  e = psub(e, pand(cst_1, mask));
210
456
  x = padd(x, tmp);
211
457
 
212
- Packet x2 = pmul(x, x);
213
- Packet x3 = pmul(x2, x);
214
-
215
- // Evaluate the polynomial approximant of degree 8 in three parts, probably
216
- // to improve instruction-level parallelism.
217
- Packet y, y1, y2;
218
- y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
219
- y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
220
- y2 = pmadd(cst_cephes_log_p6, x, cst_cephes_log_p7);
221
- y = pmadd(y, x, cst_cephes_log_p2);
222
- y1 = pmadd(y1, x, cst_cephes_log_p5);
223
- y2 = pmadd(y2, x, cst_cephes_log_p8);
224
- y = pmadd(y, x3, y1);
225
- y = pmadd(y, x3, y2);
226
- y = pmul(y, x3);
458
+ // Polynomial coefficients for rational r(x) = p(x)/q(x)
459
+ // approximating log(1+x) on [sqrt(0.5)-1;sqrt(2)-1].
460
+ constexpr float alpha[] = {0.18256296349849254f, 1.0000000190281063f, 1.0000000190281136f};
461
+ constexpr float beta[] = {0.049616247954120038f, 0.59923249590823520f, 1.4999999999999927f, 1.0f};
227
462
 
228
- y = pmadd(cst_neg_half, x2, y);
229
- x = padd(x, y);
463
+ Packet p = ppolevl<Packet, 2>::run(x, alpha);
464
+ p = pmul(x, p);
465
+ Packet q = ppolevl<Packet, 3>::run(x, beta);
466
+ x = pdiv(p, q);
230
467
 
231
468
  // Add the logarithm of the exponent back to the result of the interpolation.
232
469
  if (base2) {
@@ -238,29 +475,22 @@ Packet plog_impl_float(const Packet _x)
238
475
  }
239
476
 
240
477
  Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
241
- Packet iszero_mask = pcmp_eq(_x,pzero(_x));
242
- Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
478
+ Packet iszero_mask = pcmp_eq(_x, pzero(_x));
479
+ Packet pos_inf_mask = pcmp_eq(_x, cst_pos_inf);
243
480
  // Filter out invalid inputs, i.e.:
244
481
  // - negative arg will be NAN
245
482
  // - 0 will be -INF
246
483
  // - +INF will be +INF
247
- return pselect(iszero_mask, cst_minus_inf,
248
- por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
484
+ return pselect(iszero_mask, cst_minus_inf, por(pselect(pos_inf_mask, cst_pos_inf, x), invalid_mask));
249
485
  }
250
486
 
251
487
  template <typename Packet>
252
- EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
253
- EIGEN_UNUSED
254
- Packet plog_float(const Packet _x)
255
- {
488
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_float(const Packet _x) {
256
489
  return plog_impl_float<Packet, /* base2 */ false>(_x);
257
490
  }
258
491
 
259
492
  template <typename Packet>
260
- EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
261
- EIGEN_UNUSED
262
- Packet plog2_float(const Packet _x)
263
- {
493
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Packet _x) {
264
494
  return plog_impl_float<Packet, /* base2 */ true>(_x);
265
495
  }
266
496
 
@@ -274,22 +504,16 @@ Packet plog2_float(const Packet _x)
274
504
  * for more detail see: http://www.netlib.org/cephes/
275
505
  */
276
506
  template <typename Packet, bool base2>
277
- EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
278
- EIGEN_UNUSED
279
- Packet plog_impl_double(const Packet _x)
280
- {
507
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_double(const Packet _x) {
281
508
  Packet x = _x;
282
509
 
283
- const Packet cst_1 = pset1<Packet>(1.0);
284
- const Packet cst_neg_half = pset1<Packet>(-0.5);
285
- // The smallest non denormalized double.
286
- const Packet cst_min_norm_pos = pset1frombits<Packet>( static_cast<uint64_t>(0x0010000000000000ull));
287
- const Packet cst_minus_inf = pset1frombits<Packet>( static_cast<uint64_t>(0xfff0000000000000ull));
288
- const Packet cst_pos_inf = pset1frombits<Packet>( static_cast<uint64_t>(0x7ff0000000000000ull));
289
-
510
+ const Packet cst_1 = pset1<Packet>(1.0);
511
+ const Packet cst_neg_half = pset1<Packet>(-0.5);
512
+ const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
513
+ const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));
290
514
 
291
- // Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x)
292
- // 1/sqrt(2) <= x < sqrt(2)
515
+ // Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x)
516
+ // 1/sqrt(2) <= x < sqrt(2)
293
517
  const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
294
518
  const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
295
519
  const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
@@ -305,13 +529,10 @@ Packet plog_impl_double(const Packet _x)
305
529
  const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
306
530
  const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
307
531
 
308
- // Truncate input values to the minimum positive normal.
309
- x = pmax(x, cst_min_norm_pos);
310
-
311
532
  Packet e;
312
533
  // extract significant in the range [0.5,1) and exponent
313
- x = pfrexp(x,e);
314
-
534
+ x = pfrexp(x, e);
535
+
315
536
  // Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
316
537
  // and shift by -1. The values are then centered around 0, which improves
317
538
  // the stability of the polynomial evaluation.
@@ -331,20 +552,20 @@ Packet plog_impl_double(const Packet _x)
331
552
  // Evaluate the polynomial approximant , probably to improve instruction-level parallelism.
332
553
  // y = x - 0.5*x^2 + x^3 * polevl( x, P, 5 ) / p1evl( x, Q, 5 ) );
333
554
  Packet y, y1, y_;
334
- y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
555
+ y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
335
556
  y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
336
- y = pmadd(y, x, cst_cephes_log_p2);
557
+ y = pmadd(y, x, cst_cephes_log_p2);
337
558
  y1 = pmadd(y1, x, cst_cephes_log_p5);
338
559
  y_ = pmadd(y, x3, y1);
339
560
 
340
- y = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1);
561
+ y = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1);
341
562
  y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
342
- y = pmadd(y, x, cst_cephes_log_q2);
563
+ y = pmadd(y, x, cst_cephes_log_q2);
343
564
  y1 = pmadd(y1, x, cst_cephes_log_q5);
344
- y = pmadd(y, x3, y1);
565
+ y = pmadd(y, x3, y1);
345
566
 
346
567
  y_ = pmul(y_, x3);
347
- y = pdiv(y_, y);
568
+ y = pdiv(y_, y);
348
569
 
349
570
  y = pmadd(cst_neg_half, x2, y);
350
571
  x = padd(x, y);
@@ -359,38 +580,30 @@ Packet plog_impl_double(const Packet _x)
359
580
  }
360
581
 
361
582
  Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
362
- Packet iszero_mask = pcmp_eq(_x,pzero(_x));
363
- Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
583
+ Packet iszero_mask = pcmp_eq(_x, pzero(_x));
584
+ Packet pos_inf_mask = pcmp_eq(_x, cst_pos_inf);
364
585
  // Filter out invalid inputs, i.e.:
365
586
  // - negative arg will be NAN
366
587
  // - 0 will be -INF
367
588
  // - +INF will be +INF
368
- return pselect(iszero_mask, cst_minus_inf,
369
- por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
589
+ return pselect(iszero_mask, cst_minus_inf, por(pselect(pos_inf_mask, cst_pos_inf, x), invalid_mask));
370
590
  }
371
591
 
372
592
  template <typename Packet>
373
- EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
374
- EIGEN_UNUSED
375
- Packet plog_double(const Packet _x)
376
- {
593
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_double(const Packet _x) {
377
594
  return plog_impl_double<Packet, /* base2 */ false>(_x);
378
595
  }
379
596
 
380
597
  template <typename Packet>
381
- EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
382
- EIGEN_UNUSED
383
- Packet plog2_double(const Packet _x)
384
- {
598
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_double(const Packet _x) {
385
599
  return plog_impl_double<Packet, /* base2 */ true>(_x);
386
600
  }
387
601
 
388
602
  /** \internal \returns log(1 + x) computed using W. Kahan's formula.
389
603
  See: http://www.plunk.org/~hatch/rightway.php
390
604
  */
391
- template<typename Packet>
392
- Packet generic_plog1p(const Packet& x)
393
- {
605
+ template <typename Packet>
606
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p(const Packet& x) {
394
607
  typedef typename unpacket_traits<Packet>::type ScalarType;
395
608
  const Packet one = pset1<Packet>(ScalarType(1));
396
609
  Packet xp1 = padd(x, one);
@@ -404,9 +617,8 @@ Packet generic_plog1p(const Packet& x)
404
617
  /** \internal \returns exp(x)-1 computed using W. Kahan's formula.
405
618
  See: http://www.plunk.org/~hatch/rightway.php
406
619
  */
407
- template<typename Packet>
408
- Packet generic_expm1(const Packet& x)
409
- {
620
+ template <typename Packet>
621
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_expm1(const Packet& x) {
410
622
  typedef typename unpacket_traits<Packet>::type ScalarType;
411
623
  const Packet one = pset1<Packet>(ScalarType(1));
412
624
  const Packet neg_one = pset1<Packet>(ScalarType(-1));
@@ -422,37 +634,32 @@ Packet generic_expm1(const Packet& x)
422
634
  Packet pos_inf_mask = pcmp_eq(logu, u);
423
635
  Packet expm1 = pmul(u_minus_one, pdiv(x, logu));
424
636
  expm1 = pselect(pos_inf_mask, u, expm1);
425
- return pselect(one_mask,
426
- x,
427
- pselect(neg_one_mask,
428
- neg_one,
429
- expm1));
637
+ return pselect(one_mask, x, pselect(neg_one_mask, neg_one, expm1));
430
638
  }
431
639
 
432
-
433
640
  // Exponential function. Works by writing "x = m*log(2) + r" where
434
641
  // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
435
642
  // "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
643
+ // exp(r) is computed using a 6th order minimax polynomial approximation.
436
644
  template <typename Packet>
437
- EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
438
- EIGEN_UNUSED
439
- Packet pexp_float(const Packet _x)
440
- {
441
- const Packet cst_1 = pset1<Packet>(1.0f);
442
- const Packet cst_half = pset1<Packet>(0.5f);
443
- const Packet cst_exp_hi = pset1<Packet>( 88.723f);
444
- const Packet cst_exp_lo = pset1<Packet>(-88.723f);
645
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Packet _x) {
646
+ const Packet cst_zero = pset1<Packet>(0.0f);
647
+ const Packet cst_one = pset1<Packet>(1.0f);
648
+ const Packet cst_half = pset1<Packet>(0.5f);
649
+ const Packet cst_exp_hi = pset1<Packet>(88.723f);
650
+ const Packet cst_exp_lo = pset1<Packet>(-104.f);
651
+ const Packet cst_pldexp_threshold = pset1<Packet>(87.0);
445
652
 
446
653
  const Packet cst_cephes_LOG2EF = pset1<Packet>(1.44269504088896341f);
447
- const Packet cst_cephes_exp_p0 = pset1<Packet>(1.9875691500E-4f);
448
- const Packet cst_cephes_exp_p1 = pset1<Packet>(1.3981999507E-3f);
449
- const Packet cst_cephes_exp_p2 = pset1<Packet>(8.3334519073E-3f);
450
- const Packet cst_cephes_exp_p3 = pset1<Packet>(4.1665795894E-2f);
451
- const Packet cst_cephes_exp_p4 = pset1<Packet>(1.6666665459E-1f);
452
- const Packet cst_cephes_exp_p5 = pset1<Packet>(5.0000001201E-1f);
654
+ const Packet cst_p2 = pset1<Packet>(0.49999988079071044921875f);
655
+ const Packet cst_p3 = pset1<Packet>(0.16666518151760101318359375f);
656
+ const Packet cst_p4 = pset1<Packet>(4.166965186595916748046875e-2f);
657
+ const Packet cst_p5 = pset1<Packet>(8.36894474923610687255859375e-3f);
658
+ const Packet cst_p6 = pset1<Packet>(1.37449637986719608306884765625e-3f);
453
659
 
454
660
  // Clamp x.
455
- Packet x = pmax(pmin(_x, cst_exp_hi), cst_exp_lo);
661
+ Packet zero_mask = pcmp_lt(_x, cst_exp_lo);
662
+ Packet x = pmin(_x, cst_exp_hi);
456
663
 
457
664
  // Express exp(x) as exp(m*ln(2) + r), start by extracting
458
665
  // m = floor(x/ln(2) + 0.5).
@@ -466,38 +673,37 @@ Packet pexp_float(const Packet _x)
466
673
  Packet r = pmadd(m, cst_cephes_exp_C1, x);
467
674
  r = pmadd(m, cst_cephes_exp_C2, r);
468
675
 
469
- Packet r2 = pmul(r, r);
470
- Packet r3 = pmul(r2, r);
471
-
472
- // Evaluate the polynomial approximant,improved by instruction-level parallelism.
473
- Packet y, y1, y2;
474
- y = pmadd(cst_cephes_exp_p0, r, cst_cephes_exp_p1);
475
- y1 = pmadd(cst_cephes_exp_p3, r, cst_cephes_exp_p4);
476
- y2 = padd(r, cst_1);
477
- y = pmadd(y, r, cst_cephes_exp_p2);
478
- y1 = pmadd(y1, r, cst_cephes_exp_p5);
479
- y = pmadd(y, r3, y1);
480
- y = pmadd(y, r2, y2);
676
+ // Evaluate the 6th order polynomial approximation to exp(r)
677
+ // with r in the interval [-ln(2)/2;ln(2)/2].
678
+ const Packet r2 = pmul(r, r);
679
+ Packet p_even = pmadd(r2, cst_p6, cst_p4);
680
+ const Packet p_odd = pmadd(r2, cst_p5, cst_p3);
681
+ p_even = pmadd(r2, p_even, cst_p2);
682
+ const Packet p_low = padd(r, cst_one);
683
+ Packet y = pmadd(r, p_odd, p_even);
684
+ y = pmadd(r2, y, p_low);
481
685
 
482
686
  // Return 2^m * exp(r).
483
- // TODO: replace pldexp with faster implementation since y in [-1, 1).
484
- return pmax(pldexp(y,m), _x);
687
+ const Packet fast_pldexp_unsafe = pcmp_lt(cst_pldexp_threshold, pabs(x));
688
+ if (!predux_any(fast_pldexp_unsafe)) {
689
+ // For |x| <= 87, we know the result is not zero or inf, and we can safely use
690
+ // the fast version of pldexp.
691
+ return pmax(pldexp_fast(y, m), _x);
692
+ }
693
+ return pselect(zero_mask, cst_zero, pmax(pldexp(y, m), _x));
485
694
  }
486
695
 
487
696
  template <typename Packet>
488
- EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
489
- EIGEN_UNUSED
490
- Packet pexp_double(const Packet _x)
491
- {
697
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Packet _x) {
492
698
  Packet x = _x;
493
-
699
+ const Packet cst_zero = pset1<Packet>(0.0);
494
700
  const Packet cst_1 = pset1<Packet>(1.0);
495
701
  const Packet cst_2 = pset1<Packet>(2.0);
496
702
  const Packet cst_half = pset1<Packet>(0.5);
497
703
 
498
704
  const Packet cst_exp_hi = pset1<Packet>(709.784);
499
- const Packet cst_exp_lo = pset1<Packet>(-709.784);
500
-
705
+ const Packet cst_exp_lo = pset1<Packet>(-745.519);
706
+ const Packet cst_pldexp_threshold = pset1<Packet>(708.0);
501
707
  const Packet cst_cephes_LOG2EF = pset1<Packet>(1.4426950408889634073599);
502
708
  const Packet cst_cephes_exp_p0 = pset1<Packet>(1.26177193074810590878e-4);
503
709
  const Packet cst_cephes_exp_p1 = pset1<Packet>(3.02994407707441961300e-2);
@@ -512,7 +718,8 @@ Packet pexp_double(const Packet _x)
512
718
  Packet tmp, fx;
513
719
 
514
720
  // clamp x
515
- x = pmax(pmin(x, cst_exp_hi), cst_exp_lo);
721
+ Packet zero_mask = pcmp_lt(_x, cst_exp_lo);
722
+ x = pmin(x, cst_exp_hi);
516
723
  // Express exp(x) as exp(g + n*log(2)).
517
724
  fx = pmadd(cst_cephes_LOG2EF, x, cst_half);
518
725
 
@@ -549,8 +756,13 @@ Packet pexp_double(const Packet _x)
549
756
 
550
757
  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
551
758
  // non-finite values in the input.
552
- // TODO: replace pldexp with faster implementation since x in [-1, 1).
553
- return pmax(pldexp(x,fx), _x);
759
+ const Packet fast_pldexp_unsafe = pcmp_lt(cst_pldexp_threshold, pabs(_x));
760
+ if (!predux_any(fast_pldexp_unsafe)) {
761
+ // For |x| <= 708, we know the result is not zero or inf, and we can safely use
762
+ // the fast version of pldexp.
763
+ return pmax(pldexp_fast(x, fx), _x);
764
+ }
765
+ return pselect(zero_mask, cst_zero, pmax(pldexp(x, fx), _x));
554
766
  }
555
767
 
556
768
  // The following code is inspired by the following stack-overflow answer:
@@ -562,29 +774,22 @@ Packet pexp_double(const Packet _x)
562
774
  // aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi.
563
775
  // - Avoid a branch in rounding and extraction of the remaining fractional part.
564
776
  // Overall, I measured a speed up higher than x2 on x86-64.
565
- inline float trig_reduce_huge (float xf, int *quadrant)
566
- {
777
+ inline float trig_reduce_huge(float xf, Eigen::numext::int32_t* quadrant) {
567
778
  using Eigen::numext::int32_t;
568
- using Eigen::numext::uint32_t;
569
779
  using Eigen::numext::int64_t;
780
+ using Eigen::numext::uint32_t;
570
781
  using Eigen::numext::uint64_t;
571
782
 
572
- const double pio2_62 = 3.4061215800865545e-19; // pi/2 * 2^-62
573
- const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point foramt
783
+ const double pio2_62 = 3.4061215800865545e-19; // pi/2 * 2^-62
784
+ const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point format
574
785
 
575
786
  // 192 bits of 2/pi for Payne-Hanek reduction
576
787
  // Bits are introduced by packet of 8 to enable aligned reads.
577
- static const uint32_t two_over_pi [] =
578
- {
579
- 0x00000028, 0x000028be, 0x0028be60, 0x28be60db,
580
- 0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a,
581
- 0x91054a7f, 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4,
582
- 0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770,
583
- 0x4d377036, 0x377036d8, 0x7036d8a5, 0x36d8a566,
584
- 0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410,
585
- 0x10e41000, 0xe4100000
586
- };
587
-
788
+ static const uint32_t two_over_pi[] = {
789
+ 0x00000028, 0x000028be, 0x0028be60, 0x28be60db, 0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a, 0x91054a7f,
790
+ 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4, 0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770, 0x4d377036, 0x377036d8,
791
+ 0x7036d8a5, 0x36d8a566, 0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410, 0x10e41000, 0xe4100000};
792
+
588
793
  uint32_t xi = numext::bit_cast<uint32_t>(xf);
589
794
  // Below, -118 = -126 + 8.
590
795
  // -126 is to get the exponent,
@@ -592,12 +797,12 @@ inline float trig_reduce_huge (float xf, int *quadrant)
592
797
  // This is possible because the fractional part of x has only 24 meaningful bits.
593
798
  uint32_t e = (xi >> 23) - 118;
594
799
  // Extract the mantissa and shift it to align it wrt the exponent
595
- xi = ((xi & 0x007fffffu)| 0x00800000u) << (e & 0x7);
800
+ xi = ((xi & 0x007fffffu) | 0x00800000u) << (e & 0x7);
596
801
 
597
802
  uint32_t i = e >> 3;
598
- uint32_t twoopi_1 = two_over_pi[i-1];
599
- uint32_t twoopi_2 = two_over_pi[i+3];
600
- uint32_t twoopi_3 = two_over_pi[i+7];
803
+ uint32_t twoopi_1 = two_over_pi[i - 1];
804
+ uint32_t twoopi_2 = two_over_pi[i + 3];
805
+ uint32_t twoopi_3 = two_over_pi[i + 7];
601
806
 
602
807
  // Compute x * 2/pi in 2.62-bit fixed-point format.
603
808
  uint64_t p;
@@ -612,46 +817,45 @@ inline float trig_reduce_huge (float xf, int *quadrant)
612
817
  // since we have p=x/(pi/2) with high accuracy, we can more efficiently compute r as:
613
818
  // r = (p-q)*pi/2,
614
819
  // where the product can be carried out with sufficient accuracy using double precision.
615
- p -= q<<62;
820
+ p -= q << 62;
616
821
  return float(double(int64_t(p)) * pio2_62);
617
822
  }
618
823
 
619
- template<bool ComputeSine,typename Packet>
824
+ template <bool ComputeSine, typename Packet, bool ComputeBoth = false>
620
825
  EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
621
- EIGEN_UNUSED
622
- #if EIGEN_GNUC_AT_LEAST(4,4) && EIGEN_COMP_GNUC_STRICT
623
- __attribute__((optimize("-fno-unsafe-math-optimizations")))
826
+ #if EIGEN_COMP_GNUC_STRICT
827
+ __attribute__((optimize("-fno-unsafe-math-optimizations")))
624
828
  #endif
625
- Packet psincos_float(const Packet& _x)
626
- {
829
+ Packet
830
+ psincos_float(const Packet& _x) {
627
831
  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
628
832
 
629
- const Packet cst_2oPI = pset1<Packet>(0.636619746685028076171875f); // 2/PI
630
- const Packet cst_rounding_magic = pset1<Packet>(12582912); // 2^23 for rounding
631
- const PacketI csti_1 = pset1<PacketI>(1);
632
- const Packet cst_sign_mask = pset1frombits<Packet>(0x80000000u);
833
+ const Packet cst_2oPI = pset1<Packet>(0.636619746685028076171875f); // 2/PI
834
+ const Packet cst_rounding_magic = pset1<Packet>(12582912); // 2^23 for rounding
835
+ const PacketI csti_1 = pset1<PacketI>(1);
836
+ const Packet cst_sign_mask = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x80000000u));
633
837
 
634
838
  Packet x = pabs(_x);
635
839
 
636
840
  // Scale x by 2/Pi to find x's quadrant.
637
841
  Packet y = pmul(x, cst_2oPI);
638
842
 
639
- // Rounding trick:
843
+ // Rounding trick to find nearest integer:
640
844
  Packet y_round = padd(y, cst_rounding_magic);
641
845
  EIGEN_OPTIMIZATION_BARRIER(y_round)
642
- PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
643
- y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi
846
+ PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
847
+ y = psub(y_round, cst_rounding_magic); // nearest integer to x * (2/pi)
644
848
 
645
- // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4
646
- // using "Extended precision modular arithmetic"
647
- #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
648
- // This version requires true FMA for high accuracy
849
+ // Subtract y * Pi/2 to reduce x to the interval -Pi/4 <= x <= +Pi/4
850
+ // using "Extended precision modular arithmetic"
851
+ #if defined(EIGEN_VECTORIZE_FMA)
852
+ // This version requires true FMA for high accuracy.
649
853
  // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
650
854
  const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
651
855
  x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
652
856
  x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
653
857
  x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
654
- #else
858
+ #else
655
859
  // Without true FMA, the previous set of coefficients maintain 1ULP accuracy
656
860
  // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7.
657
861
  // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs.
@@ -659,41 +863,38 @@ Packet psincos_float(const Packet& _x)
659
863
  // The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively.
660
864
  // and 2 ULP up to:
661
865
  const float huge_th = ComputeSine ? 25966.f : 18838.f;
662
- x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
866
+ x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
663
867
  EIGEN_OPTIMIZATION_BARRIER(x)
664
- x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
868
+ x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
665
869
  EIGEN_OPTIMIZATION_BARRIER(x)
666
- x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
667
- x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee
668
-
669
- // For the record, the following set of coefficients maintain 2ULP up
670
- // to a slightly larger range:
671
- // const float huge_th = ComputeSine ? 51981.f : 39086.125f;
672
- // but it slightly fails to maintain 1ULP for two values of sin below pi.
673
- // x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
674
- // x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
675
- // x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
676
- // x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
677
-
678
- // For the record, with only 3 iterations it is possible to maintain
679
- // 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
680
- // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
681
- #endif
682
-
683
- if(predux_any(pcmp_le(pset1<Packet>(huge_th),pabs(_x))))
684
- {
870
+ x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
871
+ x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee
872
+
873
+ // For the record, the following set of coefficients maintain 2ULP up
874
+ // to a slightly larger range:
875
+ // const float huge_th = ComputeSine ? 51981.f : 39086.125f;
876
+ // but it slightly fails to maintain 1ULP for two values of sin below pi.
877
+ // x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
878
+ // x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
879
+ // x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
880
+ // x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
881
+
882
+ // For the record, with only 3 iterations it is possible to maintain
883
+ // 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
884
+ // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
885
+ #endif
886
+
887
+ if (predux_any(pcmp_le(pset1<Packet>(huge_th), pabs(_x)))) {
685
888
  const int PacketSize = unpacket_traits<Packet>::size;
686
889
  EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize];
687
890
  EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize];
688
- EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) int y_int2[PacketSize];
891
+ EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Eigen::numext::int32_t y_int2[PacketSize];
689
892
  pstoreu(vals, pabs(_x));
690
893
  pstoreu(x_cpy, x);
691
894
  pstoreu(y_int2, y_int);
692
- for(int k=0; k<PacketSize;++k)
693
- {
895
+ for (int k = 0; k < PacketSize; ++k) {
694
896
  float val = vals[k];
695
- if(val>=huge_th && (numext::isfinite)(val))
696
- x_cpy[k] = trig_reduce_huge(val,&y_int2[k]);
897
+ if (val >= huge_th && (numext::isfinite)(val)) x_cpy[k] = trig_reduce_huge(val, &y_int2[k]);
697
898
  }
698
899
  x = ploadu<Packet>(x_cpy);
699
900
  y_int = ploadu<PacketI>(y_int2);
@@ -703,19 +904,19 @@ Packet psincos_float(const Packet& _x)
703
904
  // sin: sign = second_bit(y_int) xor signbit(_x)
704
905
  // cos: sign = second_bit(y_int+1)
705
906
  Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)))
706
- : preinterpret<Packet>(plogical_shift_left<30>(padd(y_int,csti_1)));
707
- sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
907
+ : preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
908
+ sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
708
909
 
709
910
  // Get the polynomial selection mask from the lowest bit of y_int
710
911
  // We'll calculate both (sin and cos) polynomials and then select from the two.
711
912
  Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
712
913
 
713
- Packet x2 = pmul(x,x);
914
+ Packet x2 = pmul(x, x);
714
915
 
715
916
  // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4)
716
- Packet y1 = pset1<Packet>(2.4372266125283204019069671630859375e-05f);
717
- y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f ));
718
- y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f ));
917
+ Packet y1 = pset1<Packet>(2.4372266125283204019069671630859375e-05f);
918
+ y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f));
919
+ y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f));
719
920
  y1 = pmadd(y1, x2, pset1<Packet>(-0.5f));
720
921
  y1 = pmadd(y1, x2, pset1<Packet>(1.f));
721
922
 
@@ -727,66 +928,646 @@ Packet psincos_float(const Packet& _x)
727
928
  // c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1
728
929
  // printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1))
729
930
  //
730
- Packet y2 = pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
731
- y2 = pmadd(y2, x2, pset1<Packet>( 0.0083326873655616851693794799871284340042620897293090820312500000f));
931
+ Packet y2 = pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
932
+ y2 = pmadd(y2, x2, pset1<Packet>(0.0083326873655616851693794799871284340042620897293090820312500000f));
732
933
  y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f));
733
934
  y2 = pmul(y2, x2);
734
935
  y2 = pmadd(y2, x, x);
735
936
 
736
937
  // Select the correct result from the two polynomials.
737
- y = ComputeSine ? pselect(poly_mask,y2,y1)
738
- : pselect(poly_mask,y1,y2);
739
-
938
+ if (ComputeBoth) {
939
+ Packet peven = peven_mask(x);
940
+ Packet ysin = pselect(poly_mask, y2, y1);
941
+ Packet ycos = pselect(poly_mask, y1, y2);
942
+ Packet sign_bit_sin = pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)));
943
+ Packet sign_bit_cos = preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
944
+ sign_bit_sin = pand(sign_bit_sin, cst_sign_mask); // clear all but left most bit
945
+ sign_bit_cos = pand(sign_bit_cos, cst_sign_mask); // clear all but left most bit
946
+ y = pselect(peven, pxor(ysin, sign_bit_sin), pxor(ycos, sign_bit_cos));
947
+ } else {
948
+ y = ComputeSine ? pselect(poly_mask, y2, y1) : pselect(poly_mask, y1, y2);
949
+ y = pxor(y, sign_bit);
950
+ }
740
951
  // Update the sign and filter huge inputs
741
- return pxor(y, sign_bit);
952
+ return y;
742
953
  }
743
954
 
744
- template<typename Packet>
745
- EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
746
- EIGEN_UNUSED
747
- Packet psin_float(const Packet& x)
748
- {
955
+ template <typename Packet>
956
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_float(const Packet& x) {
749
957
  return psincos_float<true>(x);
750
958
  }
751
959
 
752
- template<typename Packet>
753
- EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
754
- EIGEN_UNUSED
755
- Packet pcos_float(const Packet& x)
756
- {
960
+ template <typename Packet>
961
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x) {
757
962
  return psincos_float<false>(x);
758
963
  }
759
964
 
965
+ // Trigonometric argument reduction for double for inputs smaller than 15.
966
+ // Reduces trigonometric arguments for double inputs where x < 15. Given an argument x and its corresponding quadrant
967
+ // count n, the function computes and returns the reduced argument t such that x = n * pi/2 + t.
968
+ template <typename Packet>
969
+ Packet trig_reduce_small_double(const Packet& x, const Packet& q) {
970
+ // Pi/2 split into 2 values
971
+ const Packet cst_pio2_a = pset1<Packet>(-1.570796325802803);
972
+ const Packet cst_pio2_b = pset1<Packet>(-9.920935184482005e-10);
973
+
974
+ Packet t;
975
+ t = pmadd(cst_pio2_a, q, x);
976
+ t = pmadd(cst_pio2_b, q, t);
977
+ return t;
978
+ }
979
+
980
+ // Trigonometric argument reduction for double for inputs smaller than 1e14.
981
+ // Reduces trigonometric arguments for double inputs where x < 1e14. Given an argument x and its corresponding quadrant
982
+ // count n, the function computes and returns the reduced argument t such that x = n * pi/2 + t.
983
+ template <typename Packet>
984
+ Packet trig_reduce_medium_double(const Packet& x, const Packet& q_high, const Packet& q_low) {
985
+ // Pi/2 split into 4 values
986
+ const Packet cst_pio2_a = pset1<Packet>(-1.570796325802803);
987
+ const Packet cst_pio2_b = pset1<Packet>(-9.920935184482005e-10);
988
+ const Packet cst_pio2_c = pset1<Packet>(-6.123234014771656e-17);
989
+ const Packet cst_pio2_d = pset1<Packet>(1.903488962019325e-25);
990
+
991
+ Packet t;
992
+ t = pmadd(cst_pio2_a, q_high, x);
993
+ t = pmadd(cst_pio2_a, q_low, t);
994
+ t = pmadd(cst_pio2_b, q_high, t);
995
+ t = pmadd(cst_pio2_b, q_low, t);
996
+ t = pmadd(cst_pio2_c, q_high, t);
997
+ t = pmadd(cst_pio2_c, q_low, t);
998
+ t = pmadd(cst_pio2_d, padd(q_low, q_high), t);
999
+ return t;
1000
+ }
760
1001
 
761
- template<typename Packet>
1002
+ template <bool ComputeSine, typename Packet, bool ComputeBoth = false>
762
1003
  EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
763
- EIGEN_UNUSED
764
- Packet psqrt_complex(const Packet& a) {
765
- typedef typename unpacket_traits<Packet>::type Scalar;
766
- typedef typename Scalar::value_type RealScalar;
767
- typedef typename unpacket_traits<Packet>::as_real RealPacket;
1004
+ #if EIGEN_COMP_GNUC_STRICT
1005
+ __attribute__((optimize("-fno-unsafe-math-optimizations")))
1006
+ #endif
1007
+ Packet
1008
+ psincos_double(const Packet& x) {
1009
+ typedef typename unpacket_traits<Packet>::integer_packet PacketI;
1010
+ typedef typename unpacket_traits<PacketI>::type ScalarI;
768
1011
 
769
- // Computes the principal sqrt of the complex numbers in the input.
770
- //
771
- // For example, for packets containing 2 complex numbers stored in interleaved format
772
- // a = [a0, a1] = [x0, y0, x1, y1],
773
- // where x0 = real(a0), y0 = imag(a0) etc., this function returns
774
- // b = [b0, b1] = [u0, v0, u1, v1],
775
- // such that b0^2 = a0, b1^2 = a1.
776
- //
777
- // To derive the formula for the complex square roots, let's consider the equation for
778
- // a single complex square root of the number x + i*y. We want to find real numbers
779
- // u and v such that
780
- // (u + i*v)^2 = x + i*y <=>
781
- // u^2 - v^2 + i*2*u*v = x + i*y.
782
- // By equating the real and imaginary parts we get:
783
- // u^2 - v^2 = x
784
- // 2*u*v = y.
785
- //
786
- // For x >= 0, this has the numerically stable solution
787
- // u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
788
- // v = 0.5 * (y / u)
789
- // and for x < 0,
1012
+ const Packet cst_sign_mask = pset1frombits<Packet>(static_cast<Eigen::numext::uint64_t>(0x8000000000000000u));
1013
+
1014
+ // If the argument is smaller than this value, use a simpler argument reduction
1015
+ const double small_th = 15;
1016
+ // If the argument is bigger than this value, use the non-vectorized std version
1017
+ const double huge_th = 1e14;
1018
+
1019
+ const Packet cst_2oPI = pset1<Packet>(0.63661977236758134307553505349006); // 2/PI
1020
+ // Integer Packet constants
1021
+ const PacketI cst_one = pset1<PacketI>(ScalarI(1));
1022
+ // Constant for splitting
1023
+ const Packet cst_split = pset1<Packet>(1 << 24);
1024
+
1025
+ Packet x_abs = pabs(x);
1026
+
1027
+ // Scale x by 2/Pi
1028
+ PacketI q_int;
1029
+ Packet s;
1030
+
1031
+ // TODO Implement huge angle argument reduction
1032
+ if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(small_th), x_abs)))) {
1033
+ Packet q_high = pmul(pfloor(pmul(x_abs, pdiv(cst_2oPI, cst_split))), cst_split);
1034
+ Packet q_low_noround = psub(pmul(x_abs, cst_2oPI), q_high);
1035
+ q_int = pcast<Packet, PacketI>(padd(q_low_noround, pset1<Packet>(0.5)));
1036
+ Packet q_low = pcast<PacketI, Packet>(q_int);
1037
+ s = trig_reduce_medium_double(x_abs, q_high, q_low);
1038
+ } else {
1039
+ Packet qval_noround = pmul(x_abs, cst_2oPI);
1040
+ q_int = pcast<Packet, PacketI>(padd(qval_noround, pset1<Packet>(0.5)));
1041
+ Packet q = pcast<PacketI, Packet>(q_int);
1042
+ s = trig_reduce_small_double(x_abs, q);
1043
+ }
1044
+
1045
+ // All the upcoming approximating polynomials have even exponents
1046
+ Packet ss = pmul(s, s);
1047
+
1048
+ // Padé approximant of cos(x)
1049
+ // Assuring < 1 ULP error on the interval [-pi/4, pi/4]
1050
+ // cos(x) ~= (80737373*x^8 - 13853547000*x^6 + 727718024880*x^4 - 11275015752000*x^2 + 23594700729600)/(147173*x^8 +
1051
+ // 39328920*x^6 + 5772800880*x^4 + 522334612800*x^2 + 23594700729600)
1052
+ // MATLAB code to compute those coefficients:
1053
+ // syms x;
1054
+ // cosf = @(x) cos(x);
1055
+ // pade_cosf = pade(cosf(x), x, 0, 'Order', 8)
1056
+ Packet sc1_num = pmadd(ss, pset1<Packet>(80737373), pset1<Packet>(-13853547000));
1057
+ Packet sc2_num = pmadd(sc1_num, ss, pset1<Packet>(727718024880));
1058
+ Packet sc3_num = pmadd(sc2_num, ss, pset1<Packet>(-11275015752000));
1059
+ Packet sc4_num = pmadd(sc3_num, ss, pset1<Packet>(23594700729600));
1060
+ Packet sc1_denum = pmadd(ss, pset1<Packet>(147173), pset1<Packet>(39328920));
1061
+ Packet sc2_denum = pmadd(sc1_denum, ss, pset1<Packet>(5772800880));
1062
+ Packet sc3_denum = pmadd(sc2_denum, ss, pset1<Packet>(522334612800));
1063
+ Packet sc4_denum = pmadd(sc3_denum, ss, pset1<Packet>(23594700729600));
1064
+ Packet scos = pdiv(sc4_num, sc4_denum);
1065
+
1066
+ // Padé approximant of sin(x)
1067
+ // Assuring < 1 ULP error on the interval [-pi/4, pi/4]
1068
+ // sin(x) ~= (x*(4585922449*x^8 - 1066023933480*x^6 + 83284044283440*x^4 - 2303682236856000*x^2 +
1069
+ // 15605159573203200))/(45*(1029037*x^8 + 345207016*x^6 + 61570292784*x^4 + 6603948711360*x^2 + 346781323848960))
1070
+ // MATLAB code to compute those coefficients:
1071
+ // syms x;
1072
+ // sinf = @(x) sin(x);
1073
+ // pade_sinf = pade(sinf(x), x, 0, 'Order', 8, 'OrderMode', 'relative')
1074
+ Packet ss1_num = pmadd(ss, pset1<Packet>(4585922449), pset1<Packet>(-1066023933480));
1075
+ Packet ss2_num = pmadd(ss1_num, ss, pset1<Packet>(83284044283440));
1076
+ Packet ss3_num = pmadd(ss2_num, ss, pset1<Packet>(-2303682236856000));
1077
+ Packet ss4_num = pmadd(ss3_num, ss, pset1<Packet>(15605159573203200));
1078
+ Packet ss1_denum = pmadd(ss, pset1<Packet>(1029037), pset1<Packet>(345207016));
1079
+ Packet ss2_denum = pmadd(ss1_denum, ss, pset1<Packet>(61570292784));
1080
+ Packet ss3_denum = pmadd(ss2_denum, ss, pset1<Packet>(6603948711360));
1081
+ Packet ss4_denum = pmadd(ss3_denum, ss, pset1<Packet>(346781323848960));
1082
+ Packet ssin = pdiv(pmul(s, ss4_num), pmul(pset1<Packet>(45), ss4_denum));
1083
+
1084
+ Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(q_int, cst_one), pzero(q_int)));
1085
+
1086
+ Packet sign_sin = pxor(x, preinterpret<Packet>(plogical_shift_left<62>(q_int)));
1087
+ Packet sign_cos = preinterpret<Packet>(plogical_shift_left<62>(padd(q_int, cst_one)));
1088
+ Packet sign_bit, sFinalRes;
1089
+ if (ComputeBoth) {
1090
+ Packet peven = peven_mask(x);
1091
+ sign_bit = pselect((s), sign_sin, sign_cos);
1092
+ sFinalRes = pselect(pxor(peven, poly_mask), ssin, scos);
1093
+ } else {
1094
+ sign_bit = ComputeSine ? sign_sin : sign_cos;
1095
+ sFinalRes = ComputeSine ? pselect(poly_mask, ssin, scos) : pselect(poly_mask, scos, ssin);
1096
+ }
1097
+ sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
1098
+ sFinalRes = pxor(sFinalRes, sign_bit);
1099
+
1100
+ // If the input values are higher than the largest value that the argument reduction can currently address, compute them
1101
+ // using std::sin and std::cos
1102
+ // TODO Remove it when huge angle argument reduction is implemented
1103
+ if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(huge_th), x_abs)))) {
1104
+ const int PacketSize = unpacket_traits<Packet>::size;
1105
+ EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) double sincos_vals[PacketSize];
1106
+ EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) double x_cpy[PacketSize];
1107
+ pstoreu(x_cpy, x);
1108
+ pstoreu(sincos_vals, sFinalRes);
1109
+ for (int k = 0; k < PacketSize; ++k) {
1110
+ double val = x_cpy[k];
1111
+ if (std::abs(val) > huge_th && (numext::isfinite)(val)) {
1112
+ if (ComputeBoth)
1113
+ sincos_vals[k] = k % 2 == 0 ? std::sin(val) : std::cos(val);
1114
+ else
1115
+ sincos_vals[k] = ComputeSine ? std::sin(val) : std::cos(val);
1116
+ }
1117
+ }
1118
+ sFinalRes = ploadu<Packet>(sincos_vals);
1119
+ }
1120
+ return sFinalRes;
1121
+ }
1122
+
1123
+ template <typename Packet>
1124
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Packet& x) {
1125
+ return psincos_double<true>(x);
1126
+ }
1127
+
1128
+ template <typename Packet>
1129
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_double(const Packet& x) {
1130
+ return psincos_double<false>(x);
1131
+ }
1132
+
1133
+ // Generic implementation of acos(x).
1134
+ template <typename Packet>
1135
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos_float(const Packet& x_in) {
1136
+ typedef typename unpacket_traits<Packet>::type Scalar;
1137
+ static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
1138
+
1139
+ const Packet cst_one = pset1<Packet>(Scalar(1));
1140
+ const Packet cst_pi = pset1<Packet>(Scalar(EIGEN_PI));
1141
+ const Packet p6 = pset1<Packet>(Scalar(2.36423197202384471893310546875e-3));
1142
+ const Packet p5 = pset1<Packet>(Scalar(-1.1368644423782825469970703125e-2));
1143
+ const Packet p4 = pset1<Packet>(Scalar(2.717843465507030487060546875e-2));
1144
+ const Packet p3 = pset1<Packet>(Scalar(-4.8969544470310211181640625e-2));
1145
+ const Packet p2 = pset1<Packet>(Scalar(8.8804088532924652099609375e-2));
1146
+ const Packet p1 = pset1<Packet>(Scalar(-0.214591205120086669921875));
1147
+ const Packet p0 = pset1<Packet>(Scalar(1.57079637050628662109375));
1148
+
1149
+ // For x in [0:1], we approximate acos(x)/sqrt(1-x), which is a smooth
1150
+ // function, by a 6'th order polynomial.
1151
+ // For x in [-1:0) we use that acos(-x) = pi - acos(x).
1152
+ const Packet neg_mask = psignbit(x_in);
1153
+ const Packet abs_x = pabs(x_in);
1154
+
1155
+ // Evaluate the polynomial using Horner's rule:
1156
+ // P(x) = p0 + x * (p1 + x * (p2 + ... (p5 + x * p6)) ... ) .
1157
+ // We evaluate even and odd terms independently to increase
1158
+ // instruction level parallelism.
1159
+ Packet x2 = pmul(x_in, x_in);
1160
+ Packet p_even = pmadd(p6, x2, p4);
1161
+ Packet p_odd = pmadd(p5, x2, p3);
1162
+ p_even = pmadd(p_even, x2, p2);
1163
+ p_odd = pmadd(p_odd, x2, p1);
1164
+ p_even = pmadd(p_even, x2, p0);
1165
+ Packet p = pmadd(p_odd, abs_x, p_even);
1166
+
1167
+ // The polynomial approximates acos(x)/sqrt(1-x), so
1168
+ // multiply by sqrt(1-x) to get acos(x).
1169
+ // Conveniently returns NaN for arguments outside [-1:1].
1170
+ Packet denom = psqrt(psub(cst_one, abs_x));
1171
+ Packet result = pmul(denom, p);
1172
+ // Undo mapping for negative arguments.
1173
+ return pselect(neg_mask, psub(cst_pi, result), result);
1174
+ }
1175
+
1176
+ // Generic implementation of asin(x).
1177
+ template <typename Packet>
1178
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Packet& x_in) {
1179
+ typedef typename unpacket_traits<Packet>::type Scalar;
1180
+ static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
1181
+
1182
+ constexpr float kPiOverTwo = static_cast<float>(EIGEN_PI / 2);
1183
+
1184
+ const Packet cst_half = pset1<Packet>(0.5f);
1185
+ const Packet cst_one = pset1<Packet>(1.0f);
1186
+ const Packet cst_two = pset1<Packet>(2.0f);
1187
+ const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
1188
+
1189
+ const Packet abs_x = pabs(x_in);
1190
+ const Packet sign_mask = pandnot(x_in, abs_x);
1191
+ const Packet invalid_mask = pcmp_lt(cst_one, abs_x);
1192
+
1193
+ // For arguments |x| > 0.5, we map x back to [0:0.5] using
1194
+ // the transformation x_large = sqrt(0.5*(1-x)), and use the
1195
+ // identity
1196
+ // asin(x) = pi/2 - 2 * asin( sqrt( 0.5 * (1 - x)))
1197
+
1198
+ const Packet x_large = psqrt(pnmadd(cst_half, abs_x, cst_half));
1199
+ const Packet large_mask = pcmp_lt(cst_half, abs_x);
1200
+ const Packet x = pselect(large_mask, x_large, abs_x);
1201
+ const Packet x2 = pmul(x, x);
1202
+
1203
+ // For |x| < 0.5 approximate asin(x)/x by an 8th order polynomial with
1204
+ // even terms only.
1205
+ constexpr float alpha[] = {5.08838854730129241943359375e-2f, 3.95139865577220916748046875e-2f,
1206
+ 7.550220191478729248046875e-2f, 0.16664917767047882080078125f, 1.00000011920928955078125f};
1207
+ Packet p = ppolevl<Packet, 4>::run(x2, alpha);
1208
+ p = pmul(p, x);
1209
+
1210
+ const Packet p_large = pnmadd(cst_two, p, cst_pi_over_two);
1211
+ p = pselect(large_mask, p_large, p);
1212
+ // Flip the sign for negative arguments.
1213
+ p = pxor(p, sign_mask);
1214
+ // Return NaN for arguments outside [-1:1].
1215
+ return por(invalid_mask, p);
1216
+ }
1217
+
1218
+ template <typename Scalar>
1219
+ struct patan_reduced {
1220
+ template <typename Packet>
1221
+ static EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet run(const Packet& x);
1222
+ };
1223
+
1224
+ template <>
1225
+ template <typename Packet>
1226
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced<double>::run(const Packet& x) {
1227
+ constexpr double alpha[] = {2.6667153866462208e-05, 3.0917513112462781e-03, 5.2574296781008604e-02,
1228
+ 3.0409318473444424e-01, 7.5365702534987022e-01, 8.2704055405494614e-01,
1229
+ 3.3004361289279920e-01};
1230
+
1231
+ constexpr double beta[] = {
1232
+ 2.7311202462436667e-04, 1.0899150928962708e-02, 1.1548932646420353e-01, 4.9716458728465573e-01, 1.0,
1233
+ 9.3705509168587852e-01, 3.3004361289279920e-01};
1234
+
1235
+ Packet x2 = pmul(x, x);
1236
+ Packet p = ppolevl<Packet, 6>::run(x2, alpha);
1237
+ Packet q = ppolevl<Packet, 6>::run(x2, beta);
1238
+ return pmul(x, pdiv(p, q));
1239
+ }
1240
+
1241
+ // Computes elementwise atan(x) for x in [-1:1] with 2 ulp accuracy.
1242
+ template <>
1243
+ template <typename Packet>
1244
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced<float>::run(const Packet& x) {
1245
+ constexpr float alpha[] = {1.12026982009410858154296875e-01f, 7.296695709228515625e-01f, 8.109951019287109375e-01f};
1246
+
1247
+ constexpr float beta[] = {1.00917108356952667236328125e-02f, 2.8318560123443603515625e-01f, 1.0f,
1248
+ 8.109951019287109375e-01f};
1249
+
1250
+ Packet x2 = pmul(x, x);
1251
+ Packet p = ppolevl<Packet, 2>::run(x2, alpha);
1252
+ Packet q = ppolevl<Packet, 3>::run(x2, beta);
1253
+ return pmul(x, pdiv(p, q));
1254
+ }
1255
+
1256
+ template <typename Packet>
1257
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_atan(const Packet& x_in) {
1258
+ typedef typename unpacket_traits<Packet>::type Scalar;
1259
+
1260
+ constexpr Scalar kPiOverTwo = static_cast<Scalar>(EIGEN_PI / 2);
1261
+
1262
+ const Packet cst_signmask = pset1<Packet>(Scalar(-0.0));
1263
+ const Packet cst_one = pset1<Packet>(Scalar(1));
1264
+ const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
1265
+
1266
+ // "Large": For |x| > 1, use atan(1/x) = sign(x)*pi/2 - atan(x).
1267
+ // "Small": For |x| <= 1, approximate atan(x) directly by a polynomial
1268
+ // calculated using Rminimax.
1269
+
1270
+ const Packet abs_x = pabs(x_in);
1271
+ const Packet x_signmask = pand(x_in, cst_signmask);
1272
+ const Packet large_mask = pcmp_lt(cst_one, abs_x);
1273
+ const Packet x = pselect(large_mask, preciprocal(abs_x), abs_x);
1274
+ const Packet p = patan_reduced<Scalar>::run(x);
1275
+ // Apply transformations according to the range reduction masks.
1276
+ Packet result = pselect(large_mask, psub(cst_pi_over_two, p), p);
1277
+ // Return correct sign
1278
+ return pxor(result, x_signmask);
1279
+ }
1280
+
1281
+ /** \internal \returns the hyperbolic tan of \a a (coeff-wise)
1282
+ Doesn't do anything fancy, just a 9/8-degree rational interpolant which
1283
+ is accurate up to a couple of ulps in the (approximate) range [-8, 8],
1284
+ outside of which tanh(x) = +/-1 in single precision. The input is clamped
1285
+ to the range [-c, c]. The value c is chosen as the smallest value where
1286
+ the approximation evaluates to exactly 1.
1287
+
1288
+ This implementation works on both scalars and packets.
1289
+ */
1290
+ template <typename T>
1291
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_float(const T& a_x) {
1292
+ // Clamp the inputs to the range [-c, c] and set everything
1293
+ // outside that range to 1.0. The value c is chosen as the smallest
1294
+ // floating point argument such that the approximation is exactly 1.
1295
+ // This saves clamping the value at the end.
1296
+ #ifdef EIGEN_VECTORIZE_FMA
1297
+ const T plus_clamp = pset1<T>(8.01773357391357422f);
1298
+ const T minus_clamp = pset1<T>(-8.01773357391357422f);
1299
+ #else
1300
+ const T plus_clamp = pset1<T>(7.90738964080810547f);
1301
+ const T minus_clamp = pset1<T>(-7.90738964080810547f);
1302
+ #endif
1303
+ const T x = pmax(pmin(a_x, plus_clamp), minus_clamp);
1304
+
1305
+ // The following rational approximation was generated by rminimax
1306
+ // (https://gitlab.inria.fr/sfilip/rminimax) using the following
1307
+ // command:
1308
+ // $ ratapprox --function="tanh(x)" --dom='[-8.67,8.67]' --num="odd"
1309
+ // --den="even" --type="[9,8]" --numF="[SG]" --denF="[SG]" --log
1310
+ // --output=tanhf.sollya --dispCoeff="dec"
1311
+
1312
+ // The monomial coefficients of the numerator polynomial (odd).
1313
+ constexpr float alpha[] = {1.394553628e-8f, 2.102733560e-5f, 3.520756727e-3f, 1.340216100e-1f};
1314
+
1315
+ // The monomial coefficients of the denominator polynomial (even).
1316
+ constexpr float beta[] = {8.015776984e-7f, 3.326951409e-4f, 2.597254514e-2f, 4.673548340e-1f, 1.0f};
1317
+
1318
+ // Since the polynomials are odd/even, we need x^2.
1319
+ const T x2 = pmul(x, x);
1320
+ const T x3 = pmul(x2, x);
1321
+
1322
+ T p = ppolevl<T, 3>::run(x2, alpha);
1323
+ T q = ppolevl<T, 4>::run(x2, beta);
1324
+ // Take advantage of the fact that the constant term in p is 1 to compute
1325
+ // x*(x^2*p + 1) = x^3 * p + x.
1326
+ p = pmadd(x3, p, x);
1327
+
1328
+ // Divide the numerator by the denominator.
1329
+ return pdiv(p, q);
1330
+ }
1331
+
1332
+ /** \internal \returns the hyperbolic tan of \a a (coeff-wise)
1333
+ This uses a 19/18-degree rational interpolant which
1334
+ is accurate up to a couple of ulps in the (approximate) range [-18.7, 18.7],
1335
+ outside of which tanh(x) = +/-1 in single precision. The input is clamped
1336
+ to the range [-c, c]. The value c is chosen as the smallest value where
1337
+ the approximation evaluates to exactly 1.
1338
+
1339
+ This implementation works on both scalars and packets.
1340
+ */
1341
+ template <typename T>
1342
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_double(const T& a_x) {
1343
+ // Clamp the inputs to the range [-c, c] and set everything
1344
+ // outside that range to 1.0. The value c is chosen as the smallest
1345
+ // floating point argument such that the approximation is exactly 1.
1346
+ // This saves clamping the value at the end.
1347
+ #ifdef EIGEN_VECTORIZE_FMA
1348
+ const T plus_clamp = pset1<T>(17.6610191624600077);
1349
+ const T minus_clamp = pset1<T>(-17.6610191624600077);
1350
+ #else
1351
+ const T plus_clamp = pset1<T>(17.714196154005176);
1352
+ const T minus_clamp = pset1<T>(-17.714196154005176);
1353
+ #endif
1354
+ const T x = pmax(pmin(a_x, plus_clamp), minus_clamp);
1355
+
1356
+ // The following rational approximation was generated by rminimax
1357
+ // (https://gitlab.inria.fr/sfilip/rminimax) using the following
1358
+ // command:
1359
+ // $ ./ratapprox --function="tanh(x)" --dom='[-18.72,18.72]'
1360
+ // --num="odd" --den="even" --type="[19,18]" --numF="[D]"
1361
+ // --denF="[D]" --log --output=tanh.sollya --dispCoeff="dec"
1362
+
1363
+ // The monomial coefficients of the numerator polynomial (odd).
1364
+ constexpr double alpha[] = {2.6158007860482230e-23, 7.6534862268749319e-19, 3.1309488231386680e-15,
1365
+ 4.2303918148209176e-12, 2.4618379131293676e-09, 6.8644367682497074e-07,
1366
+ 9.3839087674268880e-05, 5.9809711724441161e-03, 1.5184719640284322e-01};
1367
+
1368
+ // The monomial coefficients of the denominator polynomial (even).
1369
+ constexpr double beta[] = {6.463747022670968018e-21, 5.782506856739003571e-17,
1370
+ 1.293019623712687916e-13, 1.123643448069621992e-10,
1371
+ 4.492975677839633985e-08, 8.785185266237658698e-06,
1372
+ 8.295161192716231542e-04, 3.437448108450402717e-02,
1373
+ 4.851805297361760360e-01, 1.0};
1374
+
1375
+ // Since the polynomials are odd/even, we need x^2.
1376
+ const T x2 = pmul(x, x);
1377
+ const T x3 = pmul(x2, x);
1378
+
1379
+ // Interleave the evaluation of the numerator polynomial p and
1380
+ // denominator polynomial q.
1381
+ T p = ppolevl<T, 8>::run(x2, alpha);
1382
+ T q = ppolevl<T, 9>::run(x2, beta);
1383
+ // Take advantage of the fact that the constant term in p is 1 to compute
1384
+ // x*(x^2*p + 1) = x^3 * p + x.
1385
+ p = pmadd(x3, p, x);
1386
+
1387
+ // Divide the numerator by the denominator.
1388
+ return pdiv(p, q);
1389
+ }
1390
+
1391
+ template <typename Packet>
1392
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_float(const Packet& x) {
1393
+ typedef typename unpacket_traits<Packet>::type Scalar;
1394
+ static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
1395
+
1396
+ // For |x| in [0:0.5] we use a polynomial approximation of the form
1397
+ // P(x) = x + x^3*(alpha[4] + x^2 * (alpha[3] + x^2 * (... x^2 * alpha[0]) ... )).
1398
+ constexpr float alpha[] = {0.1819281280040740966796875f, 8.2311116158962249755859375e-2f,
1399
+ 0.14672131836414337158203125f, 0.1997792422771453857421875f, 0.3333373963832855224609375f};
1400
+ const Packet x2 = pmul(x, x);
1401
+ const Packet x3 = pmul(x, x2);
1402
+ Packet p = ppolevl<Packet, 4>::run(x2, alpha);
1403
+ p = pmadd(x3, p, x);
1404
+
1405
+ // For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x));
1406
+ const Packet half = pset1<Packet>(0.5f);
1407
+ const Packet one = pset1<Packet>(1.0f);
1408
+ Packet r = pdiv(padd(one, x), psub(one, x));
1409
+ r = pmul(half, plog(r));
1410
+
1411
+ const Packet x_gt_half = pcmp_le(half, pabs(x));
1412
+ const Packet x_eq_one = pcmp_eq(one, pabs(x));
1413
+ const Packet x_gt_one = pcmp_lt(one, pabs(x));
1414
+ const Packet sign_mask = pset1<Packet>(-0.0f);
1415
+ const Packet x_sign = pand(sign_mask, x);
1416
+ const Packet inf = pset1<Packet>(std::numeric_limits<float>::infinity());
1417
+ return por(x_gt_one, pselect(x_eq_one, por(x_sign, inf), pselect(x_gt_half, r, p)));
1418
+ }
1419
+
1420
+ template <typename Packet>
1421
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_double(const Packet& x) {
1422
+ typedef typename unpacket_traits<Packet>::type Scalar;
1423
+ static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
1424
+ // For x in [-0.5:0.5] we use a rational approximation of the form
1425
+ // R(x) = x + x^3*P(x^2)/Q(x^2), where P is of order 4 and Q is of order 5.
1426
+ constexpr double alpha[] = {3.3071338469301391e-03, -4.7129526768798737e-02, 1.8185306179826699e-01,
1427
+ -2.5949536095445679e-01, 1.2306328729812676e-01};
1428
+
1429
+ constexpr double beta[] = {-3.8679974580640881e-03, 7.6391885763341910e-02, -4.2828141436397615e-01,
1430
+ 9.8733495886883648e-01, -1.0000000000000000e+00, 3.6918986189438030e-01};
1431
+
1432
+ const Packet x2 = pmul(x, x);
1433
+ const Packet x3 = pmul(x, x2);
1434
+ Packet p = ppolevl<Packet, 4>::run(x2, alpha);
1435
+ Packet q = ppolevl<Packet, 5>::run(x2, beta);
1436
+ Packet y_small = pmadd(x3, pdiv(p, q), x);
1437
+
1438
+ // For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x));
1439
+ const Packet half = pset1<Packet>(0.5);
1440
+ const Packet one = pset1<Packet>(1.0);
1441
+ Packet y_large = pdiv(padd(one, x), psub(one, x));
1442
+ y_large = pmul(half, plog(y_large));
1443
+
1444
+ const Packet x_gt_half = pcmp_le(half, pabs(x));
1445
+ const Packet x_eq_one = pcmp_eq(one, pabs(x));
1446
+ const Packet x_gt_one = pcmp_lt(one, pabs(x));
1447
+ const Packet sign_mask = pset1<Packet>(-0.0);
1448
+ const Packet x_sign = pand(sign_mask, x);
1449
+ const Packet inf = pset1<Packet>(std::numeric_limits<double>::infinity());
1450
+ return por(x_gt_one, pselect(x_eq_one, por(x_sign, inf), pselect(x_gt_half, y_large, y_small)));
1451
+ }
1452
+
1453
+ template <typename Packet>
1454
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pdiv_complex(const Packet& x, const Packet& y) {
1455
+ typedef typename unpacket_traits<Packet>::as_real RealPacket;
1456
+ // In the following we annotate the code for the case where the inputs
1457
+ // are a pair of length-2 SIMD vectors representing a single pair of complex
1458
+ // numbers x = a + i*b, y = c + i*d.
1459
+ const RealPacket y_abs = pabs(y.v); // |c|, |d|
1460
+ const RealPacket y_abs_flip = pcplxflip(Packet(y_abs)).v; // |d|, |c|
1461
+ const RealPacket y_max = pmax(y_abs, y_abs_flip); // max(|c|, |d|), max(|c|, |d|)
1462
+ const RealPacket y_scaled = pdiv(y.v, y_max); // c / max(|c|, |d|), d / max(|c|, |d|)
1463
+ // Compute scaled denominator.
1464
+ const RealPacket y_scaled_sq = pmul(y_scaled, y_scaled); // c'**2, d'**2
1465
+ const RealPacket denom = padd(y_scaled_sq, pcplxflip(Packet(y_scaled_sq)).v);
1466
+ Packet result_scaled = pmul(x, pconj(Packet(y_scaled))); // a * c' + b * d', -a * d' + b * c'
1467
+ // Divide elementwise by denom.
1468
+ result_scaled = Packet(pdiv(result_scaled.v, denom));
1469
+ // Rescale result
1470
+ return Packet(pdiv(result_scaled.v, y_max));
1471
+ }
1472
+
1473
+ template <typename Packet>
1474
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_complex(const Packet& x) {
1475
+ typedef typename unpacket_traits<Packet>::type Scalar;
1476
+ typedef typename Scalar::value_type RealScalar;
1477
+ typedef typename unpacket_traits<Packet>::as_real RealPacket;
1478
+
1479
+ RealPacket real_mask_rp = peven_mask(x.v);
1480
+ Packet real_mask(real_mask_rp);
1481
+
1482
+ // Real part
1483
+ RealPacket x_flip = pcplxflip(x).v; // b, a
1484
+ Packet x_norm = phypot_complex(x); // sqrt(a^2 + b^2), sqrt(a^2 + b^2)
1485
+ RealPacket xlogr = plog(x_norm.v); // log(sqrt(a^2 + b^2)), log(sqrt(a^2 + b^2))
1486
+
1487
+ // Imag part
1488
+ RealPacket ximg = patan2(x.v, x_flip); // atan2(a, b), atan2(b, a)
1489
+
1490
+ const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
1491
+ RealPacket x_abs = pabs(x.v);
1492
+ RealPacket is_x_pos_inf = pcmp_eq(x_abs, cst_pos_inf);
1493
+ RealPacket is_y_pos_inf = pcplxflip(Packet(is_x_pos_inf)).v;
1494
+ RealPacket is_any_inf = por(is_x_pos_inf, is_y_pos_inf);
1495
+ RealPacket xreal = pselect(is_any_inf, cst_pos_inf, xlogr);
1496
+
1497
+ Packet xres = pselect(real_mask, Packet(xreal), Packet(ximg)); // log(sqrt(a^2 + b^2)), atan2(b, a)
1498
+ return xres;
1499
+ }
1500
+
1501
+ template <typename Packet>
1502
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_complex(const Packet& a) {
1503
+ typedef typename unpacket_traits<Packet>::as_real RealPacket;
1504
+ typedef typename unpacket_traits<Packet>::type Scalar;
1505
+ typedef typename Scalar::value_type RealScalar;
1506
+ const RealPacket even_mask = peven_mask(a.v);
1507
+ const RealPacket odd_mask = pcplxflip(Packet(even_mask)).v;
1508
+
1509
+ // Let a = x + iy.
1510
+ // exp(a) = exp(x) * cis(y), plus some special edge-case handling.
1511
+
1512
+ // exp(x):
1513
+ RealPacket x = pand(a.v, even_mask);
1514
+ x = por(x, pcplxflip(Packet(x)).v);
1515
+ RealPacket expx = pexp(x); // exp(x);
1516
+
1517
+ // cis(y):
1518
+ RealPacket y = pand(odd_mask, a.v);
1519
+ y = por(y, pcplxflip(Packet(y)).v);
1520
+ RealPacket cisy = psincos_float<false, RealPacket, true>(y);
1521
+ cisy = pcplxflip(Packet(cisy)).v; // cos(y) + i * sin(y)
1522
+
1523
+ const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
1524
+ const RealPacket cst_neg_inf = pset1<RealPacket>(-NumTraits<RealScalar>::infinity());
1525
+
1526
+ // If x is -inf, we know that cossin(y) is bounded,
1527
+ // so the result is (0, +/-0), where the sign of the imaginary part comes
1528
+ // from the sign of cossin(y).
1529
+ RealPacket cisy_sign = por(pandnot(cisy, pabs(cisy)), pset1<RealPacket>(RealScalar(1)));
1530
+ cisy = pselect(pcmp_eq(x, cst_neg_inf), cisy_sign, cisy);
1531
+
1532
+ // If x is inf, and cos(y) has unknown sign (y is inf or NaN), the result
1533
+ // is (+/-inf, NaN), where the signs are undetermined (take the sign of y).
1534
+ RealPacket y_sign = por(pandnot(y, pabs(y)), pset1<RealPacket>(RealScalar(1)));
1535
+ cisy = pselect(pand(pcmp_eq(x, cst_pos_inf), pisnan(cisy)), pand(y_sign, even_mask), cisy);
1536
+ Packet result = Packet(pmul(expx, cisy));
1537
+
1538
+ // If y is +/- 0, the input is real, so take the real result for consistency.
1539
+ result = pselect(Packet(pcmp_eq(y, pzero(y))), Packet(por(pand(expx, even_mask), pand(y, odd_mask))), result);
1540
+
1541
+ return result;
1542
+ }
1543
+
1544
+ template <typename Packet>
1545
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psqrt_complex(const Packet& a) {
1546
+ typedef typename unpacket_traits<Packet>::type Scalar;
1547
+ typedef typename Scalar::value_type RealScalar;
1548
+ typedef typename unpacket_traits<Packet>::as_real RealPacket;
1549
+
1550
+ // Computes the principal sqrt of the complex numbers in the input.
1551
+ //
1552
+ // For example, for packets containing 2 complex numbers stored in interleaved format
1553
+ // a = [a0, a1] = [x0, y0, x1, y1],
1554
+ // where x0 = real(a0), y0 = imag(a0) etc., this function returns
1555
+ // b = [b0, b1] = [u0, v0, u1, v1],
1556
+ // such that b0^2 = a0, b1^2 = a1.
1557
+ //
1558
+ // To derive the formula for the complex square roots, let's consider the equation for
1559
+ // a single complex square root of the number x + i*y. We want to find real numbers
1560
+ // u and v such that
1561
+ // (u + i*v)^2 = x + i*y <=>
1562
+ // u^2 - v^2 + i*2*u*v = x + i*y.
1563
+ // By equating the real and imaginary parts we get:
1564
+ // u^2 - v^2 = x
1565
+ // 2*u*v = y.
1566
+ //
1567
+ // For x >= 0, this has the numerically stable solution
1568
+ // u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
1569
+ // v = 0.5 * (y / u)
1570
+ // and for x < 0,
790
1571
  // v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
791
1572
  // u = 0.5 * (y / v)
792
1573
  //
@@ -802,14 +1583,14 @@ Packet psqrt_complex(const Packet& a) {
802
1583
  // l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)),
803
1584
  // where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1.
804
1585
 
805
- RealPacket a_abs = pabs(a.v); // [|x0|, |y0|, |x1|, |y1|]
806
- RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v; // [|y0|, |x0|, |y1|, |x1|]
1586
+ RealPacket a_abs = pabs(a.v); // [|x0|, |y0|, |x1|, |y1|]
1587
+ RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v; // [|y0|, |x0|, |y1|, |x1|]
807
1588
  RealPacket a_max = pmax(a_abs, a_abs_flip);
808
1589
  RealPacket a_min = pmin(a_abs, a_abs_flip);
809
1590
  RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min));
810
1591
  RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max));
811
1592
  RealPacket r = pdiv(a_min, a_max);
812
- const RealPacket cst_one = pset1<RealPacket>(RealScalar(1));
1593
+ const RealPacket cst_one = pset1<RealPacket>(RealScalar(1));
813
1594
  RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r)))); // [l0, l0, l1, l1]
814
1595
  // Set l to a_max if a_min is zero.
815
1596
  l = pselect(a_min_zero_mask, a_max, l);
@@ -832,8 +1613,7 @@ Packet psqrt_complex(const Packet& a) {
832
1613
 
833
1614
  // Step 4. Compute solution for inputs with negative real part:
834
1615
  // [|eta0|, sign(y0)*rho0, |eta1|, sign(y1)*rho1]
835
- const RealScalar neg_zero = RealScalar(numext::bit_cast<float>(0x80000000u));
836
- const RealPacket cst_imag_sign_mask = pset1<Packet>(Scalar(RealScalar(0.0), neg_zero)).v;
1616
+ const RealPacket cst_imag_sign_mask = pset1<Packet>(Scalar(RealScalar(0.0), RealScalar(-0.0))).v;
837
1617
  RealPacket imag_signs = pand(a.v, cst_imag_sign_mask);
838
1618
  Packet negative_real_result;
839
1619
  // Notice that rho is positive, so taking its absolute value is a noop.
@@ -866,11 +1646,135 @@ Packet psqrt_complex(const Packet& a) {
866
1646
  is_imag_inf = por(is_imag_inf, pcplxflip(is_imag_inf));
867
1647
  Packet imag_inf_result;
868
1648
  imag_inf_result.v = por(pand(cst_pos_inf, real_mask), pandnot(a.v, real_mask));
1649
+ // unless otherwise specified, if either the real or imaginary component is nan, the entire result is nan
1650
+ Packet result_is_nan = pisnan(result);
1651
+ result = por(result_is_nan, result);
869
1652
 
870
- return pselect(is_imag_inf, imag_inf_result,
871
- pselect(is_real_inf, real_inf_result,result));
1653
+ return pselect(is_imag_inf, imag_inf_result, pselect(is_real_inf, real_inf_result, result));
872
1654
  }
873
1655
 
1656
+ // \internal \returns the norm of a complex number z = x + i*y, defined as sqrt(x^2 + y^2).
1657
+ // Implemented using the hypot(a,b) algorithm from https://doi.org/10.48550/arXiv.1904.09481
1658
+ template <typename Packet>
1659
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet phypot_complex(const Packet& a) {
1660
+ typedef typename unpacket_traits<Packet>::type Scalar;
1661
+ typedef typename Scalar::value_type RealScalar;
1662
+ typedef typename unpacket_traits<Packet>::as_real RealPacket;
1663
+
1664
+ const RealPacket cst_zero_rp = pset1<RealPacket>(static_cast<RealScalar>(0.0));
1665
+ const RealPacket cst_minus_one_rp = pset1<RealPacket>(static_cast<RealScalar>(-1.0));
1666
+ const RealPacket cst_two_rp = pset1<RealPacket>(static_cast<RealScalar>(2.0));
1667
+ const RealPacket evenmask = peven_mask(a.v);
1668
+
1669
+ RealPacket a_abs = pabs(a.v);
1670
+ RealPacket a_flip = pcplxflip(Packet(a_abs)).v; // |b|, |a|
1671
+ RealPacket a_all = pselect(evenmask, a_abs, a_flip); // |a|, |a|
1672
+ RealPacket b_all = pselect(evenmask, a_flip, a_abs); // |b|, |b|
1673
+
1674
+ RealPacket a2 = pmul(a.v, a.v); // |a^2, b^2|
1675
+ RealPacket a2_flip = pcplxflip(Packet(a2)).v; // |b^2, a^2|
1676
+ RealPacket h = psqrt(padd(a2, a2_flip)); // |sqrt(a^2 + b^2), sqrt(a^2 + b^2)|
1677
+ RealPacket h_sq = pmul(h, h); // |a^2 + b^2, a^2 + b^2|
1678
+ RealPacket a_sq = pselect(evenmask, a2, a2_flip); // |a^2, a^2|
1679
+ RealPacket m_h_sq = pmul(h_sq, cst_minus_one_rp);
1680
+ RealPacket m_a_sq = pmul(a_sq, cst_minus_one_rp);
1681
+ RealPacket x = psub(psub(pmadd(h, h, m_h_sq), pmadd(b_all, b_all, psub(a_sq, h_sq))), pmadd(a_all, a_all, m_a_sq));
1682
+ h = psub(h, pdiv(x, pmul(cst_two_rp, h))); // |h - x/(2*h), h - x/(2*h)|
1683
+
1684
+ // handle zero-case
1685
+ RealPacket iszero = pcmp_eq(por(a_abs, a_flip), cst_zero_rp);
1686
+
1687
+ h = pandnot(h, iszero); // |sqrt(a^2+b^2), sqrt(a^2+b^2)|
1688
+ return Packet(h); // |sqrt(a^2+b^2), sqrt(a^2+b^2)|
1689
+ }
1690
+
1691
+ template <typename Packet>
1692
+ struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
1693
+ !NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
1694
+ !NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
1695
+ static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
1696
+ using Scalar = typename unpacket_traits<Packet>::type;
1697
+ const Packet cst_one = pset1<Packet>(Scalar(1));
1698
+ const Packet cst_zero = pzero(a);
1699
+
1700
+ const Packet abs_a = pabs(a);
1701
+ const Packet sign_mask = pandnot(a, abs_a);
1702
+ const Packet nonzero_mask = pcmp_lt(cst_zero, abs_a);
1703
+
1704
+ return pselect(nonzero_mask, por(sign_mask, cst_one), abs_a);
1705
+ }
1706
+ };
1707
+
1708
+ template <typename Packet>
1709
+ struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
1710
+ !NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
1711
+ NumTraits<typename unpacket_traits<Packet>::type>::IsSigned &&
1712
+ NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
1713
+ static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
1714
+ using Scalar = typename unpacket_traits<Packet>::type;
1715
+ const Packet cst_one = pset1<Packet>(Scalar(1));
1716
+ const Packet cst_minus_one = pset1<Packet>(Scalar(-1));
1717
+ const Packet cst_zero = pzero(a);
1718
+
1719
+ const Packet positive_mask = pcmp_lt(cst_zero, a);
1720
+ const Packet positive = pand(positive_mask, cst_one);
1721
+ const Packet negative_mask = pcmp_lt(a, cst_zero);
1722
+ const Packet negative = pand(negative_mask, cst_minus_one);
1723
+
1724
+ return por(positive, negative);
1725
+ }
1726
+ };
1727
+
1728
+ template <typename Packet>
1729
+ struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
1730
+ !NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
1731
+ !NumTraits<typename unpacket_traits<Packet>::type>::IsSigned &&
1732
+ NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
1733
+ static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
1734
+ using Scalar = typename unpacket_traits<Packet>::type;
1735
+ const Packet cst_one = pset1<Packet>(Scalar(1));
1736
+ const Packet cst_zero = pzero(a);
1737
+
1738
+ const Packet zero_mask = pcmp_eq(cst_zero, a);
1739
+ return pandnot(cst_one, zero_mask);
1740
+ }
1741
+ };
1742
+
1743
+ // \internal \returns the sign of a complex number z, defined as z / abs(z).
1744
+ template <typename Packet>
1745
+ struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
1746
+ NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
1747
+ unpacket_traits<Packet>::vectorizable>> {
1748
+ static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
1749
+ typedef typename unpacket_traits<Packet>::type Scalar;
1750
+ typedef typename Scalar::value_type RealScalar;
1751
+ typedef typename unpacket_traits<Packet>::as_real RealPacket;
1752
+
1753
+ // Step 1. Compute (for each element z = x + i*y in a)
1754
+ // l = abs(z) = sqrt(x^2 + y^2).
1755
+ // To avoid over- and underflow, we use the stable formula for each hypotenuse
1756
+ // l = (zmin == 0 ? zmax : zmax * sqrt(1 + (zmin/zmax)**2)),
1757
+ // where zmax = max(|x|, |y|), zmin = min(|x|, |y|),
1758
+ RealPacket a_abs = pabs(a.v);
1759
+ RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v;
1760
+ RealPacket a_max = pmax(a_abs, a_abs_flip);
1761
+ RealPacket a_min = pmin(a_abs, a_abs_flip);
1762
+ RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min));
1763
+ RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max));
1764
+ RealPacket r = pdiv(a_min, a_max);
1765
+ const RealPacket cst_one = pset1<RealPacket>(RealScalar(1));
1766
+ RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r)))); // [l0, l0, l1, l1]
1767
+ // Set l to a_max if a_min is zero, since the roundtrip sqrt(a_max^2) may be
1768
+ // lossy.
1769
+ l = pselect(a_min_zero_mask, a_max, l);
1770
+ // Step 2 compute a / abs(a).
1771
+ RealPacket sign_as_real = pandnot(pdiv(a.v, l), a_max_zero_mask);
1772
+ Packet sign;
1773
+ sign.v = sign_as_real;
1774
+ return sign;
1775
+ }
1776
+ };
1777
+
874
1778
  // TODO(rmlarsen): The following set of utilities for double word arithmetic
875
1779
  // should perhaps be refactored as a separate file, since it would be generally
876
1780
  // useful for special function implementation etc. Writing the algorithms in
@@ -878,34 +1782,37 @@ Packet psqrt_complex(const Packet& a) {
878
1782
 
879
1783
  // This function splits x into the nearest integer n and fractional part r,
880
1784
  // such that x = n + r holds exactly.
881
- template<typename Packet>
882
- EIGEN_STRONG_INLINE
883
- void absolute_split(const Packet& x, Packet& n, Packet& r) {
1785
+ template <typename Packet>
1786
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void absolute_split(const Packet& x, Packet& n, Packet& r) {
884
1787
  n = pround(x);
885
1788
  r = psub(x, n);
886
1789
  }
887
1790
 
888
1791
  // This function computes the sum {s_hi, s_lo}, such that x + y = s_hi + s_lo
889
1792
  // holds exactly, and s_hi = fl(x+y), if |x| >= |y|.
890
- template<typename Packet>
891
- EIGEN_STRONG_INLINE
892
- void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) {
1793
+ template <typename Packet>
1794
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) {
893
1795
  s_hi = padd(x, y);
894
1796
  const Packet t = psub(s_hi, x);
895
1797
  s_lo = psub(y, t);
896
1798
  }
897
1799
 
898
- #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
1800
+ #ifdef EIGEN_VECTORIZE_FMA
899
1801
  // This function implements the extended precision product of
900
1802
  // a pair of floating point numbers. Given {x, y}, it computes the pair
901
1803
  // {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
902
1804
  // p_hi = fl(x * y).
903
- template<typename Packet>
904
- EIGEN_STRONG_INLINE
905
- void twoprod(const Packet& x, const Packet& y,
906
- Packet& p_hi, Packet& p_lo) {
1805
+ template <typename Packet>
1806
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) {
907
1807
  p_hi = pmul(x, y);
908
- p_lo = pmadd(x, y, pnegate(p_hi));
1808
+ p_lo = pmsub(x, y, p_hi);
1809
+ }
1810
+
1811
+ // A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that
1812
+ // x * y = xy + p_lo holds exactly.
1813
+ template <typename Packet>
1814
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) {
1815
+ return pmsub(x, y, xy);
909
1816
  }
910
1817
 
911
1818
  #else
@@ -915,11 +1822,10 @@ void twoprod(const Packet& x, const Packet& y,
915
1822
  // exactly and that half of the significand of x fits in x_hi.
916
1823
  // This is Algorithm 3 from Jean-Michel Muller, "Elementary Functions",
917
1824
  // 3rd edition, Birkh\"auser, 2016.
918
- template<typename Packet>
919
- EIGEN_STRONG_INLINE
920
- void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
1825
+ template <typename Packet>
1826
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
921
1827
  typedef typename unpacket_traits<Packet>::type Scalar;
922
- EIGEN_CONSTEXPR int shift = (NumTraits<Scalar>::digits() + 1) / 2;
1828
+ constexpr int shift = (NumTraits<Scalar>::digits() + 1) / 2;
923
1829
  const Scalar shift_scale = Scalar(uint64_t(1) << shift); // Scalar constructor not necessarily constexpr.
924
1830
  const Packet gamma = pmul(pset1<Packet>(shift_scale + Scalar(1)), x);
925
1831
  Packet rho = psub(x, gamma);
@@ -931,10 +1837,8 @@ void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
931
1837
  // Given floating point numbers {x, y} computes the pair
932
1838
  // {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
933
1839
  // p_hi = fl(x * y).
934
- template<typename Packet>
935
- EIGEN_STRONG_INLINE
936
- void twoprod(const Packet& x, const Packet& y,
937
- Packet& p_hi, Packet& p_lo) {
1840
+ template <typename Packet>
1841
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) {
938
1842
  Packet x_hi, x_lo, y_hi, y_lo;
939
1843
  veltkamp_splitting(x, x_hi, x_lo);
940
1844
  veltkamp_splitting(y, y_hi, y_lo);
@@ -946,8 +1850,22 @@ void twoprod(const Packet& x, const Packet& y,
946
1850
  p_lo = pmadd(x_lo, y_lo, p_lo);
947
1851
  }
948
1852
 
949
- #endif // EIGEN_HAS_SINGLE_INSTRUCTION_MADD
1853
+ // A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that
1854
+ // x * y = xy + p_lo holds exactly.
1855
+ template <typename Packet>
1856
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) {
1857
+ Packet x_hi, x_lo, y_hi, y_lo;
1858
+ veltkamp_splitting(x, x_hi, x_lo);
1859
+ veltkamp_splitting(y, y_hi, y_lo);
950
1860
 
1861
+ Packet p_lo = pmadd(x_hi, y_hi, pnegate(xy));
1862
+ p_lo = pmadd(x_hi, y_lo, p_lo);
1863
+ p_lo = pmadd(x_lo, y_hi, p_lo);
1864
+ p_lo = pmadd(x_lo, y_lo, p_lo);
1865
+ return p_lo;
1866
+ }
1867
+
1868
+ #endif // EIGEN_VECTORIZE_FMA
951
1869
 
952
1870
  // This function implements Dekker's algorithm for the addition
953
1871
  // of two double word numbers represented by {x_hi, x_lo} and {y_hi, y_lo}.
@@ -955,16 +1873,14 @@ void twoprod(const Packet& x, const Packet& y,
955
1873
  // x_hi + x_lo + y_hi + y_lo = s_hi + s_lo holds exactly.
956
1874
  // This is Algorithm 5 from Jean-Michel Muller, "Elementary Functions",
957
1875
  // 3rd edition, Birkh\"auser, 2016.
958
- template<typename Packet>
959
- EIGEN_STRONG_INLINE
960
- void twosum(const Packet& x_hi, const Packet& x_lo,
961
- const Packet& y_hi, const Packet& y_lo,
962
- Packet& s_hi, Packet& s_lo) {
1876
+ template <typename Packet>
1877
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi,
1878
+ const Packet& y_lo, Packet& s_hi, Packet& s_lo) {
963
1879
  const Packet x_greater_mask = pcmp_lt(pabs(y_hi), pabs(x_hi));
964
1880
  Packet r_hi_1, r_lo_1;
965
- fast_twosum(x_hi, y_hi,r_hi_1, r_lo_1);
1881
+ fast_twosum(x_hi, y_hi, r_hi_1, r_lo_1);
966
1882
  Packet r_hi_2, r_lo_2;
967
- fast_twosum(y_hi, x_hi,r_hi_2, r_lo_2);
1883
+ fast_twosum(y_hi, x_hi, r_hi_2, r_lo_2);
968
1884
  const Packet r_hi = pselect(x_greater_mask, r_hi_1, r_hi_2);
969
1885
 
970
1886
  const Packet s1 = padd(padd(y_lo, r_lo_1), x_lo);
@@ -976,11 +1892,9 @@ EIGEN_STRONG_INLINE
976
1892
 
977
1893
  // This is a version of twosum for double word numbers,
978
1894
  // which assumes that |x_hi| >= |y_hi|.
979
- template<typename Packet>
980
- EIGEN_STRONG_INLINE
981
- void fast_twosum(const Packet& x_hi, const Packet& x_lo,
982
- const Packet& y_hi, const Packet& y_lo,
983
- Packet& s_hi, Packet& s_lo) {
1895
+ template <typename Packet>
1896
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi,
1897
+ const Packet& y_lo, Packet& s_hi, Packet& s_lo) {
984
1898
  Packet r_hi, r_lo;
985
1899
  fast_twosum(x_hi, y_hi, r_hi, r_lo);
986
1900
  const Packet s = padd(padd(y_lo, r_lo), x_lo);
@@ -990,11 +1904,9 @@ EIGEN_STRONG_INLINE
990
1904
  // This is a version of twosum for adding a floating point number x to
991
1905
  // double word number {y_hi, y_lo} number, with the assumption
992
1906
  // that |x| >= |y_hi|.
993
- template<typename Packet>
994
- EIGEN_STRONG_INLINE
995
- void fast_twosum(const Packet& x,
996
- const Packet& y_hi, const Packet& y_lo,
997
- Packet& s_hi, Packet& s_lo) {
1907
+ template <typename Packet>
1908
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y_hi, const Packet& y_lo,
1909
+ Packet& s_hi, Packet& s_lo) {
998
1910
  Packet r_hi, r_lo;
999
1911
  fast_twosum(x, y_hi, r_hi, r_lo);
1000
1912
  const Packet s = padd(y_lo, r_lo);
@@ -1009,10 +1921,9 @@ void fast_twosum(const Packet& x,
1009
1921
  // in the floating point type.
1010
1922
  // This is Algorithm 7 from Jean-Michel Muller, "Elementary Functions",
1011
1923
  // 3rd edition, Birkh\"auser, 2016.
1012
- template<typename Packet>
1013
- EIGEN_STRONG_INLINE
1014
- void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y,
1015
- Packet& p_hi, Packet& p_lo) {
1924
+ template <typename Packet>
1925
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y,
1926
+ Packet& p_hi, Packet& p_lo) {
1016
1927
  Packet c_hi, c_lo1;
1017
1928
  twoprod(x_hi, y, c_hi, c_lo1);
1018
1929
  const Packet c_lo2 = pmul(x_lo, y);
@@ -1028,11 +1939,9 @@ void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y,
1028
1939
  // (x_hi + x_lo) * (y_hi + y_lo) = p_hi + p_lo holds with a relative error
1029
1940
  // of less than 2*2^{-2p}, where p is the number of significand bit
1030
1941
  // in the floating point type.
1031
- template<typename Packet>
1032
- EIGEN_STRONG_INLINE
1033
- void twoprod(const Packet& x_hi, const Packet& x_lo,
1034
- const Packet& y_hi, const Packet& y_lo,
1035
- Packet& p_hi, Packet& p_lo) {
1942
+ template <typename Packet>
1943
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi,
1944
+ const Packet& y_lo, Packet& p_hi, Packet& p_lo) {
1036
1945
  Packet p_hi_hi, p_hi_lo;
1037
1946
  twoprod(x_hi, x_lo, y_hi, p_hi_hi, p_hi_lo);
1038
1947
  Packet p_lo_hi, p_lo_lo;
@@ -1040,120 +1949,81 @@ void twoprod(const Packet& x_hi, const Packet& x_lo,
1040
1949
  fast_twosum(p_hi_hi, p_hi_lo, p_lo_hi, p_lo_lo, p_hi, p_lo);
1041
1950
  }
1042
1951
 
1043
- // This function computes the reciprocal of a floating point number
1044
- // with extra precision and returns the result as a double word.
1952
+ // This function implements the division of double word {x_hi, x_lo}
1953
+ // by float y. This is Algorithm 15 from "Tight and rigorous error bounds
1954
+ // for basic building blocks of double-word arithmetic", Joldes, Muller, & Popescu,
1955
+ // 2017. https://hal.archives-ouvertes.fr/hal-01351529
1045
1956
  template <typename Packet>
1046
- void doubleword_reciprocal(const Packet& x, Packet& recip_hi, Packet& recip_lo) {
1047
- typedef typename unpacket_traits<Packet>::type Scalar;
1048
- // 1. Approximate the reciprocal as the reciprocal of the high order element.
1049
- Packet approx_recip = prsqrt(x);
1050
- approx_recip = pmul(approx_recip, approx_recip);
1051
-
1052
- // 2. Run one step of Newton-Raphson iteration in double word arithmetic
1053
- // to get the bottom half. The NR iteration for reciprocal of 'a' is
1054
- // x_{i+1} = x_i * (2 - a * x_i)
1055
-
1056
- // -a*x_i
1057
- Packet t1_hi, t1_lo;
1058
- twoprod(pnegate(x), approx_recip, t1_hi, t1_lo);
1059
- // 2 - a*x_i
1060
- Packet t2_hi, t2_lo;
1061
- fast_twosum(pset1<Packet>(Scalar(2)), t1_hi, t2_hi, t2_lo);
1062
- Packet t3_hi, t3_lo;
1063
- fast_twosum(t2_hi, padd(t2_lo, t1_lo), t3_hi, t3_lo);
1064
- // x_i * (2 - a * x_i)
1065
- twoprod(t3_hi, t3_lo, approx_recip, recip_hi, recip_lo);
1957
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void doubleword_div_fp(const Packet& x_hi, const Packet& x_lo, const Packet& y,
1958
+ Packet& z_hi, Packet& z_lo) {
1959
+ const Packet t_hi = pdiv(x_hi, y);
1960
+ Packet pi_hi, pi_lo;
1961
+ twoprod(t_hi, y, pi_hi, pi_lo);
1962
+ const Packet delta_hi = psub(x_hi, pi_hi);
1963
+ const Packet delta_t = psub(delta_hi, pi_lo);
1964
+ const Packet delta = padd(delta_t, x_lo);
1965
+ const Packet t_lo = pdiv(delta, y);
1966
+ fast_twosum(t_hi, t_lo, z_hi, z_lo);
1066
1967
  }
1067
1968
 
1068
-
1069
1969
  // This function computes log2(x) and returns the result as a double word.
1070
1970
  template <typename Scalar>
1071
1971
  struct accurate_log2 {
1072
1972
  template <typename Packet>
1073
- EIGEN_STRONG_INLINE
1074
- void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
1973
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
1075
1974
  log2_x_hi = plog2(x);
1076
1975
  log2_x_lo = pzero(x);
1077
1976
  }
1078
1977
  };
1079
1978
 
1080
1979
  // This specialization uses a more accurate algorithm to compute log2(x) for
1081
- // floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~6.42e-10.
1980
+ // floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~6.56508e-10.
1082
1981
  // This additional accuracy is needed to counter the error-magnification
1083
1982
  // inherent in multiplying by a potentially large exponent in pow(x,y).
1084
- // The minimax polynomial used was calculated using the Sollya tool.
1085
- // See sollya.org.
1983
+ // The minimax polynomial used was calculated using the Rminimax tool,
1984
+ // see https://gitlab.inria.fr/sfilip/rminimax.
1985
+ // Command line:
1986
+ // $ ratapprox --function="log2(1+x)/x" --dom='[-0.2929,0.41422]'
1987
+ // --type=[10,0]
1988
+ // --numF="[D,D,SG]" --denF="[SG]" --log --dispCoeff="dec"
1989
+ //
1990
+ // The resulting implementation of pow(x,y) is accurate to 3 ulps.
1086
1991
  template <>
1087
1992
  struct accurate_log2<float> {
1088
1993
  template <typename Packet>
1089
- EIGEN_STRONG_INLINE
1090
- void operator()(const Packet& z, Packet& log2_x_hi, Packet& log2_x_lo) {
1091
- // The function log(1+x)/x is approximated in the interval
1092
- // [1/sqrt(2)-1;sqrt(2)-1] by a degree 10 polynomial of the form
1093
- // Q(x) = (C0 + x * (C1 + x * (C2 + x * (C3 + x * P(x))))),
1094
- // where the degree 6 polynomial P(x) is evaluated in single precision,
1095
- // while the remaining 4 terms of Q(x), as well as the final multiplication by x
1096
- // to reconstruct log(1+x) are evaluated in extra precision using
1097
- // double word arithmetic. C0 through C3 are extra precise constants
1098
- // stored as double words.
1099
- //
1100
- // The polynomial coefficients were calculated using Sollya commands:
1101
- // > n = 10;
1102
- // > f = log2(1+x)/x;
1103
- // > interval = [sqrt(0.5)-1;sqrt(2)-1];
1104
- // > p = fpminimax(f,n,[|double,double,double,double,single...|],interval,relative,floating);
1105
-
1106
- const Packet p6 = pset1<Packet>( 9.703654795885e-2f);
1107
- const Packet p5 = pset1<Packet>(-0.1690667718648f);
1108
- const Packet p4 = pset1<Packet>( 0.1720575392246f);
1109
- const Packet p3 = pset1<Packet>(-0.1789081543684f);
1110
- const Packet p2 = pset1<Packet>( 0.2050433009862f);
1111
- const Packet p1 = pset1<Packet>(-0.2404672354459f);
1112
- const Packet p0 = pset1<Packet>( 0.2885761857032f);
1113
-
1114
- const Packet C3_hi = pset1<Packet>(-0.360674142838f);
1115
- const Packet C3_lo = pset1<Packet>(-6.13283912543e-09f);
1116
- const Packet C2_hi = pset1<Packet>(0.480897903442f);
1117
- const Packet C2_lo = pset1<Packet>(-1.44861207474e-08f);
1118
- const Packet C1_hi = pset1<Packet>(-0.721347510815f);
1119
- const Packet C1_lo = pset1<Packet>(-4.84483164698e-09f);
1120
- const Packet C0_hi = pset1<Packet>(1.44269502163f);
1121
- const Packet C0_lo = pset1<Packet>(2.01711713999e-08f);
1994
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(const Packet& z, Packet& log2_x_hi, Packet& log2_x_lo) {
1995
+ // Split the two lowest order constant coefficient into double-word representation.
1996
+ constexpr double kC0 = 1.442695041742110273474963832995854318141937255859375e+00;
1997
+ constexpr float kC0_hi = static_cast<float>(kC0);
1998
+ constexpr float kC0_lo = static_cast<float>(kC0 - static_cast<double>(kC0_hi));
1999
+ const Packet c0_hi = pset1<Packet>(kC0_hi);
2000
+ const Packet c0_lo = pset1<Packet>(kC0_lo);
2001
+
2002
+ constexpr double kC1 = -7.2134751588268664068692714863573201000690460205078125e-01;
2003
+ constexpr float kC1_hi = static_cast<float>(kC1);
2004
+ constexpr float kC1_lo = static_cast<float>(kC1 - static_cast<double>(kC1_hi));
2005
+ const Packet c1_hi = pset1<Packet>(kC1_hi);
2006
+ const Packet c1_lo = pset1<Packet>(kC1_lo);
2007
+
2008
+ constexpr float c[] = {
2009
+ 9.7010828554630279541015625e-02, -1.6896486282348632812500000e-01, 1.7200836539268493652343750e-01,
2010
+ -1.7892272770404815673828125e-01, 2.0505344867706298828125000e-01, -2.4046677350997924804687500e-01,
2011
+ 2.8857553005218505859375000e-01, -3.6067414283752441406250000e-01, 4.8089790344238281250000000e-01};
2012
+
2013
+ // Evaluate the higher order terms in the polynomial using
2014
+ // standard arithmetic.
1122
2015
  const Packet one = pset1<Packet>(1.0f);
1123
-
1124
2016
  const Packet x = psub(z, one);
1125
- // Evaluate P(x) in working precision.
1126
- // We evaluate it in multiple parts to improve instruction level
1127
- // parallelism.
1128
- Packet x2 = pmul(x,x);
1129
- Packet p_even = pmadd(p6, x2, p4);
1130
- p_even = pmadd(p_even, x2, p2);
1131
- p_even = pmadd(p_even, x2, p0);
1132
- Packet p_odd = pmadd(p5, x2, p3);
1133
- p_odd = pmadd(p_odd, x2, p1);
1134
- Packet p = pmadd(p_odd, x, p_even);
1135
-
1136
- // Now evaluate the low-order tems of Q(x) in double word precision.
1137
- // In the following, due to the alternating signs and the fact that
1138
- // |x| < sqrt(2)-1, we can assume that |C*_hi| >= q_i, and use
1139
- // fast_twosum instead of the slower twosum.
1140
- Packet q_hi, q_lo;
1141
- Packet t_hi, t_lo;
1142
- // C3 + x * p(x)
1143
- twoprod(p, x, t_hi, t_lo);
1144
- fast_twosum(C3_hi, C3_lo, t_hi, t_lo, q_hi, q_lo);
1145
- // C2 + x * p(x)
1146
- twoprod(q_hi, q_lo, x, t_hi, t_lo);
1147
- fast_twosum(C2_hi, C2_lo, t_hi, t_lo, q_hi, q_lo);
1148
- // C1 + x * p(x)
1149
- twoprod(q_hi, q_lo, x, t_hi, t_lo);
1150
- fast_twosum(C1_hi, C1_lo, t_hi, t_lo, q_hi, q_lo);
1151
- // C0 + x * p(x)
1152
- twoprod(q_hi, q_lo, x, t_hi, t_lo);
1153
- fast_twosum(C0_hi, C0_lo, t_hi, t_lo, q_hi, q_lo);
1154
-
1155
- // log(z) ~= x * Q(x)
1156
- twoprod(q_hi, q_lo, x, log2_x_hi, log2_x_lo);
2017
+ Packet p = ppolevl<Packet, 8>::run(x, c);
2018
+ // Evaluate the final two step in Horner's rule using double-word
2019
+ // arithmetic.
2020
+ Packet p_hi, p_lo;
2021
+ twoprod(x, p, p_hi, p_lo);
2022
+ fast_twosum(c1_hi, c1_lo, p_hi, p_lo, p_hi, p_lo);
2023
+ twoprod(p_hi, p_lo, x, p_hi, p_lo);
2024
+ fast_twosum(c0_hi, c0_lo, p_hi, p_lo, p_hi, p_lo);
2025
+ // Multiply by x to recover log2(z).
2026
+ twoprod(p_hi, p_lo, x, log2_x_hi, log2_x_lo);
1157
2027
  }
1158
2028
  };
1159
2029
 
@@ -1167,8 +2037,7 @@ struct accurate_log2<float> {
1167
2037
  template <>
1168
2038
  struct accurate_log2<double> {
1169
2039
  template <typename Packet>
1170
- EIGEN_STRONG_INLINE
1171
- void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
2040
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
1172
2041
  // We use a transformation of variables:
1173
2042
  // r = c * (x-1) / (x+1),
1174
2043
  // such that
@@ -1204,16 +2073,13 @@ struct accurate_log2<double> {
1204
2073
  const Packet cst_2_log2e_hi = pset1<Packet>(2.88539008177792677);
1205
2074
  const Packet cst_2_log2e_lo = pset1<Packet>(4.07660016854549667e-17);
1206
2075
  // c * (x - 1)
1207
- Packet num_hi, num_lo;
1208
- twoprod(cst_2_log2e_hi, cst_2_log2e_lo, psub(x, one), num_hi, num_lo);
1209
- // TODO(rmlarsen): Investigate if using the division algorithm by
1210
- // Muller et al. is faster/more accurate.
1211
- // 1 / (x + 1)
1212
- Packet denom_hi, denom_lo;
1213
- doubleword_reciprocal(padd(x, one), denom_hi, denom_lo);
1214
- // r = c * (x-1) / (x+1),
2076
+ Packet t_hi, t_lo;
2077
+ // t = c * (x-1)
2078
+ twoprod(cst_2_log2e_hi, cst_2_log2e_lo, psub(x, one), t_hi, t_lo);
2079
+ // r = c * (x-1) / (x+1),
1215
2080
  Packet r_hi, r_lo;
1216
- twoprod(num_hi, num_lo, denom_hi, denom_lo, r_hi, r_lo);
2081
+ doubleword_div_fp(t_hi, t_lo, padd(x, one), r_hi, r_lo);
2082
+
1217
2083
  // r2 = r * r
1218
2084
  Packet r2_hi, r2_lo;
1219
2085
  twoprod(r_hi, r_lo, r_hi, r_lo, r2_hi, r2_lo);
@@ -1252,157 +2118,20 @@ struct accurate_log2<double> {
1252
2118
  }
1253
2119
  };
1254
2120
 
1255
- // This function computes exp2(x) (i.e. 2**x).
1256
- template <typename Scalar>
1257
- struct fast_accurate_exp2 {
1258
- template <typename Packet>
1259
- EIGEN_STRONG_INLINE
1260
- Packet operator()(const Packet& x) {
1261
- // TODO(rmlarsen): Add a pexp2 packetop.
1262
- return pexp(pmul(pset1<Packet>(Scalar(EIGEN_LN2)), x));
1263
- }
1264
- };
1265
-
1266
- // This specialization uses a faster algorithm to compute exp2(x) for floats
1267
- // in [-0.5;0.5] with a relative accuracy of 1 ulp.
1268
- // The minimax polynomial used was calculated using the Sollya tool.
1269
- // See sollya.org.
1270
- template <>
1271
- struct fast_accurate_exp2<float> {
1272
- template <typename Packet>
1273
- EIGEN_STRONG_INLINE
1274
- Packet operator()(const Packet& x) {
1275
- // This function approximates exp2(x) by a degree 6 polynomial of the form
1276
- // Q(x) = 1 + x * (C + x * P(x)), where the degree 4 polynomial P(x) is evaluated in
1277
- // single precision, and the remaining steps are evaluated with extra precision using
1278
- // double word arithmetic. C is an extra precise constant stored as a double word.
1279
- //
1280
- // The polynomial coefficients were calculated using Sollya commands:
1281
- // > n = 6;
1282
- // > f = 2^x;
1283
- // > interval = [-0.5;0.5];
1284
- // > p = fpminimax(f,n,[|1,double,single...|],interval,relative,floating);
1285
-
1286
- const Packet p4 = pset1<Packet>(1.539513905e-4f);
1287
- const Packet p3 = pset1<Packet>(1.340007293e-3f);
1288
- const Packet p2 = pset1<Packet>(9.618283249e-3f);
1289
- const Packet p1 = pset1<Packet>(5.550328270e-2f);
1290
- const Packet p0 = pset1<Packet>(0.2402264923f);
1291
-
1292
- const Packet C_hi = pset1<Packet>(0.6931471825f);
1293
- const Packet C_lo = pset1<Packet>(2.36836577e-08f);
1294
- const Packet one = pset1<Packet>(1.0f);
1295
-
1296
- // Evaluate P(x) in working precision.
1297
- // We evaluate even and odd parts of the polynomial separately
1298
- // to gain some instruction level parallelism.
1299
- Packet x2 = pmul(x,x);
1300
- Packet p_even = pmadd(p4, x2, p2);
1301
- Packet p_odd = pmadd(p3, x2, p1);
1302
- p_even = pmadd(p_even, x2, p0);
1303
- Packet p = pmadd(p_odd, x, p_even);
1304
-
1305
- // Evaluate the remaining terms of Q(x) with extra precision using
1306
- // double word arithmetic.
1307
- Packet p_hi, p_lo;
1308
- // x * p(x)
1309
- twoprod(p, x, p_hi, p_lo);
1310
- // C + x * p(x)
1311
- Packet q1_hi, q1_lo;
1312
- twosum(p_hi, p_lo, C_hi, C_lo, q1_hi, q1_lo);
1313
- // x * (C + x * p(x))
1314
- Packet q2_hi, q2_lo;
1315
- twoprod(q1_hi, q1_lo, x, q2_hi, q2_lo);
1316
- // 1 + x * (C + x * p(x))
1317
- Packet q3_hi, q3_lo;
1318
- // Since |q2_hi| <= sqrt(2)-1 < 1, we can use fast_twosum
1319
- // for adding it to unity here.
1320
- fast_twosum(one, q2_hi, q3_hi, q3_lo);
1321
- return padd(q3_hi, padd(q2_lo, q3_lo));
1322
- }
1323
- };
1324
-
1325
- // in [-0.5;0.5] with a relative accuracy of 1 ulp.
1326
- // The minimax polynomial used was calculated using the Sollya tool.
1327
- // See sollya.org.
1328
- template <>
1329
- struct fast_accurate_exp2<double> {
1330
- template <typename Packet>
1331
- EIGEN_STRONG_INLINE
1332
- Packet operator()(const Packet& x) {
1333
- // This function approximates exp2(x) by a degree 10 polynomial of the form
1334
- // Q(x) = 1 + x * (C + x * P(x)), where the degree 8 polynomial P(x) is evaluated in
1335
- // single precision, and the remaining steps are evaluated with extra precision using
1336
- // double word arithmetic. C is an extra precise constant stored as a double word.
1337
- //
1338
- // The polynomial coefficients were calculated using Sollya commands:
1339
- // > n = 11;
1340
- // > f = 2^x;
1341
- // > interval = [-0.5;0.5];
1342
- // > p = fpminimax(f,n,[|1,DD,double...|],interval,relative,floating);
1343
-
1344
- const Packet p9 = pset1<Packet>(4.431642109085495276e-10);
1345
- const Packet p8 = pset1<Packet>(7.073829923303358410e-9);
1346
- const Packet p7 = pset1<Packet>(1.017822306737031311e-7);
1347
- const Packet p6 = pset1<Packet>(1.321543498017646657e-6);
1348
- const Packet p5 = pset1<Packet>(1.525273342728892877e-5);
1349
- const Packet p4 = pset1<Packet>(1.540353045780084423e-4);
1350
- const Packet p3 = pset1<Packet>(1.333355814685869807e-3);
1351
- const Packet p2 = pset1<Packet>(9.618129107593478832e-3);
1352
- const Packet p1 = pset1<Packet>(5.550410866481961247e-2);
1353
- const Packet p0 = pset1<Packet>(0.240226506959101332);
1354
- const Packet C_hi = pset1<Packet>(0.693147180559945286);
1355
- const Packet C_lo = pset1<Packet>(4.81927865669806721e-17);
1356
- const Packet one = pset1<Packet>(1.0);
1357
-
1358
- // Evaluate P(x) in working precision.
1359
- // We evaluate even and odd parts of the polynomial separately
1360
- // to gain some instruction level parallelism.
1361
- Packet x2 = pmul(x,x);
1362
- Packet p_even = pmadd(p8, x2, p6);
1363
- Packet p_odd = pmadd(p9, x2, p7);
1364
- p_even = pmadd(p_even, x2, p4);
1365
- p_odd = pmadd(p_odd, x2, p5);
1366
- p_even = pmadd(p_even, x2, p2);
1367
- p_odd = pmadd(p_odd, x2, p3);
1368
- p_even = pmadd(p_even, x2, p0);
1369
- p_odd = pmadd(p_odd, x2, p1);
1370
- Packet p = pmadd(p_odd, x, p_even);
1371
-
1372
- // Evaluate the remaining terms of Q(x) with extra precision using
1373
- // double word arithmetic.
1374
- Packet p_hi, p_lo;
1375
- // x * p(x)
1376
- twoprod(p, x, p_hi, p_lo);
1377
- // C + x * p(x)
1378
- Packet q1_hi, q1_lo;
1379
- twosum(p_hi, p_lo, C_hi, C_lo, q1_hi, q1_lo);
1380
- // x * (C + x * p(x))
1381
- Packet q2_hi, q2_lo;
1382
- twoprod(q1_hi, q1_lo, x, q2_hi, q2_lo);
1383
- // 1 + x * (C + x * p(x))
1384
- Packet q3_hi, q3_lo;
1385
- // Since |q2_hi| <= sqrt(2)-1 < 1, we can use fast_twosum
1386
- // for adding it to unity here.
1387
- fast_twosum(one, q2_hi, q3_hi, q3_lo);
1388
- return padd(q3_hi, padd(q2_lo, q3_lo));
1389
- }
1390
- };
1391
-
1392
2121
  // This function implements the non-trivial case of pow(x,y) where x is
1393
2122
  // positive and y is (possibly) non-integer.
1394
2123
  // Formally, pow(x,y) = exp2(y * log2(x)), where exp2(x) is shorthand for 2^x.
1395
2124
  // TODO(rmlarsen): We should probably add this as a packet up 'ppow', to make it
1396
2125
  // easier to specialize or turn off for specific types and/or backends.x
1397
2126
  template <typename Packet>
1398
- EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) {
2127
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) {
1399
2128
  typedef typename unpacket_traits<Packet>::type Scalar;
1400
2129
  // Split x into exponent e_x and mantissa m_x.
1401
2130
  Packet e_x;
1402
2131
  Packet m_x = pfrexp(x, e_x);
1403
2132
 
1404
2133
  // Adjust m_x to lie in [1/sqrt(2):sqrt(2)] to minimize absolute error in log2(m_x).
1405
- EIGEN_CONSTEXPR Scalar sqrt_half = Scalar(0.70710678118654752440);
2134
+ constexpr Scalar sqrt_half = Scalar(0.70710678118654752440);
1406
2135
  const Packet m_x_scale_mask = pcmp_lt(m_x, pset1<Packet>(sqrt_half));
1407
2136
  m_x = pselect(m_x_scale_mask, pmul(pset1<Packet>(Scalar(2)), m_x), m_x);
1408
2137
  e_x = pselect(m_x_scale_mask, psub(e_x, pset1<Packet>(Scalar(1))), e_x);
@@ -1435,215 +2164,471 @@ EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) {
1435
2164
 
1436
2165
  // We now have an accurate split of f = n_z + r_z and can compute
1437
2166
  // x^y = 2**{n_z + r_z) = exp2(r_z) * 2**{n_z}.
1438
- // Since r_z is in [-0.5;0.5], we compute the first factor to high accuracy
1439
- // using a specialized algorithm. Multiplication by the second factor can
1440
- // be done exactly using pldexp(), since it is an integer power of 2.
1441
- const Packet e_r = fast_accurate_exp2<Scalar>()(r_z);
1442
- return pldexp(e_r, n_z);
2167
+ // Multiplication by the second factor can be done exactly using pldexp(), since
2168
+ // it is an integer power of 2.
2169
+ const Packet e_r = generic_exp2(r_z);
2170
+
2171
+ // Since we know that e_r is in [1/sqrt(2); sqrt(2)], we can use the fast version
2172
+ // of pldexp to multiply by 2**{n_z} when |n_z| is sufficiently small.
2173
+ constexpr Scalar kPldExpThresh = std::numeric_limits<Scalar>::max_exponent - 2;
2174
+ const Packet pldexp_fast_unsafe = pcmp_lt(pset1<Packet>(kPldExpThresh), pabs(n_z));
2175
+ if (predux_any(pldexp_fast_unsafe)) {
2176
+ return pldexp(e_r, n_z);
2177
+ }
2178
+ return pldexp_fast(e_r, n_z);
1443
2179
  }
1444
2180
 
1445
2181
  // Generic implementation of pow(x,y).
1446
- template<typename Packet>
1447
- EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
1448
- EIGEN_UNUSED
1449
- Packet generic_pow(const Packet& x, const Packet& y) {
2182
+ template <typename Packet>
2183
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS std::enable_if_t<!is_scalar<Packet>::value, Packet> generic_pow(
2184
+ const Packet& x, const Packet& y) {
1450
2185
  typedef typename unpacket_traits<Packet>::type Scalar;
1451
2186
 
1452
- const Packet cst_pos_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
2187
+ const Packet cst_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
1453
2188
  const Packet cst_zero = pset1<Packet>(Scalar(0));
1454
2189
  const Packet cst_one = pset1<Packet>(Scalar(1));
1455
2190
  const Packet cst_nan = pset1<Packet>(NumTraits<Scalar>::quiet_NaN());
1456
2191
 
1457
- const Packet abs_x = pabs(x);
2192
+ const Packet x_abs = pabs(x);
2193
+ Packet pow = generic_pow_impl(x_abs, y);
2194
+
2195
+ // In the following we enforce the special case handling prescribed in
2196
+ // https://en.cppreference.com/w/cpp/numeric/math/pow.
2197
+
1458
2198
  // Predicates for sign and magnitude of x.
2199
+ const Packet x_is_negative = pcmp_lt(x, cst_zero);
1459
2200
  const Packet x_is_zero = pcmp_eq(x, cst_zero);
1460
- const Packet x_is_neg = pcmp_lt(x, cst_zero);
1461
- const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
1462
- const Packet abs_x_is_one = pcmp_eq(abs_x, cst_one);
1463
- const Packet abs_x_is_gt_one = pcmp_lt(cst_one, abs_x);
1464
- const Packet abs_x_is_lt_one = pcmp_lt(abs_x, cst_one);
1465
- const Packet x_is_one = pandnot(abs_x_is_one, x_is_neg);
1466
- const Packet x_is_neg_one = pand(abs_x_is_one, x_is_neg);
1467
- const Packet x_is_nan = pandnot(ptrue(x), pcmp_eq(x, x));
2201
+ const Packet x_is_one = pcmp_eq(x, cst_one);
2202
+ const Packet x_has_signbit = psignbit(x);
2203
+ const Packet x_abs_gt_one = pcmp_lt(cst_one, x_abs);
2204
+ const Packet x_abs_is_inf = pcmp_eq(x_abs, cst_inf);
1468
2205
 
1469
2206
  // Predicates for sign and magnitude of y.
1470
- const Packet y_is_one = pcmp_eq(y, cst_one);
2207
+ const Packet y_abs = pabs(y);
2208
+ const Packet y_abs_is_inf = pcmp_eq(y_abs, cst_inf);
2209
+ const Packet y_is_negative = pcmp_lt(y, cst_zero);
1471
2210
  const Packet y_is_zero = pcmp_eq(y, cst_zero);
1472
- const Packet y_is_neg = pcmp_lt(y, cst_zero);
1473
- const Packet y_is_pos = pandnot(ptrue(y), por(y_is_zero, y_is_neg));
1474
- const Packet y_is_nan = pandnot(ptrue(y), pcmp_eq(y, y));
1475
- const Packet abs_y_is_inf = pcmp_eq(pabs(y), cst_pos_inf);
1476
- EIGEN_CONSTEXPR Scalar huge_exponent =
1477
- (NumTraits<Scalar>::max_exponent() * Scalar(EIGEN_LN2)) /
1478
- NumTraits<Scalar>::epsilon();
1479
- const Packet abs_y_is_huge = pcmp_le(pset1<Packet>(huge_exponent), pabs(y));
1480
-
1481
- // Predicates for whether y is integer and/or even.
1482
- const Packet y_is_int = pcmp_eq(pfloor(y), y);
2211
+ const Packet y_is_one = pcmp_eq(y, cst_one);
2212
+ // Predicates for whether y is integer and odd/even.
2213
+ const Packet y_is_int = pandnot(pcmp_eq(pfloor(y), y), y_abs_is_inf);
1483
2214
  const Packet y_div_2 = pmul(y, pset1<Packet>(Scalar(0.5)));
1484
2215
  const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2);
2216
+ const Packet y_is_odd_int = pandnot(y_is_int, y_is_even);
2217
+ // Smallest exponent for which (1 + epsilon) overflows to infinity.
2218
+ constexpr Scalar huge_exponent =
2219
+ (NumTraits<Scalar>::max_exponent() * Scalar(EIGEN_LN2)) / NumTraits<Scalar>::epsilon();
2220
+ const Packet y_abs_is_huge = pcmp_le(pset1<Packet>(huge_exponent), y_abs);
2221
+
2222
+ // * pow(base, exp) returns NaN if base is finite and negative
2223
+ // and exp is finite and non-integer.
2224
+ pow = pselect(pandnot(x_is_negative, y_is_int), cst_nan, pow);
2225
+
2226
+ // * pow(±0, exp), where exp is negative, finite, and is an even integer or
2227
+ // a non-integer, returns +∞
2228
+ // * pow(±0, exp), where exp is positive non-integer or a positive even
2229
+ // integer, returns +0
2230
+ // * pow(+0, exp), where exp is a negative odd integer, returns +∞
2231
+ // * pow(-0, exp), where exp is a negative odd integer, returns -∞
2232
+ // * pow(+0, exp), where exp is a positive odd integer, returns +0
2233
+ // * pow(-0, exp), where exp is a positive odd integer, returns -0
2234
+ // Sign is flipped by the rule below.
2235
+ pow = pselect(x_is_zero, pselect(y_is_negative, cst_inf, cst_zero), pow);
2236
+
2237
+ // pow(base, exp) returns -pow(abs(base), exp) if base has the sign bit set,
2238
+ // and exp is an odd integer exponent.
2239
+ pow = pselect(pand(x_has_signbit, y_is_odd_int), pnegate(pow), pow);
2240
+
2241
+ // * pow(base, -∞) returns +∞ for any |base|<1
2242
+ // * pow(base, -∞) returns +0 for any |base|>1
2243
+ // * pow(base, +∞) returns +0 for any |base|<1
2244
+ // * pow(base, +∞) returns +∞ for any |base|>1
2245
+ // * pow(±0, -∞) returns +∞
2246
+ // * pow(-1, +-∞) = 1
2247
+ Packet inf_y_val = pselect(por(pand(y_is_negative, x_is_zero), pxor(y_is_negative, x_abs_gt_one)), cst_inf, cst_zero);
2248
+ inf_y_val = pselect(pcmp_eq(x, pset1<Packet>(Scalar(-1.0))), cst_one, inf_y_val);
2249
+ pow = pselect(y_abs_is_huge, inf_y_val, pow);
2250
+
2251
+ // * pow(+∞, exp) returns +0 for any negative exp
2252
+ // * pow(+∞, exp) returns +∞ for any positive exp
2253
+ // * pow(-∞, exp) returns -0 if exp is a negative odd integer.
2254
+ // * pow(-∞, exp) returns +0 if exp is a negative non-integer or negative
2255
+ // even integer.
2256
+ // * pow(-∞, exp) returns -∞ if exp is a positive odd integer.
2257
+ // * pow(-∞, exp) returns +∞ if exp is a positive non-integer or positive
2258
+ // even integer.
2259
+ auto x_pos_inf_value = pselect(y_is_negative, cst_zero, cst_inf);
2260
+ auto x_neg_inf_value = pselect(y_is_odd_int, pnegate(x_pos_inf_value), x_pos_inf_value);
2261
+ pow = pselect(x_abs_is_inf, pselect(x_is_negative, x_neg_inf_value, x_pos_inf_value), pow);
2262
+
2263
+ // All cases of NaN inputs return NaN, except the two below.
2264
+ pow = pselect(por(pisnan(x), pisnan(y)), cst_nan, pow);
2265
+
2266
+ // * pow(base, 1) returns base.
2267
+ // * pow(base, +/-0) returns 1, regardless of base, even NaN.
2268
+ // * pow(+1, exp) returns 1, regardless of exponent, even NaN.
2269
+ pow = pselect(y_is_one, x, pselect(por(x_is_one, y_is_zero), cst_one, pow));
2270
+
2271
+ return pow;
2272
+ }
1485
2273
 
1486
- // Predicates encoding special cases for the value of pow(x,y)
1487
- const Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf),
1488
- y_is_int),
1489
- abs_y_is_inf);
1490
- const Packet pow_is_one = por(por(x_is_one, y_is_zero),
1491
- pand(x_is_neg_one,
1492
- por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x))));
1493
- const Packet pow_is_nan = por(invalid_negative_x, por(x_is_nan, y_is_nan));
1494
- const Packet pow_is_zero = por(por(por(pand(x_is_zero, y_is_pos),
1495
- pand(abs_x_is_inf, y_is_neg)),
1496
- pand(pand(abs_x_is_lt_one, abs_y_is_huge),
1497
- y_is_pos)),
1498
- pand(pand(abs_x_is_gt_one, abs_y_is_huge),
1499
- y_is_neg));
1500
- const Packet pow_is_inf = por(por(por(pand(x_is_zero, y_is_neg),
1501
- pand(abs_x_is_inf, y_is_pos)),
1502
- pand(pand(abs_x_is_lt_one, abs_y_is_huge),
1503
- y_is_neg)),
1504
- pand(pand(abs_x_is_gt_one, abs_y_is_huge),
1505
- y_is_pos));
1506
-
1507
- // General computation of pow(x,y) for positive x or negative x and integer y.
1508
- const Packet negate_pow_abs = pandnot(x_is_neg, y_is_even);
1509
- const Packet pow_abs = generic_pow_impl(abs_x, y);
1510
- return pselect(y_is_one, x,
1511
- pselect(pow_is_one, cst_one,
1512
- pselect(pow_is_nan, cst_nan,
1513
- pselect(pow_is_inf, cst_pos_inf,
1514
- pselect(pow_is_zero, cst_zero,
1515
- pselect(negate_pow_abs, pnegate(pow_abs), pow_abs))))));
2274
+ template <typename Scalar>
2275
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS std::enable_if_t<is_scalar<Scalar>::value, Scalar> generic_pow(
2276
+ const Scalar& x, const Scalar& y) {
2277
+ return numext::pow(x, y);
1516
2278
  }
1517
2279
 
2280
+ namespace unary_pow {
1518
2281
 
2282
+ template <typename ScalarExponent, bool IsInteger = NumTraits<ScalarExponent>::IsInteger>
2283
+ struct exponent_helper {
2284
+ using safe_abs_type = ScalarExponent;
2285
+ static constexpr ScalarExponent one_half = ScalarExponent(0.5);
2286
+ // these routines assume that exp is an integer stored as a floating point type
2287
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarExponent safe_abs(const ScalarExponent& exp) {
2288
+ return numext::abs(exp);
2289
+ }
2290
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool is_odd(const ScalarExponent& exp) {
2291
+ eigen_assert(((numext::isfinite)(exp) && exp == numext::floor(exp)) && "exp must be an integer");
2292
+ ScalarExponent exp_div_2 = exp * one_half;
2293
+ ScalarExponent floor_exp_div_2 = numext::floor(exp_div_2);
2294
+ return exp_div_2 != floor_exp_div_2;
2295
+ }
2296
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarExponent floor_div_two(const ScalarExponent& exp) {
2297
+ ScalarExponent exp_div_2 = exp * one_half;
2298
+ return numext::floor(exp_div_2);
2299
+ }
2300
+ };
1519
2301
 
1520
- /* polevl (modified for Eigen)
1521
- *
1522
- * Evaluate polynomial
1523
- *
1524
- *
1525
- *
1526
- * SYNOPSIS:
1527
- *
1528
- * int N;
1529
- * Scalar x, y, coef[N+1];
1530
- *
1531
- * y = polevl<decltype(x), N>( x, coef);
1532
- *
1533
- *
1534
- *
1535
- * DESCRIPTION:
1536
- *
1537
- * Evaluates polynomial of degree N:
1538
- *
1539
- *
1540
- * y = C_0 + C_1 x + C_2 x^2 + ... + C_N x^N
1541
- *
1542
- *
1543
- * Coefficients are stored in reverse order:
1544
- *
1545
- * coef[0] = C_N, ..., coef[N] = C_0.
1546
- *
1547
- *
1548
- * The function p1evl() assumes that coef[N] = 1.0 and is
1549
- * omitted from the array. Its calling arguments are
1550
- * otherwise the same as polevl().
1551
- *
1552
- *
1553
- * The Eigen implementation is templatized. For best speed, store
1554
- * coef as a const array (constexpr), e.g.
1555
- *
1556
- * const double coef[] = {1.0, 2.0, 3.0, ...};
1557
- *
1558
- */
1559
- template <typename Packet, int N>
1560
- struct ppolevl {
1561
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
1562
- EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
1563
- return pmadd(ppolevl<Packet, N-1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
2302
+ template <typename ScalarExponent>
2303
+ struct exponent_helper<ScalarExponent, true> {
2304
+ // if `exp` is a signed integer type, cast it to its unsigned counterpart to safely store its absolute value
2305
+ // consider the (rare) case where `exp` is an int32_t: abs(-2147483648) != 2147483648
2306
+ using safe_abs_type = typename numext::get_integer_by_size<sizeof(ScalarExponent)>::unsigned_type;
2307
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE safe_abs_type safe_abs(const ScalarExponent& exp) {
2308
+ ScalarExponent mask = numext::signbit(exp);
2309
+ safe_abs_type result = safe_abs_type(exp ^ mask);
2310
+ return result + safe_abs_type(ScalarExponent(1) & mask);
2311
+ }
2312
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool is_odd(const safe_abs_type& exp) {
2313
+ return exp % safe_abs_type(2) != safe_abs_type(0);
2314
+ }
2315
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE safe_abs_type floor_div_two(const safe_abs_type& exp) {
2316
+ return exp >> safe_abs_type(1);
1564
2317
  }
1565
2318
  };
1566
2319
 
1567
- template <typename Packet>
1568
- struct ppolevl<Packet, 0> {
1569
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
1570
- EIGEN_UNUSED_VARIABLE(x);
1571
- return pset1<Packet>(coeff[0]);
2320
+ template <typename Packet, typename ScalarExponent,
2321
+ bool ReciprocateIfExponentIsNegative =
2322
+ !NumTraits<typename unpacket_traits<Packet>::type>::IsInteger && NumTraits<ScalarExponent>::IsSigned>
2323
+ struct reciprocate {
2324
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
2325
+ using Scalar = typename unpacket_traits<Packet>::type;
2326
+ const Packet cst_pos_one = pset1<Packet>(Scalar(1));
2327
+ return exponent < 0 ? pdiv(cst_pos_one, x) : x;
1572
2328
  }
1573
2329
  };
1574
2330
 
1575
- /* chbevl (modified for Eigen)
1576
- *
1577
- * Evaluate Chebyshev series
1578
- *
1579
- *
1580
- *
1581
- * SYNOPSIS:
1582
- *
1583
- * int N;
1584
- * Scalar x, y, coef[N], chebevl();
1585
- *
1586
- * y = chbevl( x, coef, N );
1587
- *
1588
- *
1589
- *
1590
- * DESCRIPTION:
1591
- *
1592
- * Evaluates the series
1593
- *
1594
- *
1595
- *
1596
- * y = \sum'_{i=0}^{N-1} coef[i] T_i(x/2)
1597
- *
1598
- *
1599
- *
1600
- * of Chebyshev polynomials Ti at argument x/2.
1601
- *
1602
- * Coefficients are stored in reverse order, i.e. the zero
1603
- * order term is last in the array. Note N is the number of
1604
- * coefficients, not the order.
1605
- *
1606
- * If coefficients are for the interval a to b, x must
1607
- * have been transformed to x -> 2(2x - b - a)/(b-a) before
1608
- * entering the routine. This maps x from (a, b) to (-1, 1),
1609
- * over which the Chebyshev polynomials are defined.
1610
- *
1611
- * If the coefficients are for the inverted interval, in
1612
- * which (a, b) is mapped to (1/b, 1/a), the transformation
1613
- * required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity,
1614
- * this becomes x -> 4a/x - 1.
1615
- *
1616
- *
1617
- *
1618
- * SPEED:
1619
- *
1620
- * Taking advantage of the recurrence properties of the
1621
- * Chebyshev polynomials, the routine requires one more
1622
- * addition per loop than evaluating a nested polynomial of
1623
- * the same degree.
1624
- *
1625
- */
2331
+ template <typename Packet, typename ScalarExponent>
2332
+ struct reciprocate<Packet, ScalarExponent, false> {
2333
+ // pdiv not defined, nor necessary for integer base types
2334
+ // if the exponent is unsigned, then the exponent cannot be negative
2335
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent&) { return x; }
2336
+ };
1626
2337
 
1627
- template <typename Packet, int N>
1628
- struct pchebevl {
1629
- EIGEN_DEVICE_FUNC
1630
- static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits<Packet>::type coef[]) {
1631
- typedef typename unpacket_traits<Packet>::type Scalar;
1632
- Packet b0 = pset1<Packet>(coef[0]);
1633
- Packet b1 = pset1<Packet>(static_cast<Scalar>(0.f));
1634
- Packet b2;
2338
+ template <typename Packet, typename ScalarExponent>
2339
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet int_pow(const Packet& x, const ScalarExponent& exponent) {
2340
+ using Scalar = typename unpacket_traits<Packet>::type;
2341
+ using ExponentHelper = exponent_helper<ScalarExponent>;
2342
+ using AbsExponentType = typename ExponentHelper::safe_abs_type;
2343
+ const Packet cst_pos_one = pset1<Packet>(Scalar(1));
2344
+ if (exponent == ScalarExponent(0)) return cst_pos_one;
2345
+
2346
+ Packet result = reciprocate<Packet, ScalarExponent>::run(x, exponent);
2347
+ Packet y = cst_pos_one;
2348
+ AbsExponentType m = ExponentHelper::safe_abs(exponent);
2349
+
2350
+ while (m > 1) {
2351
+ bool odd = ExponentHelper::is_odd(m);
2352
+ if (odd) y = pmul(y, result);
2353
+ result = pmul(result, result);
2354
+ m = ExponentHelper::floor_div_two(m);
2355
+ }
1635
2356
 
1636
- for (int i = 1; i < N; i++) {
1637
- b2 = b1;
1638
- b1 = b0;
1639
- b0 = psub(pmadd(x, b1, pset1<Packet>(coef[i])), b2);
2357
+ return pmul(y, result);
2358
+ }
2359
+
2360
+ template <typename Packet>
2361
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!is_scalar<Packet>::value, Packet> gen_pow(
2362
+ const Packet& x, const typename unpacket_traits<Packet>::type& exponent) {
2363
+ const Packet exponent_packet = pset1<Packet>(exponent);
2364
+ return generic_pow_impl(x, exponent_packet);
2365
+ }
2366
+
2367
+ template <typename Scalar>
2368
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<is_scalar<Scalar>::value, Scalar> gen_pow(
2369
+ const Scalar& x, const Scalar& exponent) {
2370
+ return numext::pow(x, exponent);
2371
+ }
2372
+
2373
+ template <typename Packet, typename ScalarExponent>
2374
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_nonint_nonint_errors(const Packet& x, const Packet& powx,
2375
+ const ScalarExponent& exponent) {
2376
+ using Scalar = typename unpacket_traits<Packet>::type;
2377
+
2378
+ // non-integer base and exponent case
2379
+ const Packet cst_pos_zero = pzero(x);
2380
+ const Packet cst_pos_one = pset1<Packet>(Scalar(1));
2381
+ const Packet cst_pos_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
2382
+ const Packet cst_true = ptrue<Packet>(x);
2383
+
2384
+ const bool exponent_is_not_fin = !(numext::isfinite)(exponent);
2385
+ const bool exponent_is_neg = exponent < ScalarExponent(0);
2386
+ const bool exponent_is_pos = exponent > ScalarExponent(0);
2387
+
2388
+ const Packet exp_is_not_fin = exponent_is_not_fin ? cst_true : cst_pos_zero;
2389
+ const Packet exp_is_neg = exponent_is_neg ? cst_true : cst_pos_zero;
2390
+ const Packet exp_is_pos = exponent_is_pos ? cst_true : cst_pos_zero;
2391
+ const Packet exp_is_inf = pand(exp_is_not_fin, por(exp_is_neg, exp_is_pos));
2392
+ const Packet exp_is_nan = pandnot(exp_is_not_fin, por(exp_is_neg, exp_is_pos));
2393
+
2394
+ const Packet x_is_le_zero = pcmp_le(x, cst_pos_zero);
2395
+ const Packet x_is_ge_zero = pcmp_le(cst_pos_zero, x);
2396
+ const Packet x_is_zero = pand(x_is_le_zero, x_is_ge_zero);
2397
+
2398
+ const Packet abs_x = pabs(x);
2399
+ const Packet abs_x_is_le_one = pcmp_le(abs_x, cst_pos_one);
2400
+ const Packet abs_x_is_ge_one = pcmp_le(cst_pos_one, abs_x);
2401
+ const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
2402
+ const Packet abs_x_is_one = pand(abs_x_is_le_one, abs_x_is_ge_one);
2403
+
2404
+ Packet pow_is_inf_if_exp_is_neg = por(x_is_zero, pand(abs_x_is_le_one, exp_is_inf));
2405
+ Packet pow_is_inf_if_exp_is_pos = por(abs_x_is_inf, pand(abs_x_is_ge_one, exp_is_inf));
2406
+ Packet pow_is_one = pand(abs_x_is_one, por(exp_is_inf, x_is_ge_zero));
2407
+
2408
+ Packet result = powx;
2409
+ result = por(x_is_le_zero, result);
2410
+ result = pselect(pow_is_inf_if_exp_is_neg, pand(cst_pos_inf, exp_is_neg), result);
2411
+ result = pselect(pow_is_inf_if_exp_is_pos, pand(cst_pos_inf, exp_is_pos), result);
2412
+ result = por(exp_is_nan, result);
2413
+ result = pselect(pow_is_one, cst_pos_one, result);
2414
+ return result;
2415
+ }
2416
+
2417
+ template <typename Packet, typename ScalarExponent,
2418
+ std::enable_if_t<NumTraits<typename unpacket_traits<Packet>::type>::IsSigned, bool> = true>
2419
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_negative_exponent(const Packet& x, const ScalarExponent& exponent) {
2420
+ using Scalar = typename unpacket_traits<Packet>::type;
2421
+
2422
+ // signed integer base, signed integer exponent case
2423
+
2424
+ // This routine handles negative exponents.
2425
+ // The return value is either 0, 1, or -1.
2426
+ const Packet cst_pos_one = pset1<Packet>(Scalar(1));
2427
+ const bool exponent_is_odd = exponent % ScalarExponent(2) != ScalarExponent(0);
2428
+ const Packet exp_is_odd = exponent_is_odd ? ptrue<Packet>(x) : pzero<Packet>(x);
2429
+
2430
+ const Packet abs_x = pabs(x);
2431
+ const Packet abs_x_is_one = pcmp_eq(abs_x, cst_pos_one);
2432
+
2433
+ Packet result = pselect(exp_is_odd, x, abs_x);
2434
+ result = pselect(abs_x_is_one, result, pzero<Packet>(x));
2435
+ return result;
2436
+ }
2437
+
2438
+ template <typename Packet, typename ScalarExponent,
2439
+ std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsSigned, bool> = true>
2440
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_negative_exponent(const Packet& x, const ScalarExponent&) {
2441
+ using Scalar = typename unpacket_traits<Packet>::type;
2442
+
2443
+ // unsigned integer base, signed integer exponent case
2444
+
2445
+ // This routine handles negative exponents.
2446
+ // The return value is either 0 or 1
2447
+
2448
+ const Scalar pos_one = Scalar(1);
2449
+
2450
+ const Packet cst_pos_one = pset1<Packet>(pos_one);
2451
+
2452
+ const Packet x_is_one = pcmp_eq(x, cst_pos_one);
2453
+
2454
+ return pand(x_is_one, x);
2455
+ }
2456
+
2457
+ } // end namespace unary_pow
2458
+
2459
+ template <typename Packet, typename ScalarExponent,
2460
+ bool BaseIsIntegerType = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger,
2461
+ bool ExponentIsIntegerType = NumTraits<ScalarExponent>::IsInteger,
2462
+ bool ExponentIsSigned = NumTraits<ScalarExponent>::IsSigned>
2463
+ struct unary_pow_impl;
2464
+
2465
+ template <typename Packet, typename ScalarExponent, bool ExponentIsSigned>
2466
+ struct unary_pow_impl<Packet, ScalarExponent, false, false, ExponentIsSigned> {
2467
+ typedef typename unpacket_traits<Packet>::type Scalar;
2468
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
2469
+ const bool exponent_is_integer = (numext::isfinite)(exponent) && numext::round(exponent) == exponent;
2470
+ if (exponent_is_integer) {
2471
+ // The simple recursive doubling implementation is only accurate to 3 ulps
2472
+ // for integer exponents in [-3:7]. Since this is a common case, we
2473
+ // specialize it here.
2474
+ bool use_repeated_squaring =
2475
+ (exponent <= ScalarExponent(7) && (!ExponentIsSigned || exponent >= ScalarExponent(-3)));
2476
+ return use_repeated_squaring ? unary_pow::int_pow(x, exponent) : generic_pow(x, pset1<Packet>(exponent));
2477
+ } else {
2478
+ Packet result = unary_pow::gen_pow(x, exponent);
2479
+ result = unary_pow::handle_nonint_nonint_errors(x, result, exponent);
2480
+ return result;
1640
2481
  }
2482
+ }
2483
+ };
1641
2484
 
1642
- return pmul(pset1<Packet>(static_cast<Scalar>(0.5f)), psub(b0, b2));
2485
+ template <typename Packet, typename ScalarExponent, bool ExponentIsSigned>
2486
+ struct unary_pow_impl<Packet, ScalarExponent, false, true, ExponentIsSigned> {
2487
+ typedef typename unpacket_traits<Packet>::type Scalar;
2488
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
2489
+ return unary_pow::int_pow(x, exponent);
1643
2490
  }
1644
2491
  };
1645
2492
 
1646
- } // end namespace internal
1647
- } // end namespace Eigen
2493
+ template <typename Packet, typename ScalarExponent>
2494
+ struct unary_pow_impl<Packet, ScalarExponent, true, true, true> {
2495
+ typedef typename unpacket_traits<Packet>::type Scalar;
2496
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
2497
+ if (exponent < ScalarExponent(0)) {
2498
+ return unary_pow::handle_negative_exponent(x, exponent);
2499
+ } else {
2500
+ return unary_pow::int_pow(x, exponent);
2501
+ }
2502
+ }
2503
+ };
2504
+
2505
+ template <typename Packet, typename ScalarExponent>
2506
+ struct unary_pow_impl<Packet, ScalarExponent, true, true, false> {
2507
+ typedef typename unpacket_traits<Packet>::type Scalar;
2508
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
2509
+ return unary_pow::int_pow(x, exponent);
2510
+ }
2511
+ };
2512
+
2513
+ // This function computes exp2(x) = exp(ln(2) * x).
2514
+ // To improve accuracy, the product ln(2)*x is computed using the twoprod
2515
+ // algorithm, such that ln(2) * x = p_hi + p_lo holds exactly. Then exp2(x) is
2516
+ // computed as exp2(x) = exp(p_hi) * exp(p_lo) ~= exp(p_hi) * (1 + p_lo). This
2517
+ // correction step reduces the maximum absolute error as follows:
2518
+ //
2519
+ // type | max error (simple product) | max error (twoprod) |
2520
+ // -----------------------------------------------------------
2521
+ // float | 35 ulps | 4 ulps |
2522
+ // double | 363 ulps | 110 ulps |
2523
+ //
2524
+ template <typename Packet>
2525
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_exp2(const Packet& _x) {
2526
+ typedef typename unpacket_traits<Packet>::type Scalar;
2527
+ constexpr int max_exponent = std::numeric_limits<Scalar>::max_exponent;
2528
+ constexpr int digits = std::numeric_limits<Scalar>::digits;
2529
+ constexpr Scalar max_cap = Scalar(max_exponent + 1);
2530
+ constexpr Scalar min_cap = -Scalar(max_exponent + digits - 1);
2531
+ Packet x = pmax(pmin(_x, pset1<Packet>(max_cap)), pset1<Packet>(min_cap));
2532
+ Packet p_hi, p_lo;
2533
+ twoprod(pset1<Packet>(Scalar(EIGEN_LN2)), x, p_hi, p_lo);
2534
+ Packet exp2_hi = pexp(p_hi);
2535
+ Packet exp2_lo = padd(pset1<Packet>(Scalar(1)), p_lo);
2536
+ return pmul(exp2_hi, exp2_lo);
2537
+ }
2538
+
2539
+ template <typename Packet>
2540
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_rint(const Packet& a) {
2541
+ using Scalar = typename unpacket_traits<Packet>::type;
2542
+ using IntType = typename numext::get_integer_by_size<sizeof(Scalar)>::signed_type;
2543
+ // Adds and subtracts signum(a) * 2^kMantissaBits to force rounding.
2544
+ const IntType kLimit = IntType(1) << (NumTraits<Scalar>::digits() - 1);
2545
+ const Packet cst_limit = pset1<Packet>(static_cast<Scalar>(kLimit));
2546
+ Packet abs_a = pabs(a);
2547
+ Packet sign_a = pandnot(a, abs_a);
2548
+ Packet rint_a = padd(abs_a, cst_limit);
2549
+ // Don't compile-away addition and subtraction.
2550
+ EIGEN_OPTIMIZATION_BARRIER(rint_a);
2551
+ rint_a = psub(rint_a, cst_limit);
2552
+ rint_a = por(rint_a, sign_a);
2553
+ // If greater than limit (or NaN), simply return a.
2554
+ Packet mask = pcmp_lt(abs_a, cst_limit);
2555
+ Packet result = pselect(mask, rint_a, a);
2556
+ return result;
2557
+ }
2558
+
2559
+ template <typename Packet>
2560
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_floor(const Packet& a) {
2561
+ using Scalar = typename unpacket_traits<Packet>::type;
2562
+ const Packet cst_1 = pset1<Packet>(Scalar(1));
2563
+ Packet rint_a = generic_rint(a);
2564
+ // if a < rint(a), then rint(a) == ceil(a)
2565
+ Packet mask = pcmp_lt(a, rint_a);
2566
+ Packet offset = pand(cst_1, mask);
2567
+ Packet result = psub(rint_a, offset);
2568
+ return result;
2569
+ }
2570
+
2571
+ template <typename Packet>
2572
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_ceil(const Packet& a) {
2573
+ using Scalar = typename unpacket_traits<Packet>::type;
2574
+ const Packet cst_1 = pset1<Packet>(Scalar(1));
2575
+ const Packet sign_mask = pset1<Packet>(static_cast<Scalar>(-0.0));
2576
+ Packet rint_a = generic_rint(a);
2577
+ // if rint(a) < a, then rint(a) == floor(a)
2578
+ Packet mask = pcmp_lt(rint_a, a);
2579
+ Packet offset = pand(cst_1, mask);
2580
+ Packet result = padd(rint_a, offset);
2581
+ // Signed zero must remain signed (e.g. ceil(-0.02) == -0).
2582
+ result = por(result, pand(sign_mask, a));
2583
+ return result;
2584
+ }
2585
+
2586
+ template <typename Packet>
2587
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_trunc(const Packet& a) {
2588
+ Packet abs_a = pabs(a);
2589
+ Packet sign_a = pandnot(a, abs_a);
2590
+ Packet floor_abs_a = generic_floor(abs_a);
2591
+ Packet result = por(floor_abs_a, sign_a);
2592
+ return result;
2593
+ }
2594
+
2595
+ template <typename Packet>
2596
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_round(const Packet& a) {
2597
+ using Scalar = typename unpacket_traits<Packet>::type;
2598
+ const Packet cst_half = pset1<Packet>(Scalar(0.5));
2599
+ const Packet cst_1 = pset1<Packet>(Scalar(1));
2600
+ Packet abs_a = pabs(a);
2601
+ Packet sign_a = pandnot(a, abs_a);
2602
+ Packet floor_abs_a = generic_floor(abs_a);
2603
+ Packet diff = psub(abs_a, floor_abs_a);
2604
+ Packet mask = pcmp_le(cst_half, diff);
2605
+ Packet offset = pand(cst_1, mask);
2606
+ Packet result = padd(floor_abs_a, offset);
2607
+ result = por(result, sign_a);
2608
+ return result;
2609
+ }
2610
+
2611
+ template <typename Packet>
2612
+ struct nearest_integer_packetop_impl<Packet, /*IsScalar*/ false, /*IsInteger*/ false> {
2613
+ using Scalar = typename unpacket_traits<Packet>::type;
2614
+ static_assert(packet_traits<Scalar>::HasRound, "Generic nearest integer functions are disabled for this type.");
2615
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_floor(const Packet& x) { return generic_floor(x); }
2616
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_ceil(const Packet& x) { return generic_ceil(x); }
2617
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_rint(const Packet& x) { return generic_rint(x); }
2618
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_round(const Packet& x) { return generic_round(x); }
2619
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_trunc(const Packet& x) { return generic_trunc(x); }
2620
+ };
2621
+
2622
+ template <typename Packet>
2623
+ struct nearest_integer_packetop_impl<Packet, /*IsScalar*/ false, /*IsInteger*/ true> {
2624
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_floor(const Packet& x) { return x; }
2625
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_ceil(const Packet& x) { return x; }
2626
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_rint(const Packet& x) { return x; }
2627
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_round(const Packet& x) { return x; }
2628
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_trunc(const Packet& x) { return x; }
2629
+ };
2630
+
2631
+ } // end namespace internal
2632
+ } // end namespace Eigen
1648
2633
 
1649
- #endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
2634
+ #endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H