tomoto 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (369)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/README.md +8 -10
  4. data/ext/tomoto/ct.cpp +11 -11
  5. data/ext/tomoto/dmr.cpp +14 -13
  6. data/ext/tomoto/dt.cpp +14 -14
  7. data/ext/tomoto/extconf.rb +7 -5
  8. data/ext/tomoto/gdmr.cpp +7 -7
  9. data/ext/tomoto/hdp.cpp +9 -9
  10. data/ext/tomoto/hlda.cpp +13 -13
  11. data/ext/tomoto/hpa.cpp +5 -5
  12. data/ext/tomoto/lda.cpp +42 -39
  13. data/ext/tomoto/llda.cpp +6 -6
  14. data/ext/tomoto/mglda.cpp +15 -15
  15. data/ext/tomoto/pa.cpp +6 -6
  16. data/ext/tomoto/plda.cpp +6 -6
  17. data/ext/tomoto/slda.cpp +8 -8
  18. data/ext/tomoto/{ext.cpp → tomoto.cpp} +8 -8
  19. data/ext/tomoto/utils.h +16 -70
  20. data/lib/tomoto/version.rb +1 -1
  21. data/lib/tomoto.rb +5 -1
  22. data/vendor/EigenRand/EigenRand/Core.h +10 -10
  23. data/vendor/EigenRand/EigenRand/Dists/Basic.h +208 -9
  24. data/vendor/EigenRand/EigenRand/Dists/Discrete.h +52 -31
  25. data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +9 -8
  26. data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +28 -21
  27. data/vendor/EigenRand/EigenRand/EigenRand +11 -6
  28. data/vendor/EigenRand/EigenRand/Macro.h +13 -7
  29. data/vendor/EigenRand/EigenRand/MorePacketMath.h +348 -740
  30. data/vendor/EigenRand/EigenRand/MvDists/Multinomial.h +5 -3
  31. data/vendor/EigenRand/EigenRand/MvDists/MvNormal.h +9 -3
  32. data/vendor/EigenRand/EigenRand/PacketFilter.h +11 -253
  33. data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +21 -47
  34. data/vendor/EigenRand/EigenRand/RandUtils.h +50 -344
  35. data/vendor/EigenRand/EigenRand/arch/AVX/MorePacketMath.h +619 -0
  36. data/vendor/EigenRand/EigenRand/arch/AVX/PacketFilter.h +149 -0
  37. data/vendor/EigenRand/EigenRand/arch/AVX/RandUtils.h +228 -0
  38. data/vendor/EigenRand/EigenRand/arch/NEON/MorePacketMath.h +473 -0
  39. data/vendor/EigenRand/EigenRand/arch/NEON/PacketFilter.h +142 -0
  40. data/vendor/EigenRand/EigenRand/arch/NEON/RandUtils.h +126 -0
  41. data/vendor/EigenRand/EigenRand/arch/SSE/MorePacketMath.h +501 -0
  42. data/vendor/EigenRand/EigenRand/arch/SSE/PacketFilter.h +133 -0
  43. data/vendor/EigenRand/EigenRand/arch/SSE/RandUtils.h +120 -0
  44. data/vendor/EigenRand/EigenRand/doc.h +24 -12
  45. data/vendor/EigenRand/README.md +57 -4
  46. data/vendor/eigen/COPYING.APACHE +203 -0
  47. data/vendor/eigen/COPYING.BSD +1 -1
  48. data/vendor/eigen/COPYING.MINPACK +51 -52
  49. data/vendor/eigen/Eigen/Cholesky +0 -1
  50. data/vendor/eigen/Eigen/Core +112 -265
  51. data/vendor/eigen/Eigen/Eigenvalues +2 -3
  52. data/vendor/eigen/Eigen/Geometry +5 -8
  53. data/vendor/eigen/Eigen/Householder +0 -1
  54. data/vendor/eigen/Eigen/Jacobi +0 -1
  55. data/vendor/eigen/Eigen/KLUSupport +41 -0
  56. data/vendor/eigen/Eigen/LU +2 -5
  57. data/vendor/eigen/Eigen/OrderingMethods +0 -3
  58. data/vendor/eigen/Eigen/PaStiXSupport +1 -0
  59. data/vendor/eigen/Eigen/PardisoSupport +0 -0
  60. data/vendor/eigen/Eigen/QR +2 -3
  61. data/vendor/eigen/Eigen/QtAlignedMalloc +0 -1
  62. data/vendor/eigen/Eigen/SVD +0 -1
  63. data/vendor/eigen/Eigen/Sparse +0 -2
  64. data/vendor/eigen/Eigen/SparseCholesky +0 -8
  65. data/vendor/eigen/Eigen/SparseLU +4 -0
  66. data/vendor/eigen/Eigen/SparseQR +0 -1
  67. data/vendor/eigen/Eigen/src/Cholesky/LDLT.h +42 -27
  68. data/vendor/eigen/Eigen/src/Cholesky/LLT.h +39 -23
  69. data/vendor/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +90 -47
  70. data/vendor/eigen/Eigen/src/Core/ArithmeticSequence.h +413 -0
  71. data/vendor/eigen/Eigen/src/Core/Array.h +99 -11
  72. data/vendor/eigen/Eigen/src/Core/ArrayBase.h +3 -3
  73. data/vendor/eigen/Eigen/src/Core/ArrayWrapper.h +21 -21
  74. data/vendor/eigen/Eigen/src/Core/Assign.h +1 -1
  75. data/vendor/eigen/Eigen/src/Core/AssignEvaluator.h +125 -50
  76. data/vendor/eigen/Eigen/src/Core/Assign_MKL.h +10 -10
  77. data/vendor/eigen/Eigen/src/Core/BandMatrix.h +16 -16
  78. data/vendor/eigen/Eigen/src/Core/Block.h +56 -60
  79. data/vendor/eigen/Eigen/src/Core/BooleanRedux.h +29 -31
  80. data/vendor/eigen/Eigen/src/Core/CommaInitializer.h +7 -3
  81. data/vendor/eigen/Eigen/src/Core/CoreEvaluators.h +325 -272
  82. data/vendor/eigen/Eigen/src/Core/CoreIterators.h +5 -0
  83. data/vendor/eigen/Eigen/src/Core/CwiseBinaryOp.h +21 -22
  84. data/vendor/eigen/Eigen/src/Core/CwiseNullaryOp.h +153 -18
  85. data/vendor/eigen/Eigen/src/Core/CwiseUnaryOp.h +6 -6
  86. data/vendor/eigen/Eigen/src/Core/CwiseUnaryView.h +14 -10
  87. data/vendor/eigen/Eigen/src/Core/DenseBase.h +132 -42
  88. data/vendor/eigen/Eigen/src/Core/DenseCoeffsBase.h +25 -21
  89. data/vendor/eigen/Eigen/src/Core/DenseStorage.h +153 -71
  90. data/vendor/eigen/Eigen/src/Core/Diagonal.h +21 -23
  91. data/vendor/eigen/Eigen/src/Core/DiagonalMatrix.h +50 -2
  92. data/vendor/eigen/Eigen/src/Core/DiagonalProduct.h +1 -1
  93. data/vendor/eigen/Eigen/src/Core/Dot.h +10 -10
  94. data/vendor/eigen/Eigen/src/Core/EigenBase.h +10 -9
  95. data/vendor/eigen/Eigen/src/Core/ForceAlignedAccess.h +8 -4
  96. data/vendor/eigen/Eigen/src/Core/Fuzzy.h +3 -3
  97. data/vendor/eigen/Eigen/src/Core/GeneralProduct.h +20 -10
  98. data/vendor/eigen/Eigen/src/Core/GenericPacketMath.h +599 -152
  99. data/vendor/eigen/Eigen/src/Core/GlobalFunctions.h +40 -33
  100. data/vendor/eigen/Eigen/src/Core/IO.h +40 -7
  101. data/vendor/eigen/Eigen/src/Core/IndexedView.h +237 -0
  102. data/vendor/eigen/Eigen/src/Core/Inverse.h +9 -10
  103. data/vendor/eigen/Eigen/src/Core/Map.h +7 -7
  104. data/vendor/eigen/Eigen/src/Core/MapBase.h +10 -3
  105. data/vendor/eigen/Eigen/src/Core/MathFunctions.h +767 -125
  106. data/vendor/eigen/Eigen/src/Core/MathFunctionsImpl.h +118 -19
  107. data/vendor/eigen/Eigen/src/Core/Matrix.h +131 -25
  108. data/vendor/eigen/Eigen/src/Core/MatrixBase.h +21 -3
  109. data/vendor/eigen/Eigen/src/Core/NestByValue.h +25 -50
  110. data/vendor/eigen/Eigen/src/Core/NoAlias.h +4 -3
  111. data/vendor/eigen/Eigen/src/Core/NumTraits.h +107 -20
  112. data/vendor/eigen/Eigen/src/Core/PartialReduxEvaluator.h +232 -0
  113. data/vendor/eigen/Eigen/src/Core/PermutationMatrix.h +3 -31
  114. data/vendor/eigen/Eigen/src/Core/PlainObjectBase.h +152 -59
  115. data/vendor/eigen/Eigen/src/Core/Product.h +30 -25
  116. data/vendor/eigen/Eigen/src/Core/ProductEvaluators.h +192 -125
  117. data/vendor/eigen/Eigen/src/Core/Random.h +37 -1
  118. data/vendor/eigen/Eigen/src/Core/Redux.h +180 -170
  119. data/vendor/eigen/Eigen/src/Core/Ref.h +121 -23
  120. data/vendor/eigen/Eigen/src/Core/Replicate.h +8 -8
  121. data/vendor/eigen/Eigen/src/Core/Reshaped.h +454 -0
  122. data/vendor/eigen/Eigen/src/Core/ReturnByValue.h +7 -5
  123. data/vendor/eigen/Eigen/src/Core/Reverse.h +18 -12
  124. data/vendor/eigen/Eigen/src/Core/Select.h +8 -6
  125. data/vendor/eigen/Eigen/src/Core/SelfAdjointView.h +33 -20
  126. data/vendor/eigen/Eigen/src/Core/Solve.h +14 -14
  127. data/vendor/eigen/Eigen/src/Core/SolveTriangular.h +16 -16
  128. data/vendor/eigen/Eigen/src/Core/SolverBase.h +41 -3
  129. data/vendor/eigen/Eigen/src/Core/StableNorm.h +100 -70
  130. data/vendor/eigen/Eigen/src/Core/StlIterators.h +463 -0
  131. data/vendor/eigen/Eigen/src/Core/Stride.h +9 -4
  132. data/vendor/eigen/Eigen/src/Core/Swap.h +5 -4
  133. data/vendor/eigen/Eigen/src/Core/Transpose.h +88 -27
  134. data/vendor/eigen/Eigen/src/Core/Transpositions.h +26 -47
  135. data/vendor/eigen/Eigen/src/Core/TriangularMatrix.h +93 -75
  136. data/vendor/eigen/Eigen/src/Core/VectorBlock.h +5 -5
  137. data/vendor/eigen/Eigen/src/Core/VectorwiseOp.h +159 -70
  138. data/vendor/eigen/Eigen/src/Core/Visitor.h +137 -29
  139. data/vendor/eigen/Eigen/src/Core/arch/AVX/Complex.h +50 -129
  140. data/vendor/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +126 -337
  141. data/vendor/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +1092 -155
  142. data/vendor/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +65 -1
  143. data/vendor/eigen/Eigen/src/Core/arch/AVX512/Complex.h +422 -0
  144. data/vendor/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +207 -236
  145. data/vendor/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1482 -495
  146. data/vendor/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +89 -0
  147. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +152 -165
  148. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +19 -251
  149. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2937 -0
  150. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +221 -0
  151. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +629 -0
  152. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2042 -392
  153. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Complex.h +235 -80
  154. data/vendor/eigen/Eigen/src/Core/arch/Default/BFloat16.h +700 -0
  155. data/vendor/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +102 -14
  156. data/vendor/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1649 -0
  157. data/vendor/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +110 -0
  158. data/vendor/eigen/Eigen/src/Core/arch/Default/Half.h +942 -0
  159. data/vendor/eigen/Eigen/src/Core/arch/Default/Settings.h +1 -1
  160. data/vendor/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +120 -0
  161. data/vendor/eigen/Eigen/src/Core/arch/{CUDA → GPU}/MathFunctions.h +16 -4
  162. data/vendor/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1685 -0
  163. data/vendor/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +80 -0
  164. data/vendor/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  165. data/vendor/eigen/Eigen/src/Core/arch/MSA/Complex.h +648 -0
  166. data/vendor/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +387 -0
  167. data/vendor/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1233 -0
  168. data/vendor/eigen/Eigen/src/Core/arch/NEON/Complex.h +313 -219
  169. data/vendor/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +183 -0
  170. data/vendor/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +54 -70
  171. data/vendor/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4376 -549
  172. data/vendor/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1419 -0
  173. data/vendor/eigen/Eigen/src/Core/arch/SSE/Complex.h +59 -179
  174. data/vendor/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +65 -428
  175. data/vendor/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +893 -283
  176. data/vendor/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +65 -0
  177. data/vendor/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +44 -0
  178. data/vendor/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +752 -0
  179. data/vendor/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +49 -0
  180. data/vendor/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +232 -0
  181. data/vendor/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +301 -0
  182. data/vendor/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +670 -0
  183. data/vendor/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +694 -0
  184. data/vendor/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +85 -0
  185. data/vendor/eigen/Eigen/src/Core/arch/ZVector/Complex.h +212 -183
  186. data/vendor/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +101 -5
  187. data/vendor/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +510 -395
  188. data/vendor/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +11 -2
  189. data/vendor/eigen/Eigen/src/Core/functors/BinaryFunctors.h +112 -46
  190. data/vendor/eigen/Eigen/src/Core/functors/NullaryFunctors.h +31 -30
  191. data/vendor/eigen/Eigen/src/Core/functors/StlFunctors.h +32 -2
  192. data/vendor/eigen/Eigen/src/Core/functors/UnaryFunctors.h +355 -16
  193. data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1075 -586
  194. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +49 -24
  195. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +41 -35
  196. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +6 -6
  197. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +4 -2
  198. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +382 -483
  199. data/vendor/eigen/Eigen/src/Core/products/Parallelizer.h +22 -5
  200. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +53 -30
  201. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +16 -8
  202. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +8 -6
  203. data/vendor/eigen/Eigen/src/Core/products/SelfadjointProduct.h +4 -4
  204. data/vendor/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +5 -4
  205. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +33 -27
  206. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +14 -12
  207. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +36 -34
  208. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +8 -4
  209. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverVector.h +13 -10
  210. data/vendor/eigen/Eigen/src/Core/util/BlasUtil.h +304 -119
  211. data/vendor/eigen/Eigen/src/Core/util/ConfigureVectorization.h +512 -0
  212. data/vendor/eigen/Eigen/src/Core/util/Constants.h +25 -9
  213. data/vendor/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +26 -3
  214. data/vendor/eigen/Eigen/src/Core/util/ForwardDeclarations.h +29 -9
  215. data/vendor/eigen/Eigen/src/Core/util/IndexedViewHelper.h +186 -0
  216. data/vendor/eigen/Eigen/src/Core/util/IntegralConstant.h +272 -0
  217. data/vendor/eigen/Eigen/src/Core/util/MKL_support.h +8 -1
  218. data/vendor/eigen/Eigen/src/Core/util/Macros.h +709 -246
  219. data/vendor/eigen/Eigen/src/Core/util/Memory.h +222 -52
  220. data/vendor/eigen/Eigen/src/Core/util/Meta.h +355 -77
  221. data/vendor/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +5 -1
  222. data/vendor/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  223. data/vendor/eigen/Eigen/src/Core/util/StaticAssert.h +8 -5
  224. data/vendor/eigen/Eigen/src/Core/util/SymbolicIndex.h +293 -0
  225. data/vendor/eigen/Eigen/src/Core/util/XprHelper.h +65 -30
  226. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +1 -1
  227. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +7 -4
  228. data/vendor/eigen/Eigen/src/Eigenvalues/EigenSolver.h +2 -2
  229. data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +1 -1
  230. data/vendor/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +2 -2
  231. data/vendor/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +2 -2
  232. data/vendor/eigen/Eigen/src/Eigenvalues/RealQZ.h +9 -6
  233. data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur.h +21 -9
  234. data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +77 -43
  235. data/vendor/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +20 -15
  236. data/vendor/eigen/Eigen/src/Geometry/AlignedBox.h +99 -5
  237. data/vendor/eigen/Eigen/src/Geometry/AngleAxis.h +4 -4
  238. data/vendor/eigen/Eigen/src/Geometry/EulerAngles.h +3 -3
  239. data/vendor/eigen/Eigen/src/Geometry/Homogeneous.h +15 -11
  240. data/vendor/eigen/Eigen/src/Geometry/Hyperplane.h +1 -1
  241. data/vendor/eigen/Eigen/src/Geometry/OrthoMethods.h +3 -2
  242. data/vendor/eigen/Eigen/src/Geometry/ParametrizedLine.h +39 -2
  243. data/vendor/eigen/Eigen/src/Geometry/Quaternion.h +70 -14
  244. data/vendor/eigen/Eigen/src/Geometry/Rotation2D.h +3 -3
  245. data/vendor/eigen/Eigen/src/Geometry/Scaling.h +23 -5
  246. data/vendor/eigen/Eigen/src/Geometry/Transform.h +88 -67
  247. data/vendor/eigen/Eigen/src/Geometry/Translation.h +6 -12
  248. data/vendor/eigen/Eigen/src/Geometry/Umeyama.h +1 -1
  249. data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +168 -0
  250. data/vendor/eigen/Eigen/src/Householder/BlockHouseholder.h +9 -2
  251. data/vendor/eigen/Eigen/src/Householder/Householder.h +8 -4
  252. data/vendor/eigen/Eigen/src/Householder/HouseholderSequence.h +123 -48
  253. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +15 -15
  254. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +7 -23
  255. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +5 -22
  256. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +41 -47
  257. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +51 -60
  258. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +70 -20
  259. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +2 -20
  260. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +11 -9
  261. data/vendor/eigen/Eigen/src/Jacobi/Jacobi.h +31 -10
  262. data/vendor/eigen/Eigen/src/KLUSupport/KLUSupport.h +358 -0
  263. data/vendor/eigen/Eigen/src/LU/Determinant.h +35 -19
  264. data/vendor/eigen/Eigen/src/LU/FullPivLU.h +29 -43
  265. data/vendor/eigen/Eigen/src/LU/InverseImpl.h +25 -8
  266. data/vendor/eigen/Eigen/src/LU/PartialPivLU.h +71 -58
  267. data/vendor/eigen/Eigen/src/LU/arch/InverseSize4.h +351 -0
  268. data/vendor/eigen/Eigen/src/OrderingMethods/Amd.h +7 -17
  269. data/vendor/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +297 -277
  270. data/vendor/eigen/Eigen/src/OrderingMethods/Ordering.h +6 -10
  271. data/vendor/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +1 -1
  272. data/vendor/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +9 -7
  273. data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR.h +41 -20
  274. data/vendor/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +100 -27
  275. data/vendor/eigen/Eigen/src/QR/FullPivHouseholderQR.h +59 -22
  276. data/vendor/eigen/Eigen/src/QR/HouseholderQR.h +48 -23
  277. data/vendor/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +25 -3
  278. data/vendor/eigen/Eigen/src/SVD/BDCSVD.h +183 -63
  279. data/vendor/eigen/Eigen/src/SVD/JacobiSVD.h +22 -14
  280. data/vendor/eigen/Eigen/src/SVD/SVDBase.h +83 -22
  281. data/vendor/eigen/Eigen/src/SVD/UpperBidiagonalization.h +3 -3
  282. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +17 -9
  283. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +12 -37
  284. data/vendor/eigen/Eigen/src/SparseCore/AmbiVector.h +3 -2
  285. data/vendor/eigen/Eigen/src/SparseCore/CompressedStorage.h +16 -0
  286. data/vendor/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +6 -6
  287. data/vendor/eigen/Eigen/src/SparseCore/SparseAssign.h +81 -27
  288. data/vendor/eigen/Eigen/src/SparseCore/SparseBlock.h +25 -57
  289. data/vendor/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +40 -11
  290. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +11 -15
  291. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +4 -2
  292. data/vendor/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +30 -8
  293. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrix.h +126 -11
  294. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +5 -12
  295. data/vendor/eigen/Eigen/src/SparseCore/SparseProduct.h +13 -1
  296. data/vendor/eigen/Eigen/src/SparseCore/SparseRef.h +7 -7
  297. data/vendor/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +5 -2
  298. data/vendor/eigen/Eigen/src/SparseCore/SparseUtil.h +8 -0
  299. data/vendor/eigen/Eigen/src/SparseCore/SparseVector.h +1 -1
  300. data/vendor/eigen/Eigen/src/SparseCore/SparseView.h +1 -0
  301. data/vendor/eigen/Eigen/src/SparseLU/SparseLU.h +162 -12
  302. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +1 -1
  303. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +76 -2
  304. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +2 -2
  305. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +1 -1
  306. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +1 -1
  307. data/vendor/eigen/Eigen/src/SparseQR/SparseQR.h +19 -6
  308. data/vendor/eigen/Eigen/src/StlSupport/StdDeque.h +2 -12
  309. data/vendor/eigen/Eigen/src/StlSupport/StdList.h +2 -2
  310. data/vendor/eigen/Eigen/src/StlSupport/StdVector.h +2 -2
  311. data/vendor/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +6 -8
  312. data/vendor/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +175 -39
  313. data/vendor/eigen/Eigen/src/misc/lapacke.h +5 -4
  314. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +28 -2
  315. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +155 -11
  316. data/vendor/eigen/Eigen/src/plugins/BlockMethods.h +626 -242
  317. data/vendor/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +14 -0
  318. data/vendor/eigen/Eigen/src/plugins/IndexedViewMethods.h +262 -0
  319. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +4 -4
  320. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +10 -0
  321. data/vendor/eigen/Eigen/src/plugins/ReshapedMethods.h +149 -0
  322. data/vendor/eigen/README.md +2 -0
  323. data/vendor/eigen/bench/btl/README +1 -1
  324. data/vendor/eigen/bench/tensors/README +6 -7
  325. data/vendor/eigen/ci/README.md +56 -0
  326. data/vendor/eigen/demos/mix_eigen_and_c/README +1 -1
  327. data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +213 -158
  328. data/vendor/eigen/unsupported/README.txt +1 -1
  329. data/vendor/tomotopy/README.kr.rst +78 -0
  330. data/vendor/tomotopy/README.rst +75 -0
  331. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +2 -2
  332. data/vendor/tomotopy/src/Labeling/Phraser.hpp +4 -4
  333. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +7 -3
  334. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +7 -3
  335. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +6 -3
  336. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +2 -2
  337. data/vendor/tomotopy/src/TopicModel/HDP.h +1 -0
  338. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +57 -6
  339. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +6 -3
  340. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +3 -2
  341. data/vendor/tomotopy/src/TopicModel/LDA.h +3 -3
  342. data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +5 -5
  343. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +50 -19
  344. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +6 -2
  345. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +3 -2
  346. data/vendor/tomotopy/src/TopicModel/PAModel.hpp +1 -1
  347. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +6 -2
  348. data/vendor/tomotopy/src/TopicModel/PT.h +3 -1
  349. data/vendor/tomotopy/src/TopicModel/PTModel.hpp +36 -3
  350. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +6 -3
  351. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +55 -26
  352. data/vendor/tomotopy/src/Utils/AliasMethod.hpp +5 -4
  353. data/vendor/tomotopy/src/Utils/Dictionary.h +2 -2
  354. data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +36 -1
  355. data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +1 -1
  356. data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +1 -1
  357. data/vendor/tomotopy/src/Utils/exception.h +6 -0
  358. data/vendor/tomotopy/src/Utils/math.h +2 -2
  359. data/vendor/tomotopy/src/Utils/sample.hpp +14 -12
  360. data/vendor/tomotopy/src/Utils/serializer.hpp +30 -5
  361. data/vendor/tomotopy/src/Utils/sse_gamma.h +0 -3
  362. metadata +64 -18
  363. data/vendor/eigen/Eigen/CMakeLists.txt +0 -19
  364. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -674
  365. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
  366. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
  367. data/vendor/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
  368. data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
  369. data/vendor/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
@@ -2,10 +2,10 @@
2
2
  * @file MorePacketMath.h
3
3
  * @author bab2min (bab2min@gmail.com)
4
4
  * @brief
5
- * @version 0.3.0
6
- * @date 2020-10-07
5
+ * @version 0.3.3
6
+ * @date 2021-03-31
7
7
  *
8
- * @copyright Copyright (c) 2020
8
+ * @copyright Copyright (c) 2020-2021
9
9
  *
10
10
  */
11
11
 
@@ -14,14 +14,26 @@
14
14
 
15
15
  #include <Eigen/Dense>
16
16
 
17
+ #define EIGENRAND_PRINT_PACKET(p) do { using _MTy = typename std::remove_const<typename std::remove_reference<decltype(p)>::type>::type; typename std::conditional<Eigen::internal::IsFloatPacket<_MTy>::value, float, typename std::conditional<Eigen::internal::IsDoublePacket<_MTy>::value, double, int>::type>::type f[4]; Eigen::internal::pstore(f, p); std::cout << #p " " << f[0] << " " << f[1] << " " << f[2] << " " << f[3] << std::endl; } while(0)
18
+
17
19
  namespace Eigen
18
20
  {
19
21
  namespace internal
20
22
  {
23
+ template<typename Ty>
24
+ struct IsIntPacket : std::false_type {};
25
+
26
+ template<typename Ty>
27
+ struct IsFloatPacket : std::false_type {};
28
+
29
+ template<typename Ty>
30
+ struct IsDoublePacket : std::false_type {};
31
+
32
+ template<typename Ty>
33
+ struct HalfPacket;
34
+
21
35
  template<typename Packet>
22
- struct reinterpreter
23
- {
24
- };
36
+ struct reinterpreter{};
25
37
 
26
38
  template<typename Packet>
27
39
  inline auto reinterpret_to_float(const Packet& x)
@@ -44,13 +56,40 @@ namespace Eigen
44
56
  return reinterpreter<Packet>{}.to_int(x);
45
57
  }
46
58
 
59
+ template<typename Packet>
60
+ EIGEN_STRONG_INLINE void split_two(const Packet& p, typename HalfPacket<Packet>::type& a, typename HalfPacket<Packet>::type& b);
61
+
47
62
  template<typename Packet>
48
63
  EIGEN_STRONG_INLINE Packet pseti64(uint64_t a);
49
64
 
65
+ template<typename Packet>
66
+ EIGEN_STRONG_INLINE Packet padd64(const Packet& a, const Packet& b);
67
+
68
+ template<typename Packet>
69
+ EIGEN_STRONG_INLINE Packet psub64(const Packet& a, const Packet& b);
70
+
71
+ template <typename SrcPacket, typename TgtPacket>
72
+ EIGEN_STRONG_INLINE TgtPacket pcast64(const SrcPacket& a);
73
+
50
74
  template<typename Packet>
51
75
  EIGEN_STRONG_INLINE Packet pcmpeq(const Packet& a, const Packet& b);
52
76
 
53
77
  template<typename Packet>
78
+ struct BitShifter {};
79
+
80
+ template<int b, typename Packet>
81
+ EIGEN_STRONG_INLINE Packet psll(const Packet& a);
82
+
83
+ template<int _b, typename Packet>
84
+ EIGEN_STRONG_INLINE Packet psrl(const Packet& a, int b = _b);
85
+
86
+ template<int b, typename Packet>
87
+ EIGEN_STRONG_INLINE Packet psll64(const Packet& a);
88
+
89
+ template<int b, typename Packet>
90
+ EIGEN_STRONG_INLINE Packet psrl64(const Packet& a);
91
+
92
+ /*template<typename Packet>
54
93
  EIGEN_STRONG_INLINE Packet psll(const Packet& a, int b);
55
94
 
56
95
  template<typename Packet>
@@ -60,12 +99,34 @@ namespace Eigen
60
99
  EIGEN_STRONG_INLINE Packet psll64(const Packet& a, int b);
61
100
 
62
101
  template<typename Packet>
63
- EIGEN_STRONG_INLINE Packet psrl64(const Packet& a, int b);
102
+ EIGEN_STRONG_INLINE Packet psrl64(const Packet& a, int b);*/
64
103
 
65
104
  template<typename Packet>
66
105
  EIGEN_STRONG_INLINE int pmovemask(const Packet& a);
67
106
 
68
- template<>
107
+ template<typename Packet>
108
+ EIGEN_STRONG_INLINE typename std::enable_if<
109
+ IsFloatPacket<Packet>::value, Packet
110
+ >::type pext_sign(const Packet& a)
111
+ {
112
+ using IntPacket = decltype(reinterpret_to_int(a));
113
+ return reinterpret_to_float(
114
+ pand(reinterpret_to_int(a), pset1<IntPacket>(0x80000000))
115
+ );
116
+ }
117
+
118
+ template<typename Packet>
119
+ EIGEN_STRONG_INLINE typename std::enable_if<
120
+ IsDoublePacket<Packet>::value, Packet
121
+ >::type pext_sign(const Packet& a)
122
+ {
123
+ using IntPacket = decltype(reinterpret_to_int(a));
124
+ return reinterpret_to_double(
125
+ pand(reinterpret_to_int(a), pseti64<IntPacket>(0x8000000000000000))
126
+ );
127
+ }
128
+
129
+ /*template<>
69
130
  EIGEN_STRONG_INLINE uint64_t psll64<uint64_t>(const uint64_t& a, int b)
70
131
  {
71
132
  return a << b;
@@ -75,109 +136,11 @@ namespace Eigen
75
136
  EIGEN_STRONG_INLINE uint64_t psrl64<uint64_t>(const uint64_t& a, int b)
76
137
  {
77
138
  return a >> b;
78
- }
79
-
80
- template<typename Packet>
81
- EIGEN_STRONG_INLINE void psincos(Packet x, Packet &s, Packet &c)
82
- {
83
- Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
84
- using IntPacket = decltype(reinterpret_to_int(x));
85
- IntPacket emm0, emm2, emm4;
86
-
87
- sign_bit_sin = x;
88
- /* take the absolute value */
89
- x = pabs(x);
90
- /* extract the sign bit (upper one) */
91
- sign_bit_sin = reinterpret_to_float(
92
- pand(reinterpret_to_int(sign_bit_sin), pset1<IntPacket>(0x80000000))
93
- );
94
-
95
- /* scale by 4/Pi */
96
- y = pmul(x, pset1<Packet>(1.27323954473516));
97
-
98
- /* store the integer part of y in emm2 */
99
- emm2 = pcast<Packet, IntPacket>(y);
100
-
101
- /* j=(j+1) & (~1) (see the cephes sources) */
102
- emm2 = padd(emm2, pset1<IntPacket>(1));
103
- emm2 = pand(emm2, pset1<IntPacket>(~1));
104
- y = pcast<IntPacket, Packet>(emm2);
105
-
106
- emm4 = emm2;
107
-
108
- /* get the swap sign flag for the sine */
109
- emm0 = pand(emm2, pset1<IntPacket>(4));
110
- emm0 = psll(emm0, 29);
111
- Packet swap_sign_bit_sin = reinterpret_to_float(emm0);
112
-
113
- /* get the polynom selection mask for the sine*/
114
- emm2 = pand(emm2, pset1<IntPacket>(2));
115
-
116
- emm2 = pcmpeq(emm2, pset1<IntPacket>(0));
117
- Packet poly_mask = reinterpret_to_float(emm2);
118
-
119
- /* The magic pass: "Extended precision modular arithmetic"
120
- x = ((x - y * DP1) - y * DP2) - y * DP3; */
121
- xmm1 = pset1<Packet>(-0.78515625);
122
- xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
123
- xmm3 = pset1<Packet>(-3.77489497744594108e-8);
124
- xmm1 = pmul(y, xmm1);
125
- xmm2 = pmul(y, xmm2);
126
- xmm3 = pmul(y, xmm3);
127
- x = padd(x, xmm1);
128
- x = padd(x, xmm2);
129
- x = padd(x, xmm3);
130
-
131
- emm4 = psub(emm4, pset1<IntPacket>(2));
132
- emm4 = pandnot(emm4, pset1<IntPacket>(4));
133
- emm4 = psll(emm4, 29);
134
- Packet sign_bit_cos = reinterpret_to_float(emm4);
135
- sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
136
-
137
-
138
- /* Evaluate the first polynom (0 <= x <= Pi/4) */
139
- Packet z = pmul(x, x);
140
- y = pset1<Packet>(2.443315711809948E-005);
141
-
142
- y = pmul(y, z);
143
- y = padd(y, pset1<Packet>(-1.388731625493765E-003));
144
- y = pmul(y, z);
145
- y = padd(y, pset1<Packet>(4.166664568298827E-002));
146
- y = pmul(y, z);
147
- y = pmul(y, z);
148
- Packet tmp = pmul(z, pset1<Packet>(0.5));
149
- y = psub(y, tmp);
150
- y = padd(y, pset1<Packet>(1));
151
-
152
- /* Evaluate the second polynom (Pi/4 <= x <= 0) */
153
-
154
- Packet y2 = pset1<Packet>(-1.9515295891E-4);
155
- y2 = pmul(y2, z);
156
- y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
157
- y2 = pmul(y2, z);
158
- y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
159
- y2 = pmul(y2, z);
160
- y2 = pmul(y2, x);
161
- y2 = padd(y2, x);
162
-
163
- /* select the correct result from the two polynoms */
164
- xmm3 = poly_mask;
165
- Packet ysin2 = pand(xmm3, y2);
166
- Packet ysin1 = pandnot(xmm3, y);
167
- y2 = psub(y2, ysin2);
168
- y = psub(y, ysin1);
169
-
170
- xmm1 = padd(ysin1, ysin2);
171
- xmm2 = padd(y, y2);
172
-
173
- /* update the sign */
174
- s = pxor(xmm1, sign_bit_sin);
175
- c = pxor(xmm2, sign_bit_cos);
176
- }
139
+ }*/
177
140
 
178
141
  // approximation : lgamma(z) ~= (z+2.5)ln(z+3) - z - 3 + 0.5 ln (2pi) + 1/12/(z + 3) - ln (z(z+1)(z+2))
179
142
  template<typename Packet>
180
- EIGEN_STRONG_INLINE Packet plgamma(const Packet& x)
143
+ EIGEN_STRONG_INLINE Packet plgamma_approx(const Packet& x)
181
144
  {
182
145
  auto x_3 = padd(x, pset1<Packet>(3));
183
146
  auto ret = pmul(padd(x_3, pset1<Packet>(-0.5)), plog(x_3));
@@ -195,6 +158,9 @@ namespace Eigen
195
158
  template<typename Packet>
196
159
  EIGEN_STRONG_INLINE Packet pcmple(const Packet& a, const Packet& b);
197
160
 
161
+ template<typename Packet>
162
+ EIGEN_STRONG_INLINE Packet pbitnot(const Packet& a);
163
+
198
164
  template<typename PacketIf, typename Packet>
199
165
  EIGEN_STRONG_INLINE Packet pblendv(const PacketIf& ifPacket, const Packet& thenPacket, const Packet& elsePacket);
200
166
 
@@ -213,6 +179,9 @@ namespace Eigen
213
179
  template<typename Packet>
214
180
  EIGEN_STRONG_INLINE Packet pcmpeq64(const Packet& a, const Packet& b);
215
181
 
182
+ template<typename Packet>
183
+ EIGEN_STRONG_INLINE Packet pcmplt64(const Packet& a, const Packet& b);
184
+
216
185
  template<typename Packet>
217
186
  EIGEN_STRONG_INLINE Packet pmuluadd64(const Packet& a, uint64_t b, uint64_t c);
218
187
 
@@ -241,10 +210,10 @@ namespace Eigen
241
210
  }
242
211
 
243
212
  template<typename _Scalar>
244
- struct bit_scalar;
213
+ struct BitScalar;
245
214
 
246
215
  template<>
247
- struct bit_scalar<float>
216
+ struct BitScalar<float>
248
217
  {
249
218
  float to_ur(uint32_t x)
250
219
  {
@@ -264,7 +233,7 @@ namespace Eigen
264
233
  };
265
234
 
266
235
  template<>
267
- struct bit_scalar<double>
236
+ struct BitScalar<double>
268
237
  {
269
238
  double to_ur(uint64_t x)
270
239
  {
@@ -291,720 +260,359 @@ namespace Eigen
291
260
 
292
261
  EIGEN_STRONG_INLINE float2 bit_to_ur_float(uint64_t x)
293
262
  {
294
- bit_scalar<float> bs;
263
+ BitScalar<float> bs;
295
264
  float2 ret;
296
265
  ret.f[0] = bs.to_ur(x & 0xFFFFFFFF);
297
266
  ret.f[1] = bs.to_ur(x >> 32);
298
267
  return ret;
299
268
  }
300
- }
301
- }
302
-
303
- #ifdef EIGEN_VECTORIZE_AVX
304
- #include <immintrin.h>
305
269
 
306
- namespace Eigen
307
- {
308
- namespace internal
309
- {
310
- template<>
311
- struct reinterpreter<Packet8i>
270
+ template<typename Packet>
271
+ EIGEN_STRONG_INLINE typename std::enable_if<
272
+ IsFloatPacket<Packet>::value
273
+ >::type psincos(Packet x, Packet& s, Packet& c)
312
274
  {
313
- EIGEN_STRONG_INLINE Packet8f to_float(const Packet8i& x)
314
- {
315
- return _mm256_castsi256_ps(x);
316
- }
317
-
318
- EIGEN_STRONG_INLINE Packet4d to_double(const Packet8i& x)
319
- {
320
- return _mm256_castsi256_pd(x);
321
- }
275
+ Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
276
+ using IntPacket = decltype(reinterpret_to_int(x));
277
+ IntPacket emm0, emm2, emm4;
322
278
 
323
- EIGEN_STRONG_INLINE Packet8i to_int(const Packet8i& x)
324
- {
325
- return x;
326
- }
327
- };
279
+ sign_bit_sin = x;
280
+ /* take the absolute value */
281
+ x = pabs(x);
282
+ /* extract the sign bit (upper one) */
283
+ sign_bit_sin = pext_sign(sign_bit_sin);
328
284
 
329
- template<>
330
- struct reinterpreter<Packet8f>
331
- {
332
- EIGEN_STRONG_INLINE Packet8f to_float(const Packet8f& x)
333
- {
334
- return x;
335
- }
285
+ /* scale by 4/Pi */
286
+ y = pmul(x, pset1<Packet>(1.27323954473516));
336
287
 
337
- EIGEN_STRONG_INLINE Packet4d to_double(const Packet8f& x)
338
- {
339
- return _mm256_castps_pd(x);
340
- }
288
+ /* store the integer part of y in emm2 */
289
+ emm2 = pcast<Packet, IntPacket>(y);
341
290
 
342
- EIGEN_STRONG_INLINE Packet8i to_int(const Packet8f& x)
343
- {
344
- return _mm256_castps_si256(x);
345
- }
346
- };
291
+ /* j=(j+1) & (~1) (see the cephes sources) */
292
+ emm2 = padd(emm2, pset1<IntPacket>(1));
293
+ emm2 = pand(emm2, pset1<IntPacket>(~1));
294
+ y = pcast<IntPacket, Packet>(emm2);
347
295
 
348
- template<>
349
- struct reinterpreter<Packet4d>
350
- {
351
- EIGEN_STRONG_INLINE Packet8f to_float(const Packet4d& x)
352
- {
353
- return _mm256_castpd_ps(x);
354
- }
296
+ emm4 = emm2;
355
297
 
356
- EIGEN_STRONG_INLINE Packet4d to_double(const Packet4d& x)
357
- {
358
- return x;
359
- }
298
+ /* get the swap sign flag for the sine */
299
+ emm0 = pand(emm2, pset1<IntPacket>(4));
300
+ emm0 = psll<29>(emm0);
301
+ Packet swap_sign_bit_sin = reinterpret_to_float(emm0);
360
302
 
361
- EIGEN_STRONG_INLINE Packet8i to_int(const Packet4d& x)
362
- {
363
- return _mm256_castpd_si256(x);
364
- }
365
- };
303
+ /* get the polynom selection mask for the sine*/
304
+ emm2 = pand(emm2, pset1<IntPacket>(2));
366
305
 
367
- EIGEN_STRONG_INLINE void split_two(const Packet8i& x, Packet4i& a, Packet4i& b)
368
- {
369
- a = _mm256_extractf128_si256(x, 0);
370
- b = _mm256_extractf128_si256(x, 1);
371
- }
306
+ emm2 = pcmpeq(emm2, pset1<IntPacket>(0));
307
+ Packet poly_mask = reinterpret_to_float(emm2);
372
308
 
373
- EIGEN_STRONG_INLINE Packet8i combine_two(const Packet4i& a, const Packet4i& b)
374
- {
375
- return _mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1);
376
- }
309
+ /* The magic pass: "Extended precision modular arithmetic"
310
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
311
+ xmm1 = pset1<Packet>(-0.78515625);
312
+ xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
313
+ xmm3 = pset1<Packet>(-3.77489497744594108e-8);
314
+ xmm1 = pmul(y, xmm1);
315
+ xmm2 = pmul(y, xmm2);
316
+ xmm3 = pmul(y, xmm3);
317
+ x = padd(x, xmm1);
318
+ x = padd(x, xmm2);
319
+ x = padd(x, xmm3);
377
320
 
378
- EIGEN_STRONG_INLINE void split_two(const Packet8f& x, Packet4f& a, Packet4f& b)
379
- {
380
- a = _mm256_extractf128_ps(x, 0);
381
- b = _mm256_extractf128_ps(x, 1);
382
- }
321
+ emm4 = psub(emm4, pset1<IntPacket>(2));
322
+ #if defined(EIGEN_VECTORIZE_NEON) || defined(EIGENRAND_EIGEN_34_MODE)
323
+ emm4 = pandnot(pset1<IntPacket>(4), emm4);
324
+ #else
325
+ emm4 = pandnot(emm4, pset1<IntPacket>(4));
326
+ #endif
327
+ emm4 = psll<29>(emm4);
328
+ Packet sign_bit_cos = reinterpret_to_float(emm4);
329
+ sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
383
330
 
384
- EIGEN_STRONG_INLINE Packet8f combine_two(const Packet4f& a, const Packet4f& b)
385
- {
386
- return _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1);
387
- }
388
331
 
332
+ /* Evaluate the first polynom (0 <= x <= Pi/4) */
333
+ Packet z = pmul(x, x);
334
+ y = pset1<Packet>(2.443315711809948E-005);
389
335
 
390
- EIGEN_STRONG_INLINE Packet4i combine_low32(const Packet8i& a)
391
- {
392
- #ifdef EIGEN_VECTORIZE_AVX2
393
- return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)));
394
- #else
395
- auto sc = _mm256_permutevar_ps(_mm256_castsi256_ps(a), _mm256_setr_epi32(0, 2, 1, 3, 1, 3, 0, 2));
396
- return _mm_castps_si128(_mm_blend_ps(_mm256_extractf128_ps(sc, 0), _mm256_extractf128_ps(sc, 1), 0b1100));
397
- #endif
398
- }
336
+ y = pmul(y, z);
337
+ y = padd(y, pset1<Packet>(-1.388731625493765E-003));
338
+ y = pmul(y, z);
339
+ y = padd(y, pset1<Packet>(4.166664568298827E-002));
340
+ y = pmul(y, z);
341
+ y = pmul(y, z);
342
+ Packet tmp = pmul(z, pset1<Packet>(0.5));
343
+ y = psub(y, tmp);
344
+ y = padd(y, pset1<Packet>(1));
399
345
 
400
- template<>
401
- EIGEN_STRONG_INLINE Packet8i pseti64<Packet8i>(uint64_t a)
402
- {
403
- return _mm256_set1_epi64x(a);
404
- }
346
+ /* Evaluate the second polynom (Pi/4 <= x <= 0) */
405
347
 
406
- template<>
407
- EIGEN_STRONG_INLINE Packet8i pcmpeq<Packet8i>(const Packet8i& a, const Packet8i& b)
408
- {
409
- #ifdef EIGEN_VECTORIZE_AVX2
410
- return _mm256_cmpeq_epi32(a, b);
411
- #else
412
- Packet4i a1, a2, b1, b2;
413
- split_two(a, a1, a2);
414
- split_two(b, b1, b2);
415
- return combine_two((Packet4i)_mm_cmpeq_epi32(a1, b1), (Packet4i)_mm_cmpeq_epi32(a2, b2));
416
- #endif
417
- }
348
+ Packet y2 = pset1<Packet>(-1.9515295891E-4);
349
+ y2 = pmul(y2, z);
350
+ y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
351
+ y2 = pmul(y2, z);
352
+ y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
353
+ y2 = pmul(y2, z);
354
+ y2 = pmul(y2, x);
355
+ y2 = padd(y2, x);
418
356
 
419
- template<>
420
- EIGEN_STRONG_INLINE Packet8i psll<Packet8i>(const Packet8i& a, int b)
421
- {
422
- #ifdef EIGEN_VECTORIZE_AVX2
423
- return _mm256_slli_epi32(a, b);
424
- #else
425
- Packet4i a1, a2;
426
- split_two(a, a1, a2);
427
- return combine_two((Packet4i)_mm_slli_epi32(a1, b), (Packet4i)_mm_slli_epi32(a2, b));
428
- #endif
429
- }
357
+ /* select the correct result from the two polynoms */
358
+ xmm3 = poly_mask;
359
+ Packet ysin2 = pand(xmm3, y2);
360
+ #if defined(EIGEN_VECTORIZE_NEON) || defined(EIGENRAND_EIGEN_34_MODE)
361
+ Packet ysin1 = pandnot(y, xmm3);
362
+ #else
363
+ Packet ysin1 = pandnot(xmm3, y);
364
+ #endif
365
+ y2 = psub(y2, ysin2);
366
+ y = psub(y, ysin1);
430
367
 
431
- template<>
432
- EIGEN_STRONG_INLINE Packet8i psrl<Packet8i>(const Packet8i& a, int b)
433
- {
434
- #ifdef EIGEN_VECTORIZE_AVX2
435
- return _mm256_srli_epi32(a, b);
436
- #else
437
- Packet4i a1, a2;
438
- split_two(a, a1, a2);
439
- return combine_two((Packet4i)_mm_srli_epi32(a1, b), (Packet4i)_mm_srli_epi32(a2, b));
440
- #endif
441
- }
368
+ xmm1 = padd(ysin1, ysin2);
369
+ xmm2 = padd(y, y2);
442
370
 
443
- template<>
444
- EIGEN_STRONG_INLINE Packet8i psll64<Packet8i>(const Packet8i& a, int b)
445
- {
446
- #ifdef EIGEN_VECTORIZE_AVX2
447
- return _mm256_slli_epi64(a, b);
448
- #else
449
- Packet4i a1, a2;
450
- split_two(a, a1, a2);
451
- return combine_two((Packet4i)_mm_slli_epi64(a1, b), (Packet4i)_mm_slli_epi64(a2, b));
452
- #endif
453
- }
454
-
455
- template<>
456
- EIGEN_STRONG_INLINE Packet8i psrl64<Packet8i>(const Packet8i& a, int b)
457
- {
458
- #ifdef EIGEN_VECTORIZE_AVX2
459
- return _mm256_srli_epi64(a, b);
460
- #else
461
- Packet4i a1, a2;
462
- split_two(a, a1, a2);
463
- return combine_two((Packet4i)_mm_srli_epi64(a1, b), (Packet4i)_mm_srli_epi64(a2, b));
464
- #endif
465
- }
466
-
467
- template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b)
468
- {
469
- #ifdef EIGEN_VECTORIZE_AVX2
470
- return _mm256_add_epi32(a, b);
471
- #else
472
- Packet4i a1, a2, b1, b2;
473
- split_two(a, a1, a2);
474
- split_two(b, b1, b2);
475
- return combine_two((Packet4i)_mm_add_epi32(a1, b1), (Packet4i)_mm_add_epi32(a2, b2));
476
- #endif
477
- }
478
-
479
- template<> EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(const Packet8i& a, const Packet8i& b)
480
- {
481
- #ifdef EIGEN_VECTORIZE_AVX2
482
- return _mm256_sub_epi32(a, b);
483
- #else
484
- Packet4i a1, a2, b1, b2;
485
- split_two(a, a1, a2);
486
- split_two(b, b1, b2);
487
- return combine_two((Packet4i)_mm_sub_epi32(a1, b1), (Packet4i)_mm_sub_epi32(a2, b2));
488
- #endif
489
- }
490
-
491
- template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b)
492
- {
493
- #ifdef EIGEN_VECTORIZE_AVX2
494
- return _mm256_and_si256(a, b);
495
- #else
496
- return reinterpret_to_int((Packet8f)_mm256_and_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
497
- #endif
498
- }
499
-
500
- template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b)
501
- {
502
- #ifdef EIGEN_VECTORIZE_AVX2
503
- return _mm256_andnot_si256(a, b);
504
- #else
505
- return reinterpret_to_int((Packet8f)_mm256_andnot_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
506
- #endif
507
- }
508
-
509
- template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b)
510
- {
511
- #ifdef EIGEN_VECTORIZE_AVX2
512
- return _mm256_or_si256(a, b);
513
- #else
514
- return reinterpret_to_int((Packet8f)_mm256_or_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
515
- #endif
516
- }
517
-
518
- template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b)
519
- {
520
- #ifdef EIGEN_VECTORIZE_AVX2
521
- return _mm256_xor_si256(a, b);
522
- #else
523
- return reinterpret_to_int((Packet8f)_mm256_xor_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
524
- #endif
525
- }
526
-
527
- template<>
528
- EIGEN_STRONG_INLINE Packet8i pcmplt<Packet8i>(const Packet8i& a, const Packet8i& b)
529
- {
530
- #ifdef EIGEN_VECTORIZE_AVX2
531
- return _mm256_cmpgt_epi32(b, a);
532
- #else
533
- Packet4i a1, a2, b1, b2;
534
- split_two(a, a1, a2);
535
- split_two(b, b1, b2);
536
- return combine_two((Packet4i)_mm_cmpgt_epi32(b1, a1), (Packet4i)_mm_cmpgt_epi32(b2, a2));
537
- #endif
538
- }
539
-
540
- template<>
541
- EIGEN_STRONG_INLINE Packet8f pcmplt<Packet8f>(const Packet8f& a, const Packet8f& b)
542
- {
543
- return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
544
- }
545
-
546
- template<>
547
- EIGEN_STRONG_INLINE Packet8f pcmple<Packet8f>(const Packet8f& a, const Packet8f& b)
548
- {
549
- return _mm256_cmp_ps(a, b, _CMP_LE_OQ);
550
- }
551
-
552
- template<>
553
- EIGEN_STRONG_INLINE Packet4d pcmplt<Packet4d>(const Packet4d& a, const Packet4d& b)
554
- {
555
- return _mm256_cmp_pd(a, b, _CMP_LT_OQ);
556
- }
557
-
558
- template<>
559
- EIGEN_STRONG_INLINE Packet4d pcmple<Packet4d>(const Packet4d& a, const Packet4d& b)
560
- {
561
- return _mm256_cmp_pd(a, b, _CMP_LE_OQ);
562
- }
563
-
564
- template<>
565
- EIGEN_STRONG_INLINE Packet8f pblendv(const Packet8f& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket)
566
- {
567
- return _mm256_blendv_ps(elsePacket, thenPacket, ifPacket);
568
- }
569
-
570
- template<>
571
- EIGEN_STRONG_INLINE Packet8f pblendv(const Packet8i& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket)
572
- {
573
- return pblendv(_mm256_castsi256_ps(ifPacket), thenPacket, elsePacket);
574
- }
575
-
576
- template<>
577
- EIGEN_STRONG_INLINE Packet8i pblendv(const Packet8i& ifPacket, const Packet8i& thenPacket, const Packet8i& elsePacket)
578
- {
579
- return _mm256_castps_si256(_mm256_blendv_ps(
580
- _mm256_castsi256_ps(elsePacket),
581
- _mm256_castsi256_ps(thenPacket),
582
- _mm256_castsi256_ps(ifPacket)
583
- ));
584
- }
585
-
586
- template<>
587
- EIGEN_STRONG_INLINE Packet4d pblendv(const Packet4d& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket)
588
- {
589
- return _mm256_blendv_pd(elsePacket, thenPacket, ifPacket);
590
- }
591
-
592
- template<>
593
- EIGEN_STRONG_INLINE Packet4d pblendv(const Packet8i& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket)
594
- {
595
- return pblendv(_mm256_castsi256_pd(ifPacket), thenPacket, elsePacket);
596
- }
597
-
598
- template<>
599
- EIGEN_STRONG_INLINE Packet8i pgather<Packet8i>(const int* addr, const Packet8i& index)
600
- {
601
- #ifdef EIGEN_VECTORIZE_AVX2
602
- return _mm256_i32gather_epi32(addr, index, 4);
603
- #else
604
- uint32_t u[8];
605
- _mm256_storeu_si256((Packet8i*)u, index);
606
- return _mm256_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
607
- addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
608
- #endif
609
- }
610
-
611
- template<>
612
- EIGEN_STRONG_INLINE Packet8f pgather<Packet8i>(const float *addr, const Packet8i& index)
613
- {
614
- #ifdef EIGEN_VECTORIZE_AVX2
615
- return _mm256_i32gather_ps(addr, index, 4);
616
- #else
617
- uint32_t u[8];
618
- _mm256_storeu_si256((Packet8i*)u, index);
619
- return _mm256_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
620
- addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
621
- #endif
622
- }
623
-
624
- template<>
625
- EIGEN_STRONG_INLINE Packet4d pgather<Packet8i>(const double *addr, const Packet8i& index, bool upperhalf)
626
- {
627
- #ifdef EIGEN_VECTORIZE_AVX2
628
- return _mm256_i32gather_pd(addr, _mm256_castsi256_si128(index), 8);
629
- #else
630
- uint32_t u[8];
631
- _mm256_storeu_si256((Packet8i*)u, index);
632
- if (upperhalf)
633
- {
634
- return _mm256_setr_pd(addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
635
- }
636
- else
637
- {
638
- return _mm256_setr_pd(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
639
- }
640
- #endif
371
+ /* update the sign */
372
+ s = pxor(xmm1, sign_bit_sin);
373
+ c = pxor(xmm2, sign_bit_cos);
641
374
  }
642
375
 
643
- template<>
644
- EIGEN_STRONG_INLINE int pmovemask<Packet8f>(const Packet8f& a)
376
+ template<typename Packet>
377
+ EIGEN_STRONG_INLINE typename std::enable_if<
378
+ IsDoublePacket<Packet>::value
379
+ >::type psincos(Packet x, Packet& s, Packet& c)
645
380
  {
646
- return _mm256_movemask_ps(a);
647
- }
381
+ Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
382
+ using IntPacket = decltype(reinterpret_to_int(x));
383
+ IntPacket emm0, emm2, emm4;
648
384
 
649
- template<>
650
- EIGEN_STRONG_INLINE int pmovemask<Packet4d>(const Packet4d& a)
651
- {
652
- return _mm256_movemask_pd(a);
653
- }
385
+ sign_bit_sin = x;
386
+ /* take the absolute value */
387
+ x = pabs(x);
388
+ /* extract the sign bit (upper one) */
389
+ sign_bit_sin = pext_sign(sign_bit_sin);
654
390
 
655
- template<>
656
- EIGEN_STRONG_INLINE int pmovemask<Packet8i>(const Packet8i& a)
657
- {
658
- return pmovemask(_mm256_castsi256_ps(a));
659
- }
391
+ /* scale by 4/Pi */
392
+ y = pmul(x, pset1<Packet>(1.27323954473516));
660
393
 
661
- template<>
662
- EIGEN_STRONG_INLINE Packet8f ptruncate<Packet8f>(const Packet8f& a)
663
- {
664
- return _mm256_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
665
- }
394
+ /* store the integer part of y in emm2 */
395
+ emm2 = pcast64<Packet, IntPacket>(y);
666
396
 
667
- template<>
668
- EIGEN_STRONG_INLINE Packet4d ptruncate<Packet4d>(const Packet4d& a)
669
- {
670
- return _mm256_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
671
- }
397
+ /* j=(j+1) & (~1) (see the cephes sources) */
398
+ emm2 = padd64(emm2, pseti64<IntPacket>(1));
399
+ emm2 = pand(emm2, pseti64<IntPacket>(~1ll));
400
+ y = pcast64<IntPacket, Packet>(emm2);
672
401
 
673
- template<>
674
- EIGEN_STRONG_INLINE Packet8i pcmpeq64<Packet8i>(const Packet8i& a, const Packet8i& b)
675
- {
676
- #ifdef EIGEN_VECTORIZE_AVX2
677
- return _mm256_cmpeq_epi64(a, b);
678
- #else
679
- Packet4i a1, a2, b1, b2;
680
- split_two(a, a1, a2);
681
- split_two(b, b1, b2);
682
- return combine_two((Packet4i)_mm_cmpeq_epi64(a1, b1), (Packet4i)_mm_cmpeq_epi64(a2, b2));
683
- #endif
684
- }
402
+ emm4 = emm2;
685
403
 
686
- template<>
687
- EIGEN_STRONG_INLINE Packet8i pmuluadd64<Packet8i>(const Packet8i& a, uint64_t b, uint64_t c)
688
- {
689
- uint64_t u[4];
690
- _mm256_storeu_si256((__m256i*)u, a);
691
- u[0] = u[0] * b + c;
692
- u[1] = u[1] * b + c;
693
- u[2] = u[2] * b + c;
694
- u[3] = u[3] * b + c;
695
- return _mm256_loadu_si256((__m256i*)u);
696
- }
697
- }
698
- }
699
- #endif
404
+ /* get the swap sign flag for the sine */
405
+ emm0 = pand(emm2, pseti64<IntPacket>(4));
406
+ emm0 = psll64<61>(emm0);
407
+ Packet swap_sign_bit_sin = reinterpret_to_double(emm0);
700
408
 
701
- #ifdef EIGEN_VECTORIZE_SSE2
702
- #include <xmmintrin.h>
409
+ /* get the polynom selection mask for the sine*/
410
+ emm2 = pand(emm2, pseti64<IntPacket>(2));
703
411
 
704
- namespace Eigen
705
- {
706
- namespace internal
707
- {
708
- template<>
709
- struct reinterpreter<Packet4i>
710
- {
711
- EIGEN_STRONG_INLINE Packet4f to_float(const Packet4i& x)
712
- {
713
- return _mm_castsi128_ps(x);
714
- }
412
+ emm2 = pcmpeq64(emm2, pseti64<IntPacket>(0));
413
+ Packet poly_mask = reinterpret_to_double(emm2);
715
414
 
716
- EIGEN_STRONG_INLINE Packet2d to_double(const Packet4i& x)
717
- {
718
- return _mm_castsi128_pd(x);
719
- }
415
+ /* The magic pass: "Extended precision modular arithmetic"
416
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
417
+ xmm1 = pset1<Packet>(-0.78515625);
418
+ xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
419
+ xmm3 = pset1<Packet>(-3.77489497744594108e-8);
420
+ xmm1 = pmul(y, xmm1);
421
+ xmm2 = pmul(y, xmm2);
422
+ xmm3 = pmul(y, xmm3);
423
+ x = padd(x, xmm1);
424
+ x = padd(x, xmm2);
425
+ x = padd(x, xmm3);
720
426
 
721
- EIGEN_STRONG_INLINE Packet4i to_int(const Packet4i& x)
722
- {
723
- return x;
724
- }
725
- };
427
+ emm4 = psub64(emm4, pseti64<IntPacket>(2));
428
+ #if defined(EIGEN_VECTORIZE_NEON) || defined(EIGENRAND_EIGEN_34_MODE)
429
+ emm4 = pandnot(pseti64<IntPacket>(4), emm4);
430
+ #else
431
+ emm4 = pandnot(emm4, pseti64<IntPacket>(4));
432
+ #endif
433
+ emm4 = psll64<61>(emm4);
434
+ Packet sign_bit_cos = reinterpret_to_double(emm4);
435
+ sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
726
436
 
727
- template<>
728
- struct reinterpreter<Packet4f>
729
- {
730
- EIGEN_STRONG_INLINE Packet4f to_float(const Packet4f& x)
731
- {
732
- return x;
733
- }
734
437
 
735
- EIGEN_STRONG_INLINE Packet2d to_double(const Packet4f& x)
736
- {
737
- return _mm_castps_pd(x);
738
- }
438
+ /* Evaluate the first polynom (0 <= x <= Pi/4) */
439
+ Packet z = pmul(x, x);
440
+ y = pset1<Packet>(2.443315711809948E-005);
739
441
 
740
- EIGEN_STRONG_INLINE Packet4i to_int(const Packet4f& x)
741
- {
742
- return _mm_castps_si128(x);
743
- }
744
- };
442
+ y = pmul(y, z);
443
+ y = padd(y, pset1<Packet>(-1.388731625493765E-003));
444
+ y = pmul(y, z);
445
+ y = padd(y, pset1<Packet>(4.166664568298827E-002));
446
+ y = pmul(y, z);
447
+ y = pmul(y, z);
448
+ Packet tmp = pmul(z, pset1<Packet>(0.5));
449
+ y = psub(y, tmp);
450
+ y = padd(y, pset1<Packet>(1));
745
451
 
746
- template<>
747
- struct reinterpreter<Packet2d>
748
- {
749
- EIGEN_STRONG_INLINE Packet4f to_float(const Packet2d& x)
750
- {
751
- return _mm_castpd_ps(x);
752
- }
452
+ /* Evaluate the second polynom (Pi/4 <= x <= 0) */
753
453
 
754
- EIGEN_STRONG_INLINE Packet2d to_double(const Packet2d& x)
755
- {
756
- return x;
757
- }
454
+ Packet y2 = pset1<Packet>(-1.9515295891E-4);
455
+ y2 = pmul(y2, z);
456
+ y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
457
+ y2 = pmul(y2, z);
458
+ y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
459
+ y2 = pmul(y2, z);
460
+ y2 = pmul(y2, x);
461
+ y2 = padd(y2, x);
758
462
 
759
- EIGEN_STRONG_INLINE Packet4i to_int(const Packet2d& x)
760
- {
761
- return _mm_castpd_si128(x);
762
- }
763
- };
463
+ /* select the correct result from the two polynoms */
464
+ xmm3 = poly_mask;
465
+ Packet ysin2 = pand(xmm3, y2);
466
+ #if defined(EIGEN_VECTORIZE_NEON) || defined(EIGENRAND_EIGEN_34_MODE)
467
+ Packet ysin1 = pandnot(y, xmm3);
468
+ #else
469
+ Packet ysin1 = pandnot(xmm3, y);
470
+ #endif
471
+ y2 = psub(y2, ysin2);
472
+ y = psub(y, ysin1);
764
473
 
765
- EIGEN_STRONG_INLINE void split_two(const Packet4i& x, uint64_t& a, uint64_t& b)
766
- {
767
- #ifdef EIGEN_VECTORIZE_SSE4_1
768
- a = _mm_extract_epi64(x, 0);
769
- b = _mm_extract_epi64(x, 1);
770
- #else
771
- uint64_t u[2];
772
- _mm_storeu_si128((__m128i*)u, x);
773
- a = u[0];
774
- b = u[1];
775
- #endif
776
- }
474
+ xmm1 = padd(ysin1, ysin2);
475
+ xmm2 = padd(y, y2);
777
476
 
778
- EIGEN_STRONG_INLINE Packet4i combine_low32(const Packet4i& a, const Packet4i& b)
779
- {
780
- auto sa = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
781
- auto sb = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 3, 1));
782
- sa = _mm_and_si128(sa, _mm_setr_epi32(-1, -1, 0, 0));
783
- sb = _mm_and_si128(sb, _mm_setr_epi32(0, 0, -1, -1));
784
- return _mm_or_si128(sa, sb);
477
+ /* update the sign */
478
+ s = pxor(xmm1, sign_bit_sin);
479
+ c = pxor(xmm2, sign_bit_cos);
785
480
  }
786
481
 
787
- template<>
788
- EIGEN_STRONG_INLINE Packet4i pseti64<Packet4i>(uint64_t a)
482
+ template<typename Packet>
483
+ EIGEN_STRONG_INLINE typename std::enable_if<
484
+ IsDoublePacket<Packet>::value, Packet
485
+ >::type _psin(Packet x)
789
486
  {
790
- return _mm_set1_epi64x(a);
791
- }
487
+ Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
488
+ using IntPacket = decltype(reinterpret_to_int(x));
489
+ IntPacket emm0, emm2;
792
490
 
793
- template<>
794
- EIGEN_STRONG_INLINE Packet4i pcmpeq<Packet4i>(const Packet4i& a, const Packet4i& b)
795
- {
796
- return _mm_cmpeq_epi32(a, b);
797
- }
491
+ sign_bit_sin = x;
492
+ /* take the absolute value */
493
+ x = pabs(x);
494
+ /* extract the sign bit (upper one) */
495
+ sign_bit_sin = pext_sign(sign_bit_sin);
798
496
 
799
- template<>
800
- EIGEN_STRONG_INLINE Packet4i psll<Packet4i>(const Packet4i& a, int b)
801
- {
802
- return _mm_slli_epi32(a, b);
803
- }
497
+ /* scale by 4/Pi */
498
+ y = pmul(x, pset1<Packet>(1.27323954473516));
804
499
 
805
- template<>
806
- EIGEN_STRONG_INLINE Packet4i psrl<Packet4i>(const Packet4i& a, int b)
807
- {
808
- return _mm_srli_epi32(a, b);
809
- }
500
+ /* store the integer part of y in emm2 */
501
+ emm2 = pcast64<Packet, IntPacket>(y);
810
502
 
503
+ /* j=(j+1) & (~1) (see the cephes sources) */
504
+ emm2 = padd64(emm2, pseti64<IntPacket>(1));
505
+ emm2 = pand(emm2, pseti64<IntPacket>(~1ll));
506
+ y = pcast64<IntPacket, Packet>(emm2);
811
507
 
812
- template<>
813
- EIGEN_STRONG_INLINE Packet4i psll64<Packet4i>(const Packet4i& a, int b)
814
- {
815
- return _mm_slli_epi64(a, b);
816
- }
508
+ /* get the swap sign flag for the sine */
509
+ emm0 = pand(emm2, pseti64<IntPacket>(4));
510
+ emm0 = psll64<61>(emm0);
511
+ Packet swap_sign_bit_sin = reinterpret_to_double(emm0);
817
512
 
818
- template<>
819
- EIGEN_STRONG_INLINE Packet4i psrl64<Packet4i>(const Packet4i& a, int b)
820
- {
821
- return _mm_srli_epi64(a, b);
822
- }
513
+ /* get the polynom selection mask for the sine*/
514
+ emm2 = pand(emm2, pseti64<IntPacket>(2));
823
515
 
824
- template<>
825
- EIGEN_STRONG_INLINE Packet4i pcmplt<Packet4i>(const Packet4i& a, const Packet4i& b)
826
- {
827
- return _mm_cmplt_epi32(a, b);
828
- }
516
+ emm2 = pcmpeq64(emm2, pseti64<IntPacket>(0));
517
+ Packet poly_mask = reinterpret_to_double(emm2);
829
518
 
830
- template<>
831
- EIGEN_STRONG_INLINE Packet4f pcmplt<Packet4f>(const Packet4f& a, const Packet4f& b)
832
- {
833
- return _mm_cmplt_ps(a, b);
834
- }
519
+ /* The magic pass: "Extended precision modular arithmetic"
520
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
521
+ xmm1 = pset1<Packet>(-0.78515625);
522
+ xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
523
+ xmm3 = pset1<Packet>(-3.77489497744594108e-8);
524
+ xmm1 = pmul(y, xmm1);
525
+ xmm2 = pmul(y, xmm2);
526
+ xmm3 = pmul(y, xmm3);
527
+ x = padd(x, xmm1);
528
+ x = padd(x, xmm2);
529
+ x = padd(x, xmm3);
835
530
 
836
- template<>
837
- EIGEN_STRONG_INLINE Packet4f pcmple<Packet4f>(const Packet4f& a, const Packet4f& b)
838
- {
839
- return _mm_cmple_ps(a, b);
840
- }
531
+ sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
841
532
 
842
- template<>
843
- EIGEN_STRONG_INLINE Packet2d pcmplt<Packet2d>(const Packet2d& a, const Packet2d& b)
844
- {
845
- return _mm_cmplt_pd(a, b);
846
- }
847
533
 
848
- template<>
849
- EIGEN_STRONG_INLINE Packet2d pcmple<Packet2d>(const Packet2d& a, const Packet2d& b)
850
- {
851
- return _mm_cmple_pd(a, b);
852
- }
534
+ /* Evaluate the first polynom (0 <= x <= Pi/4) */
535
+ Packet z = pmul(x, x);
536
+ y = pset1<Packet>(2.443315711809948E-005);
853
537
 
854
- template<>
855
- EIGEN_STRONG_INLINE Packet4f pblendv(const Packet4f& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket)
856
- {
857
- #ifdef EIGEN_VECTORIZE_SSE4_1
858
- return _mm_blendv_ps(elsePacket, thenPacket, ifPacket);
859
- #else
860
- return _mm_or_ps(_mm_and_ps(ifPacket, thenPacket), _mm_andnot_ps(ifPacket, elsePacket));
861
- #endif
862
- }
538
+ y = pmul(y, z);
539
+ y = padd(y, pset1<Packet>(-1.388731625493765E-003));
540
+ y = pmul(y, z);
541
+ y = padd(y, pset1<Packet>(4.166664568298827E-002));
542
+ y = pmul(y, z);
543
+ y = pmul(y, z);
544
+ Packet tmp = pmul(z, pset1<Packet>(0.5));
545
+ y = psub(y, tmp);
546
+ y = padd(y, pset1<Packet>(1));
863
547
 
864
- template<>
865
- EIGEN_STRONG_INLINE Packet4f pblendv(const Packet4i& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket)
866
- {
867
- return pblendv(_mm_castsi128_ps(ifPacket), thenPacket, elsePacket);
868
- }
548
+ /* Evaluate the second polynom (Pi/4 <= x <= 0) */
869
549
 
870
- template<>
871
- EIGEN_STRONG_INLINE Packet4i pblendv(const Packet4i& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket)
872
- {
873
- #ifdef EIGEN_VECTORIZE_SSE4_1
874
- return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(elsePacket), _mm_castsi128_ps(thenPacket), _mm_castsi128_ps(ifPacket)));
875
- #else
876
- return _mm_or_si128(_mm_and_si128(ifPacket, thenPacket), _mm_andnot_si128(ifPacket, elsePacket));
877
- #endif
878
- }
550
+ Packet y2 = pset1<Packet>(-1.9515295891E-4);
551
+ y2 = pmul(y2, z);
552
+ y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
553
+ y2 = pmul(y2, z);
554
+ y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
555
+ y2 = pmul(y2, z);
556
+ y2 = pmul(y2, x);
557
+ y2 = padd(y2, x);
879
558
 
880
- template<>
881
- EIGEN_STRONG_INLINE Packet2d pblendv(const Packet2d& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket)
882
- {
883
- #ifdef EIGEN_VECTORIZE_SSE4_1
884
- return _mm_blendv_pd(elsePacket, thenPacket, ifPacket);
885
- #else
886
- return _mm_or_pd(_mm_and_pd(ifPacket, thenPacket), _mm_andnot_pd(ifPacket, elsePacket));
887
- #endif
888
- }
559
+ /* select the correct result from the two polynoms */
560
+ xmm3 = poly_mask;
561
+ Packet ysin2 = pand(xmm3, y2);
562
+ #if defined(EIGEN_VECTORIZE_NEON) || defined(EIGENRAND_EIGEN_34_MODE)
563
+ Packet ysin1 = pandnot(y, xmm3);
564
+ #else
565
+ Packet ysin1 = pandnot(xmm3, y);
566
+ #endif
889
567
 
568
+ xmm1 = padd(ysin1, ysin2);
890
569
 
891
- template<>
892
- EIGEN_STRONG_INLINE Packet2d pblendv(const Packet4i& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket)
893
- {
894
- return pblendv(_mm_castsi128_pd(ifPacket), thenPacket, elsePacket);
570
+ /* update the sign */
571
+ return pxor(xmm1, sign_bit_sin);
895
572
  }
573
+ }
574
+ }
896
575
 
897
- template<>
898
- EIGEN_STRONG_INLINE Packet4i pgather<Packet4i>(const int* addr, const Packet4i& index)
899
- {
900
- #ifdef EIGEN_VECTORIZE_AVX2
901
- return _mm_i32gather_epi32(addr, index, 4);
902
- #else
903
- uint32_t u[4];
904
- _mm_storeu_si128((__m128i*)u, index);
905
- return _mm_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
576
+ #ifdef EIGEN_VECTORIZE_AVX
577
+ #include "arch/AVX/MorePacketMath.h"
906
578
  #endif
907
- }
908
579
 
909
- template<>
910
- EIGEN_STRONG_INLINE Packet4f pgather<Packet4i>(const float* addr, const Packet4i& index)
911
- {
912
- #ifdef EIGEN_VECTORIZE_AVX2
913
- return _mm_i32gather_ps(addr, index, 4);
914
- #else
915
- uint32_t u[4];
916
- _mm_storeu_si128((__m128i*)u, index);
917
- return _mm_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
580
+ #ifdef EIGEN_VECTORIZE_SSE2
581
+ #include "arch/SSE/MorePacketMath.h"
918
582
  #endif
919
- }
920
583
 
921
- template<>
922
- EIGEN_STRONG_INLINE Packet2d pgather<Packet4i>(const double* addr, const Packet4i& index, bool upperhalf)
923
- {
924
- #ifdef EIGEN_VECTORIZE_AVX2
925
- return _mm_i32gather_pd(addr, index, 8);
926
- #else
927
- uint32_t u[4];
928
- _mm_storeu_si128((__m128i*)u, index);
929
- if (upperhalf)
930
- {
931
- return _mm_setr_pd(addr[u[2]], addr[u[3]]);
932
- }
933
- else
934
- {
935
- return _mm_setr_pd(addr[u[0]], addr[u[1]]);
936
- }
584
+ #ifdef EIGEN_VECTORIZE_NEON
585
+ #include "arch/NEON/MorePacketMath.h"
937
586
  #endif
938
- }
939
-
940
- template<>
941
- EIGEN_STRONG_INLINE int pmovemask<Packet4f>(const Packet4f& a)
942
- {
943
- return _mm_movemask_ps(a);
944
- }
945
-
946
- template<>
947
- EIGEN_STRONG_INLINE int pmovemask<Packet2d>(const Packet2d& a)
948
- {
949
- return _mm_movemask_pd(a);
950
- }
951
587
 
952
- template<>
953
- EIGEN_STRONG_INLINE int pmovemask<Packet4i>(const Packet4i& a)
954
- {
955
- return pmovemask((Packet4f)_mm_castsi128_ps(a));
956
- }
957
-
958
- template<>
959
- EIGEN_STRONG_INLINE Packet4f ptruncate<Packet4f>(const Packet4f& a)
588
+ namespace Eigen
589
+ {
590
+ namespace internal
591
+ {
592
+ template<int b, typename Packet>
593
+ EIGEN_STRONG_INLINE Packet psll(const Packet& a)
960
594
  {
961
- #ifdef EIGEN_VECTORIZE_SSE4_1
962
- return _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
963
- #else
964
- auto round = _MM_GET_ROUNDING_MODE();
965
- _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
966
- auto ret = _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
967
- _MM_SET_ROUNDING_MODE(round);
968
- return ret;
969
- #endif
595
+ return BitShifter<Packet>{}.template sll<b>(a);
970
596
  }
971
597
 
972
- template<>
973
- EIGEN_STRONG_INLINE Packet2d ptruncate<Packet2d>(const Packet2d& a)
598
+ template<int _b, typename Packet>
599
+ EIGEN_STRONG_INLINE Packet psrl(const Packet& a, int b)
974
600
  {
975
- #ifdef EIGEN_VECTORIZE_SSE4_1
976
- return _mm_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
977
- #else
978
- auto round = _MM_GET_ROUNDING_MODE();
979
- _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
980
- auto ret = _mm_cvtepi32_pd(_mm_cvtpd_epi32(a));
981
- _MM_SET_ROUNDING_MODE(round);
982
- return ret;
983
- #endif
601
+ return BitShifter<Packet>{}.template srl<_b>(a, b);
984
602
  }
985
603
 
986
- template<>
987
- EIGEN_STRONG_INLINE Packet4i pcmpeq64<Packet4i>(const Packet4i& a, const Packet4i& b)
604
+ template<int b, typename Packet>
605
+ EIGEN_STRONG_INLINE Packet psll64(const Packet& a)
988
606
  {
989
- #ifdef EIGEN_VECTORIZE_SSE4_1
990
- return _mm_cmpeq_epi64(a, b);
991
- #else
992
- Packet4i c = _mm_cmpeq_epi32(a, b);
993
- return pand(c, (Packet4i)_mm_shuffle_epi32(c, _MM_SHUFFLE(2, 3, 0, 1)));
994
- #endif
607
+ return BitShifter<Packet>{}.template sll64<b>(a);
995
608
  }
996
609
 
997
- template<>
998
- EIGEN_STRONG_INLINE Packet4i pmuluadd64<Packet4i>(const Packet4i& a, uint64_t b, uint64_t c)
610
+ template<int b, typename Packet>
611
+ EIGEN_STRONG_INLINE Packet psrl64(const Packet& a)
999
612
  {
1000
- uint64_t u[2];
1001
- _mm_storeu_si128((__m128i*)u, a);
1002
- u[0] = u[0] * b + c;
1003
- u[1] = u[1] * b + c;
1004
- return _mm_loadu_si128((__m128i*)u);
613
+ return BitShifter<Packet>{}.template srl64<b>(a);
1005
614
  }
1006
615
  }
1007
616
  }
1008
- #endif
1009
617
 
1010
- #endif
618
+ #endif