tomoto 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (369)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/README.md +8 -10
  4. data/ext/tomoto/ct.cpp +11 -11
  5. data/ext/tomoto/dmr.cpp +14 -13
  6. data/ext/tomoto/dt.cpp +14 -14
  7. data/ext/tomoto/extconf.rb +7 -5
  8. data/ext/tomoto/gdmr.cpp +7 -7
  9. data/ext/tomoto/hdp.cpp +9 -9
  10. data/ext/tomoto/hlda.cpp +13 -13
  11. data/ext/tomoto/hpa.cpp +5 -5
  12. data/ext/tomoto/lda.cpp +42 -39
  13. data/ext/tomoto/llda.cpp +6 -6
  14. data/ext/tomoto/mglda.cpp +15 -15
  15. data/ext/tomoto/pa.cpp +6 -6
  16. data/ext/tomoto/plda.cpp +6 -6
  17. data/ext/tomoto/slda.cpp +8 -8
  18. data/ext/tomoto/{ext.cpp → tomoto.cpp} +8 -8
  19. data/ext/tomoto/utils.h +16 -70
  20. data/lib/tomoto/version.rb +1 -1
  21. data/lib/tomoto.rb +5 -1
  22. data/vendor/EigenRand/EigenRand/Core.h +10 -10
  23. data/vendor/EigenRand/EigenRand/Dists/Basic.h +208 -9
  24. data/vendor/EigenRand/EigenRand/Dists/Discrete.h +52 -31
  25. data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +9 -8
  26. data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +28 -21
  27. data/vendor/EigenRand/EigenRand/EigenRand +11 -6
  28. data/vendor/EigenRand/EigenRand/Macro.h +13 -7
  29. data/vendor/EigenRand/EigenRand/MorePacketMath.h +348 -740
  30. data/vendor/EigenRand/EigenRand/MvDists/Multinomial.h +5 -3
  31. data/vendor/EigenRand/EigenRand/MvDists/MvNormal.h +9 -3
  32. data/vendor/EigenRand/EigenRand/PacketFilter.h +11 -253
  33. data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +21 -47
  34. data/vendor/EigenRand/EigenRand/RandUtils.h +50 -344
  35. data/vendor/EigenRand/EigenRand/arch/AVX/MorePacketMath.h +619 -0
  36. data/vendor/EigenRand/EigenRand/arch/AVX/PacketFilter.h +149 -0
  37. data/vendor/EigenRand/EigenRand/arch/AVX/RandUtils.h +228 -0
  38. data/vendor/EigenRand/EigenRand/arch/NEON/MorePacketMath.h +473 -0
  39. data/vendor/EigenRand/EigenRand/arch/NEON/PacketFilter.h +142 -0
  40. data/vendor/EigenRand/EigenRand/arch/NEON/RandUtils.h +126 -0
  41. data/vendor/EigenRand/EigenRand/arch/SSE/MorePacketMath.h +501 -0
  42. data/vendor/EigenRand/EigenRand/arch/SSE/PacketFilter.h +133 -0
  43. data/vendor/EigenRand/EigenRand/arch/SSE/RandUtils.h +120 -0
  44. data/vendor/EigenRand/EigenRand/doc.h +24 -12
  45. data/vendor/EigenRand/README.md +57 -4
  46. data/vendor/eigen/COPYING.APACHE +203 -0
  47. data/vendor/eigen/COPYING.BSD +1 -1
  48. data/vendor/eigen/COPYING.MINPACK +51 -52
  49. data/vendor/eigen/Eigen/Cholesky +0 -1
  50. data/vendor/eigen/Eigen/Core +112 -265
  51. data/vendor/eigen/Eigen/Eigenvalues +2 -3
  52. data/vendor/eigen/Eigen/Geometry +5 -8
  53. data/vendor/eigen/Eigen/Householder +0 -1
  54. data/vendor/eigen/Eigen/Jacobi +0 -1
  55. data/vendor/eigen/Eigen/KLUSupport +41 -0
  56. data/vendor/eigen/Eigen/LU +2 -5
  57. data/vendor/eigen/Eigen/OrderingMethods +0 -3
  58. data/vendor/eigen/Eigen/PaStiXSupport +1 -0
  59. data/vendor/eigen/Eigen/PardisoSupport +0 -0
  60. data/vendor/eigen/Eigen/QR +2 -3
  61. data/vendor/eigen/Eigen/QtAlignedMalloc +0 -1
  62. data/vendor/eigen/Eigen/SVD +0 -1
  63. data/vendor/eigen/Eigen/Sparse +0 -2
  64. data/vendor/eigen/Eigen/SparseCholesky +0 -8
  65. data/vendor/eigen/Eigen/SparseLU +4 -0
  66. data/vendor/eigen/Eigen/SparseQR +0 -1
  67. data/vendor/eigen/Eigen/src/Cholesky/LDLT.h +42 -27
  68. data/vendor/eigen/Eigen/src/Cholesky/LLT.h +39 -23
  69. data/vendor/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +90 -47
  70. data/vendor/eigen/Eigen/src/Core/ArithmeticSequence.h +413 -0
  71. data/vendor/eigen/Eigen/src/Core/Array.h +99 -11
  72. data/vendor/eigen/Eigen/src/Core/ArrayBase.h +3 -3
  73. data/vendor/eigen/Eigen/src/Core/ArrayWrapper.h +21 -21
  74. data/vendor/eigen/Eigen/src/Core/Assign.h +1 -1
  75. data/vendor/eigen/Eigen/src/Core/AssignEvaluator.h +125 -50
  76. data/vendor/eigen/Eigen/src/Core/Assign_MKL.h +10 -10
  77. data/vendor/eigen/Eigen/src/Core/BandMatrix.h +16 -16
  78. data/vendor/eigen/Eigen/src/Core/Block.h +56 -60
  79. data/vendor/eigen/Eigen/src/Core/BooleanRedux.h +29 -31
  80. data/vendor/eigen/Eigen/src/Core/CommaInitializer.h +7 -3
  81. data/vendor/eigen/Eigen/src/Core/CoreEvaluators.h +325 -272
  82. data/vendor/eigen/Eigen/src/Core/CoreIterators.h +5 -0
  83. data/vendor/eigen/Eigen/src/Core/CwiseBinaryOp.h +21 -22
  84. data/vendor/eigen/Eigen/src/Core/CwiseNullaryOp.h +153 -18
  85. data/vendor/eigen/Eigen/src/Core/CwiseUnaryOp.h +6 -6
  86. data/vendor/eigen/Eigen/src/Core/CwiseUnaryView.h +14 -10
  87. data/vendor/eigen/Eigen/src/Core/DenseBase.h +132 -42
  88. data/vendor/eigen/Eigen/src/Core/DenseCoeffsBase.h +25 -21
  89. data/vendor/eigen/Eigen/src/Core/DenseStorage.h +153 -71
  90. data/vendor/eigen/Eigen/src/Core/Diagonal.h +21 -23
  91. data/vendor/eigen/Eigen/src/Core/DiagonalMatrix.h +50 -2
  92. data/vendor/eigen/Eigen/src/Core/DiagonalProduct.h +1 -1
  93. data/vendor/eigen/Eigen/src/Core/Dot.h +10 -10
  94. data/vendor/eigen/Eigen/src/Core/EigenBase.h +10 -9
  95. data/vendor/eigen/Eigen/src/Core/ForceAlignedAccess.h +8 -4
  96. data/vendor/eigen/Eigen/src/Core/Fuzzy.h +3 -3
  97. data/vendor/eigen/Eigen/src/Core/GeneralProduct.h +20 -10
  98. data/vendor/eigen/Eigen/src/Core/GenericPacketMath.h +599 -152
  99. data/vendor/eigen/Eigen/src/Core/GlobalFunctions.h +40 -33
  100. data/vendor/eigen/Eigen/src/Core/IO.h +40 -7
  101. data/vendor/eigen/Eigen/src/Core/IndexedView.h +237 -0
  102. data/vendor/eigen/Eigen/src/Core/Inverse.h +9 -10
  103. data/vendor/eigen/Eigen/src/Core/Map.h +7 -7
  104. data/vendor/eigen/Eigen/src/Core/MapBase.h +10 -3
  105. data/vendor/eigen/Eigen/src/Core/MathFunctions.h +767 -125
  106. data/vendor/eigen/Eigen/src/Core/MathFunctionsImpl.h +118 -19
  107. data/vendor/eigen/Eigen/src/Core/Matrix.h +131 -25
  108. data/vendor/eigen/Eigen/src/Core/MatrixBase.h +21 -3
  109. data/vendor/eigen/Eigen/src/Core/NestByValue.h +25 -50
  110. data/vendor/eigen/Eigen/src/Core/NoAlias.h +4 -3
  111. data/vendor/eigen/Eigen/src/Core/NumTraits.h +107 -20
  112. data/vendor/eigen/Eigen/src/Core/PartialReduxEvaluator.h +232 -0
  113. data/vendor/eigen/Eigen/src/Core/PermutationMatrix.h +3 -31
  114. data/vendor/eigen/Eigen/src/Core/PlainObjectBase.h +152 -59
  115. data/vendor/eigen/Eigen/src/Core/Product.h +30 -25
  116. data/vendor/eigen/Eigen/src/Core/ProductEvaluators.h +192 -125
  117. data/vendor/eigen/Eigen/src/Core/Random.h +37 -1
  118. data/vendor/eigen/Eigen/src/Core/Redux.h +180 -170
  119. data/vendor/eigen/Eigen/src/Core/Ref.h +121 -23
  120. data/vendor/eigen/Eigen/src/Core/Replicate.h +8 -8
  121. data/vendor/eigen/Eigen/src/Core/Reshaped.h +454 -0
  122. data/vendor/eigen/Eigen/src/Core/ReturnByValue.h +7 -5
  123. data/vendor/eigen/Eigen/src/Core/Reverse.h +18 -12
  124. data/vendor/eigen/Eigen/src/Core/Select.h +8 -6
  125. data/vendor/eigen/Eigen/src/Core/SelfAdjointView.h +33 -20
  126. data/vendor/eigen/Eigen/src/Core/Solve.h +14 -14
  127. data/vendor/eigen/Eigen/src/Core/SolveTriangular.h +16 -16
  128. data/vendor/eigen/Eigen/src/Core/SolverBase.h +41 -3
  129. data/vendor/eigen/Eigen/src/Core/StableNorm.h +100 -70
  130. data/vendor/eigen/Eigen/src/Core/StlIterators.h +463 -0
  131. data/vendor/eigen/Eigen/src/Core/Stride.h +9 -4
  132. data/vendor/eigen/Eigen/src/Core/Swap.h +5 -4
  133. data/vendor/eigen/Eigen/src/Core/Transpose.h +88 -27
  134. data/vendor/eigen/Eigen/src/Core/Transpositions.h +26 -47
  135. data/vendor/eigen/Eigen/src/Core/TriangularMatrix.h +93 -75
  136. data/vendor/eigen/Eigen/src/Core/VectorBlock.h +5 -5
  137. data/vendor/eigen/Eigen/src/Core/VectorwiseOp.h +159 -70
  138. data/vendor/eigen/Eigen/src/Core/Visitor.h +137 -29
  139. data/vendor/eigen/Eigen/src/Core/arch/AVX/Complex.h +50 -129
  140. data/vendor/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +126 -337
  141. data/vendor/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +1092 -155
  142. data/vendor/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +65 -1
  143. data/vendor/eigen/Eigen/src/Core/arch/AVX512/Complex.h +422 -0
  144. data/vendor/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +207 -236
  145. data/vendor/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1482 -495
  146. data/vendor/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +89 -0
  147. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +152 -165
  148. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +19 -251
  149. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2937 -0
  150. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +221 -0
  151. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +629 -0
  152. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2042 -392
  153. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Complex.h +235 -80
  154. data/vendor/eigen/Eigen/src/Core/arch/Default/BFloat16.h +700 -0
  155. data/vendor/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +102 -14
  156. data/vendor/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1649 -0
  157. data/vendor/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +110 -0
  158. data/vendor/eigen/Eigen/src/Core/arch/Default/Half.h +942 -0
  159. data/vendor/eigen/Eigen/src/Core/arch/Default/Settings.h +1 -1
  160. data/vendor/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +120 -0
  161. data/vendor/eigen/Eigen/src/Core/arch/{CUDA → GPU}/MathFunctions.h +16 -4
  162. data/vendor/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1685 -0
  163. data/vendor/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +80 -0
  164. data/vendor/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  165. data/vendor/eigen/Eigen/src/Core/arch/MSA/Complex.h +648 -0
  166. data/vendor/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +387 -0
  167. data/vendor/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1233 -0
  168. data/vendor/eigen/Eigen/src/Core/arch/NEON/Complex.h +313 -219
  169. data/vendor/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +183 -0
  170. data/vendor/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +54 -70
  171. data/vendor/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4376 -549
  172. data/vendor/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1419 -0
  173. data/vendor/eigen/Eigen/src/Core/arch/SSE/Complex.h +59 -179
  174. data/vendor/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +65 -428
  175. data/vendor/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +893 -283
  176. data/vendor/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +65 -0
  177. data/vendor/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +44 -0
  178. data/vendor/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +752 -0
  179. data/vendor/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +49 -0
  180. data/vendor/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +232 -0
  181. data/vendor/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +301 -0
  182. data/vendor/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +670 -0
  183. data/vendor/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +694 -0
  184. data/vendor/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +85 -0
  185. data/vendor/eigen/Eigen/src/Core/arch/ZVector/Complex.h +212 -183
  186. data/vendor/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +101 -5
  187. data/vendor/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +510 -395
  188. data/vendor/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +11 -2
  189. data/vendor/eigen/Eigen/src/Core/functors/BinaryFunctors.h +112 -46
  190. data/vendor/eigen/Eigen/src/Core/functors/NullaryFunctors.h +31 -30
  191. data/vendor/eigen/Eigen/src/Core/functors/StlFunctors.h +32 -2
  192. data/vendor/eigen/Eigen/src/Core/functors/UnaryFunctors.h +355 -16
  193. data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1075 -586
  194. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +49 -24
  195. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +41 -35
  196. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +6 -6
  197. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +4 -2
  198. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +382 -483
  199. data/vendor/eigen/Eigen/src/Core/products/Parallelizer.h +22 -5
  200. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +53 -30
  201. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +16 -8
  202. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +8 -6
  203. data/vendor/eigen/Eigen/src/Core/products/SelfadjointProduct.h +4 -4
  204. data/vendor/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +5 -4
  205. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +33 -27
  206. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +14 -12
  207. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +36 -34
  208. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +8 -4
  209. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverVector.h +13 -10
  210. data/vendor/eigen/Eigen/src/Core/util/BlasUtil.h +304 -119
  211. data/vendor/eigen/Eigen/src/Core/util/ConfigureVectorization.h +512 -0
  212. data/vendor/eigen/Eigen/src/Core/util/Constants.h +25 -9
  213. data/vendor/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +26 -3
  214. data/vendor/eigen/Eigen/src/Core/util/ForwardDeclarations.h +29 -9
  215. data/vendor/eigen/Eigen/src/Core/util/IndexedViewHelper.h +186 -0
  216. data/vendor/eigen/Eigen/src/Core/util/IntegralConstant.h +272 -0
  217. data/vendor/eigen/Eigen/src/Core/util/MKL_support.h +8 -1
  218. data/vendor/eigen/Eigen/src/Core/util/Macros.h +709 -246
  219. data/vendor/eigen/Eigen/src/Core/util/Memory.h +222 -52
  220. data/vendor/eigen/Eigen/src/Core/util/Meta.h +355 -77
  221. data/vendor/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +5 -1
  222. data/vendor/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  223. data/vendor/eigen/Eigen/src/Core/util/StaticAssert.h +8 -5
  224. data/vendor/eigen/Eigen/src/Core/util/SymbolicIndex.h +293 -0
  225. data/vendor/eigen/Eigen/src/Core/util/XprHelper.h +65 -30
  226. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +1 -1
  227. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +7 -4
  228. data/vendor/eigen/Eigen/src/Eigenvalues/EigenSolver.h +2 -2
  229. data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +1 -1
  230. data/vendor/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +2 -2
  231. data/vendor/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +2 -2
  232. data/vendor/eigen/Eigen/src/Eigenvalues/RealQZ.h +9 -6
  233. data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur.h +21 -9
  234. data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +77 -43
  235. data/vendor/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +20 -15
  236. data/vendor/eigen/Eigen/src/Geometry/AlignedBox.h +99 -5
  237. data/vendor/eigen/Eigen/src/Geometry/AngleAxis.h +4 -4
  238. data/vendor/eigen/Eigen/src/Geometry/EulerAngles.h +3 -3
  239. data/vendor/eigen/Eigen/src/Geometry/Homogeneous.h +15 -11
  240. data/vendor/eigen/Eigen/src/Geometry/Hyperplane.h +1 -1
  241. data/vendor/eigen/Eigen/src/Geometry/OrthoMethods.h +3 -2
  242. data/vendor/eigen/Eigen/src/Geometry/ParametrizedLine.h +39 -2
  243. data/vendor/eigen/Eigen/src/Geometry/Quaternion.h +70 -14
  244. data/vendor/eigen/Eigen/src/Geometry/Rotation2D.h +3 -3
  245. data/vendor/eigen/Eigen/src/Geometry/Scaling.h +23 -5
  246. data/vendor/eigen/Eigen/src/Geometry/Transform.h +88 -67
  247. data/vendor/eigen/Eigen/src/Geometry/Translation.h +6 -12
  248. data/vendor/eigen/Eigen/src/Geometry/Umeyama.h +1 -1
  249. data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +168 -0
  250. data/vendor/eigen/Eigen/src/Householder/BlockHouseholder.h +9 -2
  251. data/vendor/eigen/Eigen/src/Householder/Householder.h +8 -4
  252. data/vendor/eigen/Eigen/src/Householder/HouseholderSequence.h +123 -48
  253. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +15 -15
  254. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +7 -23
  255. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +5 -22
  256. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +41 -47
  257. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +51 -60
  258. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +70 -20
  259. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +2 -20
  260. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +11 -9
  261. data/vendor/eigen/Eigen/src/Jacobi/Jacobi.h +31 -10
  262. data/vendor/eigen/Eigen/src/KLUSupport/KLUSupport.h +358 -0
  263. data/vendor/eigen/Eigen/src/LU/Determinant.h +35 -19
  264. data/vendor/eigen/Eigen/src/LU/FullPivLU.h +29 -43
  265. data/vendor/eigen/Eigen/src/LU/InverseImpl.h +25 -8
  266. data/vendor/eigen/Eigen/src/LU/PartialPivLU.h +71 -58
  267. data/vendor/eigen/Eigen/src/LU/arch/InverseSize4.h +351 -0
  268. data/vendor/eigen/Eigen/src/OrderingMethods/Amd.h +7 -17
  269. data/vendor/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +297 -277
  270. data/vendor/eigen/Eigen/src/OrderingMethods/Ordering.h +6 -10
  271. data/vendor/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +1 -1
  272. data/vendor/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +9 -7
  273. data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR.h +41 -20
  274. data/vendor/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +100 -27
  275. data/vendor/eigen/Eigen/src/QR/FullPivHouseholderQR.h +59 -22
  276. data/vendor/eigen/Eigen/src/QR/HouseholderQR.h +48 -23
  277. data/vendor/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +25 -3
  278. data/vendor/eigen/Eigen/src/SVD/BDCSVD.h +183 -63
  279. data/vendor/eigen/Eigen/src/SVD/JacobiSVD.h +22 -14
  280. data/vendor/eigen/Eigen/src/SVD/SVDBase.h +83 -22
  281. data/vendor/eigen/Eigen/src/SVD/UpperBidiagonalization.h +3 -3
  282. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +17 -9
  283. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +12 -37
  284. data/vendor/eigen/Eigen/src/SparseCore/AmbiVector.h +3 -2
  285. data/vendor/eigen/Eigen/src/SparseCore/CompressedStorage.h +16 -0
  286. data/vendor/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +6 -6
  287. data/vendor/eigen/Eigen/src/SparseCore/SparseAssign.h +81 -27
  288. data/vendor/eigen/Eigen/src/SparseCore/SparseBlock.h +25 -57
  289. data/vendor/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +40 -11
  290. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +11 -15
  291. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +4 -2
  292. data/vendor/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +30 -8
  293. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrix.h +126 -11
  294. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +5 -12
  295. data/vendor/eigen/Eigen/src/SparseCore/SparseProduct.h +13 -1
  296. data/vendor/eigen/Eigen/src/SparseCore/SparseRef.h +7 -7
  297. data/vendor/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +5 -2
  298. data/vendor/eigen/Eigen/src/SparseCore/SparseUtil.h +8 -0
  299. data/vendor/eigen/Eigen/src/SparseCore/SparseVector.h +1 -1
  300. data/vendor/eigen/Eigen/src/SparseCore/SparseView.h +1 -0
  301. data/vendor/eigen/Eigen/src/SparseLU/SparseLU.h +162 -12
  302. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +1 -1
  303. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +76 -2
  304. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +2 -2
  305. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +1 -1
  306. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +1 -1
  307. data/vendor/eigen/Eigen/src/SparseQR/SparseQR.h +19 -6
  308. data/vendor/eigen/Eigen/src/StlSupport/StdDeque.h +2 -12
  309. data/vendor/eigen/Eigen/src/StlSupport/StdList.h +2 -2
  310. data/vendor/eigen/Eigen/src/StlSupport/StdVector.h +2 -2
  311. data/vendor/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +6 -8
  312. data/vendor/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +175 -39
  313. data/vendor/eigen/Eigen/src/misc/lapacke.h +5 -4
  314. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +28 -2
  315. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +155 -11
  316. data/vendor/eigen/Eigen/src/plugins/BlockMethods.h +626 -242
  317. data/vendor/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +14 -0
  318. data/vendor/eigen/Eigen/src/plugins/IndexedViewMethods.h +262 -0
  319. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +4 -4
  320. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +10 -0
  321. data/vendor/eigen/Eigen/src/plugins/ReshapedMethods.h +149 -0
  322. data/vendor/eigen/README.md +2 -0
  323. data/vendor/eigen/bench/btl/README +1 -1
  324. data/vendor/eigen/bench/tensors/README +6 -7
  325. data/vendor/eigen/ci/README.md +56 -0
  326. data/vendor/eigen/demos/mix_eigen_and_c/README +1 -1
  327. data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +213 -158
  328. data/vendor/eigen/unsupported/README.txt +1 -1
  329. data/vendor/tomotopy/README.kr.rst +78 -0
  330. data/vendor/tomotopy/README.rst +75 -0
  331. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +2 -2
  332. data/vendor/tomotopy/src/Labeling/Phraser.hpp +4 -4
  333. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +7 -3
  334. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +7 -3
  335. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +6 -3
  336. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +2 -2
  337. data/vendor/tomotopy/src/TopicModel/HDP.h +1 -0
  338. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +57 -6
  339. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +6 -3
  340. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +3 -2
  341. data/vendor/tomotopy/src/TopicModel/LDA.h +3 -3
  342. data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +5 -5
  343. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +50 -19
  344. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +6 -2
  345. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +3 -2
  346. data/vendor/tomotopy/src/TopicModel/PAModel.hpp +1 -1
  347. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +6 -2
  348. data/vendor/tomotopy/src/TopicModel/PT.h +3 -1
  349. data/vendor/tomotopy/src/TopicModel/PTModel.hpp +36 -3
  350. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +6 -3
  351. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +55 -26
  352. data/vendor/tomotopy/src/Utils/AliasMethod.hpp +5 -4
  353. data/vendor/tomotopy/src/Utils/Dictionary.h +2 -2
  354. data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +36 -1
  355. data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +1 -1
  356. data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +1 -1
  357. data/vendor/tomotopy/src/Utils/exception.h +6 -0
  358. data/vendor/tomotopy/src/Utils/math.h +2 -2
  359. data/vendor/tomotopy/src/Utils/sample.hpp +14 -12
  360. data/vendor/tomotopy/src/Utils/serializer.hpp +30 -5
  361. data/vendor/tomotopy/src/Utils/sse_gamma.h +0 -3
  362. metadata +64 -18
  363. data/vendor/eigen/Eigen/CMakeLists.txt +0 -19
  364. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -674
  365. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
  366. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
  367. data/vendor/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
  368. data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
  369. data/vendor/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
@@ -2,10 +2,10 @@
2
2
  * @file MorePacketMath.h
3
3
  * @author bab2min (bab2min@gmail.com)
4
4
  * @brief
5
- * @version 0.3.0
6
- * @date 2020-10-07
5
+ * @version 0.3.3
6
+ * @date 2021-03-31
7
7
  *
8
- * @copyright Copyright (c) 2020
8
+ * @copyright Copyright (c) 2020-2021
9
9
  *
10
10
  */
11
11
 
@@ -14,14 +14,26 @@
14
14
 
15
15
  #include <Eigen/Dense>
16
16
 
17
+ #define EIGENRAND_PRINT_PACKET(p) do { using _MTy = typename std::remove_const<typename std::remove_reference<decltype(p)>::type>::type; typename std::conditional<Eigen::internal::IsFloatPacket<_MTy>::value, float, typename std::conditional<Eigen::internal::IsDoublePacket<_MTy>::value, double, int>::type>::type f[4]; Eigen::internal::pstore(f, p); std::cout << #p " " << f[0] << " " << f[1] << " " << f[2] << " " << f[3] << std::endl; } while(0)
18
+
17
19
  namespace Eigen
18
20
  {
19
21
  namespace internal
20
22
  {
23
+ template<typename Ty>
24
+ struct IsIntPacket : std::false_type {};
25
+
26
+ template<typename Ty>
27
+ struct IsFloatPacket : std::false_type {};
28
+
29
+ template<typename Ty>
30
+ struct IsDoublePacket : std::false_type {};
31
+
32
+ template<typename Ty>
33
+ struct HalfPacket;
34
+
21
35
  template<typename Packet>
22
- struct reinterpreter
23
- {
24
- };
36
+ struct reinterpreter{};
25
37
 
26
38
  template<typename Packet>
27
39
  inline auto reinterpret_to_float(const Packet& x)
@@ -44,13 +56,40 @@ namespace Eigen
44
56
  return reinterpreter<Packet>{}.to_int(x);
45
57
  }
46
58
 
59
+ template<typename Packet>
60
+ EIGEN_STRONG_INLINE void split_two(const Packet& p, typename HalfPacket<Packet>::type& a, typename HalfPacket<Packet>::type& b);
61
+
47
62
  template<typename Packet>
48
63
  EIGEN_STRONG_INLINE Packet pseti64(uint64_t a);
49
64
 
65
+ template<typename Packet>
66
+ EIGEN_STRONG_INLINE Packet padd64(const Packet& a, const Packet& b);
67
+
68
+ template<typename Packet>
69
+ EIGEN_STRONG_INLINE Packet psub64(const Packet& a, const Packet& b);
70
+
71
+ template <typename SrcPacket, typename TgtPacket>
72
+ EIGEN_STRONG_INLINE TgtPacket pcast64(const SrcPacket& a);
73
+
50
74
  template<typename Packet>
51
75
  EIGEN_STRONG_INLINE Packet pcmpeq(const Packet& a, const Packet& b);
52
76
 
53
77
  template<typename Packet>
78
+ struct BitShifter {};
79
+
80
+ template<int b, typename Packet>
81
+ EIGEN_STRONG_INLINE Packet psll(const Packet& a);
82
+
83
+ template<int _b, typename Packet>
84
+ EIGEN_STRONG_INLINE Packet psrl(const Packet& a, int b = _b);
85
+
86
+ template<int b, typename Packet>
87
+ EIGEN_STRONG_INLINE Packet psll64(const Packet& a);
88
+
89
+ template<int b, typename Packet>
90
+ EIGEN_STRONG_INLINE Packet psrl64(const Packet& a);
91
+
92
+ /*template<typename Packet>
54
93
  EIGEN_STRONG_INLINE Packet psll(const Packet& a, int b);
55
94
 
56
95
  template<typename Packet>
@@ -60,12 +99,34 @@ namespace Eigen
60
99
  EIGEN_STRONG_INLINE Packet psll64(const Packet& a, int b);
61
100
 
62
101
  template<typename Packet>
63
- EIGEN_STRONG_INLINE Packet psrl64(const Packet& a, int b);
102
+ EIGEN_STRONG_INLINE Packet psrl64(const Packet& a, int b);*/
64
103
 
65
104
  template<typename Packet>
66
105
  EIGEN_STRONG_INLINE int pmovemask(const Packet& a);
67
106
 
68
- template<>
107
+ template<typename Packet>
108
+ EIGEN_STRONG_INLINE typename std::enable_if<
109
+ IsFloatPacket<Packet>::value, Packet
110
+ >::type pext_sign(const Packet& a)
111
+ {
112
+ using IntPacket = decltype(reinterpret_to_int(a));
113
+ return reinterpret_to_float(
114
+ pand(reinterpret_to_int(a), pset1<IntPacket>(0x80000000))
115
+ );
116
+ }
117
+
118
+ template<typename Packet>
119
+ EIGEN_STRONG_INLINE typename std::enable_if<
120
+ IsDoublePacket<Packet>::value, Packet
121
+ >::type pext_sign(const Packet& a)
122
+ {
123
+ using IntPacket = decltype(reinterpret_to_int(a));
124
+ return reinterpret_to_double(
125
+ pand(reinterpret_to_int(a), pseti64<IntPacket>(0x8000000000000000))
126
+ );
127
+ }
128
+
129
+ /*template<>
69
130
  EIGEN_STRONG_INLINE uint64_t psll64<uint64_t>(const uint64_t& a, int b)
70
131
  {
71
132
  return a << b;
@@ -75,109 +136,11 @@ namespace Eigen
75
136
  EIGEN_STRONG_INLINE uint64_t psrl64<uint64_t>(const uint64_t& a, int b)
76
137
  {
77
138
  return a >> b;
78
- }
79
-
80
- template<typename Packet>
81
- EIGEN_STRONG_INLINE void psincos(Packet x, Packet &s, Packet &c)
82
- {
83
- Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
84
- using IntPacket = decltype(reinterpret_to_int(x));
85
- IntPacket emm0, emm2, emm4;
86
-
87
- sign_bit_sin = x;
88
- /* take the absolute value */
89
- x = pabs(x);
90
- /* extract the sign bit (upper one) */
91
- sign_bit_sin = reinterpret_to_float(
92
- pand(reinterpret_to_int(sign_bit_sin), pset1<IntPacket>(0x80000000))
93
- );
94
-
95
- /* scale by 4/Pi */
96
- y = pmul(x, pset1<Packet>(1.27323954473516));
97
-
98
- /* store the integer part of y in emm2 */
99
- emm2 = pcast<Packet, IntPacket>(y);
100
-
101
- /* j=(j+1) & (~1) (see the cephes sources) */
102
- emm2 = padd(emm2, pset1<IntPacket>(1));
103
- emm2 = pand(emm2, pset1<IntPacket>(~1));
104
- y = pcast<IntPacket, Packet>(emm2);
105
-
106
- emm4 = emm2;
107
-
108
- /* get the swap sign flag for the sine */
109
- emm0 = pand(emm2, pset1<IntPacket>(4));
110
- emm0 = psll(emm0, 29);
111
- Packet swap_sign_bit_sin = reinterpret_to_float(emm0);
112
-
113
- /* get the polynom selection mask for the sine*/
114
- emm2 = pand(emm2, pset1<IntPacket>(2));
115
-
116
- emm2 = pcmpeq(emm2, pset1<IntPacket>(0));
117
- Packet poly_mask = reinterpret_to_float(emm2);
118
-
119
- /* The magic pass: "Extended precision modular arithmetic"
120
- x = ((x - y * DP1) - y * DP2) - y * DP3; */
121
- xmm1 = pset1<Packet>(-0.78515625);
122
- xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
123
- xmm3 = pset1<Packet>(-3.77489497744594108e-8);
124
- xmm1 = pmul(y, xmm1);
125
- xmm2 = pmul(y, xmm2);
126
- xmm3 = pmul(y, xmm3);
127
- x = padd(x, xmm1);
128
- x = padd(x, xmm2);
129
- x = padd(x, xmm3);
130
-
131
- emm4 = psub(emm4, pset1<IntPacket>(2));
132
- emm4 = pandnot(emm4, pset1<IntPacket>(4));
133
- emm4 = psll(emm4, 29);
134
- Packet sign_bit_cos = reinterpret_to_float(emm4);
135
- sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
136
-
137
-
138
- /* Evaluate the first polynom (0 <= x <= Pi/4) */
139
- Packet z = pmul(x, x);
140
- y = pset1<Packet>(2.443315711809948E-005);
141
-
142
- y = pmul(y, z);
143
- y = padd(y, pset1<Packet>(-1.388731625493765E-003));
144
- y = pmul(y, z);
145
- y = padd(y, pset1<Packet>(4.166664568298827E-002));
146
- y = pmul(y, z);
147
- y = pmul(y, z);
148
- Packet tmp = pmul(z, pset1<Packet>(0.5));
149
- y = psub(y, tmp);
150
- y = padd(y, pset1<Packet>(1));
151
-
152
- /* Evaluate the second polynom (Pi/4 <= x <= 0) */
153
-
154
- Packet y2 = pset1<Packet>(-1.9515295891E-4);
155
- y2 = pmul(y2, z);
156
- y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
157
- y2 = pmul(y2, z);
158
- y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
159
- y2 = pmul(y2, z);
160
- y2 = pmul(y2, x);
161
- y2 = padd(y2, x);
162
-
163
- /* select the correct result from the two polynoms */
164
- xmm3 = poly_mask;
165
- Packet ysin2 = pand(xmm3, y2);
166
- Packet ysin1 = pandnot(xmm3, y);
167
- y2 = psub(y2, ysin2);
168
- y = psub(y, ysin1);
169
-
170
- xmm1 = padd(ysin1, ysin2);
171
- xmm2 = padd(y, y2);
172
-
173
- /* update the sign */
174
- s = pxor(xmm1, sign_bit_sin);
175
- c = pxor(xmm2, sign_bit_cos);
176
- }
139
+ }*/
177
140
 
178
141
  // approximation : lgamma(z) ~= (z+2.5)ln(z+3) - z - 3 + 0.5 ln (2pi) + 1/12/(z + 3) - ln (z(z+1)(z+2))
179
142
  template<typename Packet>
180
- EIGEN_STRONG_INLINE Packet plgamma(const Packet& x)
143
+ EIGEN_STRONG_INLINE Packet plgamma_approx(const Packet& x)
181
144
  {
182
145
  auto x_3 = padd(x, pset1<Packet>(3));
183
146
  auto ret = pmul(padd(x_3, pset1<Packet>(-0.5)), plog(x_3));
@@ -195,6 +158,9 @@ namespace Eigen
195
158
  template<typename Packet>
196
159
  EIGEN_STRONG_INLINE Packet pcmple(const Packet& a, const Packet& b);
197
160
 
161
+ template<typename Packet>
162
+ EIGEN_STRONG_INLINE Packet pbitnot(const Packet& a);
163
+
198
164
  template<typename PacketIf, typename Packet>
199
165
  EIGEN_STRONG_INLINE Packet pblendv(const PacketIf& ifPacket, const Packet& thenPacket, const Packet& elsePacket);
200
166
 
@@ -213,6 +179,9 @@ namespace Eigen
213
179
  template<typename Packet>
214
180
  EIGEN_STRONG_INLINE Packet pcmpeq64(const Packet& a, const Packet& b);
215
181
 
182
+ template<typename Packet>
183
+ EIGEN_STRONG_INLINE Packet pcmplt64(const Packet& a, const Packet& b);
184
+
216
185
  template<typename Packet>
217
186
  EIGEN_STRONG_INLINE Packet pmuluadd64(const Packet& a, uint64_t b, uint64_t c);
218
187
 
@@ -241,10 +210,10 @@ namespace Eigen
241
210
  }
242
211
 
243
212
  template<typename _Scalar>
244
- struct bit_scalar;
213
+ struct BitScalar;
245
214
 
246
215
  template<>
247
- struct bit_scalar<float>
216
+ struct BitScalar<float>
248
217
  {
249
218
  float to_ur(uint32_t x)
250
219
  {
@@ -264,7 +233,7 @@ namespace Eigen
264
233
  };
265
234
 
266
235
  template<>
267
- struct bit_scalar<double>
236
+ struct BitScalar<double>
268
237
  {
269
238
  double to_ur(uint64_t x)
270
239
  {
@@ -291,720 +260,359 @@ namespace Eigen
291
260
 
292
261
  EIGEN_STRONG_INLINE float2 bit_to_ur_float(uint64_t x)
293
262
  {
294
- bit_scalar<float> bs;
263
+ BitScalar<float> bs;
295
264
  float2 ret;
296
265
  ret.f[0] = bs.to_ur(x & 0xFFFFFFFF);
297
266
  ret.f[1] = bs.to_ur(x >> 32);
298
267
  return ret;
299
268
  }
300
- }
301
- }
302
-
303
- #ifdef EIGEN_VECTORIZE_AVX
304
- #include <immintrin.h>
305
269
 
306
- namespace Eigen
307
- {
308
- namespace internal
309
- {
310
- template<>
311
- struct reinterpreter<Packet8i>
270
+ template<typename Packet>
271
+ EIGEN_STRONG_INLINE typename std::enable_if<
272
+ IsFloatPacket<Packet>::value
273
+ >::type psincos(Packet x, Packet& s, Packet& c)
312
274
  {
313
- EIGEN_STRONG_INLINE Packet8f to_float(const Packet8i& x)
314
- {
315
- return _mm256_castsi256_ps(x);
316
- }
317
-
318
- EIGEN_STRONG_INLINE Packet4d to_double(const Packet8i& x)
319
- {
320
- return _mm256_castsi256_pd(x);
321
- }
275
+ Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
276
+ using IntPacket = decltype(reinterpret_to_int(x));
277
+ IntPacket emm0, emm2, emm4;
322
278
 
323
- EIGEN_STRONG_INLINE Packet8i to_int(const Packet8i& x)
324
- {
325
- return x;
326
- }
327
- };
279
+ sign_bit_sin = x;
280
+ /* take the absolute value */
281
+ x = pabs(x);
282
+ /* extract the sign bit (upper one) */
283
+ sign_bit_sin = pext_sign(sign_bit_sin);
328
284
 
329
- template<>
330
- struct reinterpreter<Packet8f>
331
- {
332
- EIGEN_STRONG_INLINE Packet8f to_float(const Packet8f& x)
333
- {
334
- return x;
335
- }
285
+ /* scale by 4/Pi */
286
+ y = pmul(x, pset1<Packet>(1.27323954473516));
336
287
 
337
- EIGEN_STRONG_INLINE Packet4d to_double(const Packet8f& x)
338
- {
339
- return _mm256_castps_pd(x);
340
- }
288
+ /* store the integer part of y in emm2 */
289
+ emm2 = pcast<Packet, IntPacket>(y);
341
290
 
342
- EIGEN_STRONG_INLINE Packet8i to_int(const Packet8f& x)
343
- {
344
- return _mm256_castps_si256(x);
345
- }
346
- };
291
+ /* j=(j+1) & (~1) (see the cephes sources) */
292
+ emm2 = padd(emm2, pset1<IntPacket>(1));
293
+ emm2 = pand(emm2, pset1<IntPacket>(~1));
294
+ y = pcast<IntPacket, Packet>(emm2);
347
295
 
348
- template<>
349
- struct reinterpreter<Packet4d>
350
- {
351
- EIGEN_STRONG_INLINE Packet8f to_float(const Packet4d& x)
352
- {
353
- return _mm256_castpd_ps(x);
354
- }
296
+ emm4 = emm2;
355
297
 
356
- EIGEN_STRONG_INLINE Packet4d to_double(const Packet4d& x)
357
- {
358
- return x;
359
- }
298
+ /* get the swap sign flag for the sine */
299
+ emm0 = pand(emm2, pset1<IntPacket>(4));
300
+ emm0 = psll<29>(emm0);
301
+ Packet swap_sign_bit_sin = reinterpret_to_float(emm0);
360
302
 
361
- EIGEN_STRONG_INLINE Packet8i to_int(const Packet4d& x)
362
- {
363
- return _mm256_castpd_si256(x);
364
- }
365
- };
303
+ /* get the polynom selection mask for the sine*/
304
+ emm2 = pand(emm2, pset1<IntPacket>(2));
366
305
 
367
- EIGEN_STRONG_INLINE void split_two(const Packet8i& x, Packet4i& a, Packet4i& b)
368
- {
369
- a = _mm256_extractf128_si256(x, 0);
370
- b = _mm256_extractf128_si256(x, 1);
371
- }
306
+ emm2 = pcmpeq(emm2, pset1<IntPacket>(0));
307
+ Packet poly_mask = reinterpret_to_float(emm2);
372
308
 
373
- EIGEN_STRONG_INLINE Packet8i combine_two(const Packet4i& a, const Packet4i& b)
374
- {
375
- return _mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1);
376
- }
309
+ /* The magic pass: "Extended precision modular arithmetic"
310
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
311
+ xmm1 = pset1<Packet>(-0.78515625);
312
+ xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
313
+ xmm3 = pset1<Packet>(-3.77489497744594108e-8);
314
+ xmm1 = pmul(y, xmm1);
315
+ xmm2 = pmul(y, xmm2);
316
+ xmm3 = pmul(y, xmm3);
317
+ x = padd(x, xmm1);
318
+ x = padd(x, xmm2);
319
+ x = padd(x, xmm3);
377
320
 
378
- EIGEN_STRONG_INLINE void split_two(const Packet8f& x, Packet4f& a, Packet4f& b)
379
- {
380
- a = _mm256_extractf128_ps(x, 0);
381
- b = _mm256_extractf128_ps(x, 1);
382
- }
321
+ emm4 = psub(emm4, pset1<IntPacket>(2));
322
+ #if defined(EIGEN_VECTORIZE_NEON) || defined(EIGENRAND_EIGEN_34_MODE)
323
+ emm4 = pandnot(pset1<IntPacket>(4), emm4);
324
+ #else
325
+ emm4 = pandnot(emm4, pset1<IntPacket>(4));
326
+ #endif
327
+ emm4 = psll<29>(emm4);
328
+ Packet sign_bit_cos = reinterpret_to_float(emm4);
329
+ sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
383
330
 
384
- EIGEN_STRONG_INLINE Packet8f combine_two(const Packet4f& a, const Packet4f& b)
385
- {
386
- return _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1);
387
- }
388
331
 
332
+ /* Evaluate the first polynom (0 <= x <= Pi/4) */
333
+ Packet z = pmul(x, x);
334
+ y = pset1<Packet>(2.443315711809948E-005);
389
335
 
390
- EIGEN_STRONG_INLINE Packet4i combine_low32(const Packet8i& a)
391
- {
392
- #ifdef EIGEN_VECTORIZE_AVX2
393
- return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)));
394
- #else
395
- auto sc = _mm256_permutevar_ps(_mm256_castsi256_ps(a), _mm256_setr_epi32(0, 2, 1, 3, 1, 3, 0, 2));
396
- return _mm_castps_si128(_mm_blend_ps(_mm256_extractf128_ps(sc, 0), _mm256_extractf128_ps(sc, 1), 0b1100));
397
- #endif
398
- }
336
+ y = pmul(y, z);
337
+ y = padd(y, pset1<Packet>(-1.388731625493765E-003));
338
+ y = pmul(y, z);
339
+ y = padd(y, pset1<Packet>(4.166664568298827E-002));
340
+ y = pmul(y, z);
341
+ y = pmul(y, z);
342
+ Packet tmp = pmul(z, pset1<Packet>(0.5));
343
+ y = psub(y, tmp);
344
+ y = padd(y, pset1<Packet>(1));
399
345
 
400
- template<>
401
- EIGEN_STRONG_INLINE Packet8i pseti64<Packet8i>(uint64_t a)
402
- {
403
- return _mm256_set1_epi64x(a);
404
- }
346
+ /* Evaluate the second polynom (Pi/4 <= x <= 0) */
405
347
 
406
- template<>
407
- EIGEN_STRONG_INLINE Packet8i pcmpeq<Packet8i>(const Packet8i& a, const Packet8i& b)
408
- {
409
- #ifdef EIGEN_VECTORIZE_AVX2
410
- return _mm256_cmpeq_epi32(a, b);
411
- #else
412
- Packet4i a1, a2, b1, b2;
413
- split_two(a, a1, a2);
414
- split_two(b, b1, b2);
415
- return combine_two((Packet4i)_mm_cmpeq_epi32(a1, b1), (Packet4i)_mm_cmpeq_epi32(a2, b2));
416
- #endif
417
- }
348
+ Packet y2 = pset1<Packet>(-1.9515295891E-4);
349
+ y2 = pmul(y2, z);
350
+ y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
351
+ y2 = pmul(y2, z);
352
+ y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
353
+ y2 = pmul(y2, z);
354
+ y2 = pmul(y2, x);
355
+ y2 = padd(y2, x);
418
356
 
419
- template<>
420
- EIGEN_STRONG_INLINE Packet8i psll<Packet8i>(const Packet8i& a, int b)
421
- {
422
- #ifdef EIGEN_VECTORIZE_AVX2
423
- return _mm256_slli_epi32(a, b);
424
- #else
425
- Packet4i a1, a2;
426
- split_two(a, a1, a2);
427
- return combine_two((Packet4i)_mm_slli_epi32(a1, b), (Packet4i)_mm_slli_epi32(a2, b));
428
- #endif
429
- }
357
+ /* select the correct result from the two polynoms */
358
+ xmm3 = poly_mask;
359
+ Packet ysin2 = pand(xmm3, y2);
360
+ #if defined(EIGEN_VECTORIZE_NEON) || defined(EIGENRAND_EIGEN_34_MODE)
361
+ Packet ysin1 = pandnot(y, xmm3);
362
+ #else
363
+ Packet ysin1 = pandnot(xmm3, y);
364
+ #endif
365
+ y2 = psub(y2, ysin2);
366
+ y = psub(y, ysin1);
430
367
 
431
- template<>
432
- EIGEN_STRONG_INLINE Packet8i psrl<Packet8i>(const Packet8i& a, int b)
433
- {
434
- #ifdef EIGEN_VECTORIZE_AVX2
435
- return _mm256_srli_epi32(a, b);
436
- #else
437
- Packet4i a1, a2;
438
- split_two(a, a1, a2);
439
- return combine_two((Packet4i)_mm_srli_epi32(a1, b), (Packet4i)_mm_srli_epi32(a2, b));
440
- #endif
441
- }
368
+ xmm1 = padd(ysin1, ysin2);
369
+ xmm2 = padd(y, y2);
442
370
 
443
- template<>
444
- EIGEN_STRONG_INLINE Packet8i psll64<Packet8i>(const Packet8i& a, int b)
445
- {
446
- #ifdef EIGEN_VECTORIZE_AVX2
447
- return _mm256_slli_epi64(a, b);
448
- #else
449
- Packet4i a1, a2;
450
- split_two(a, a1, a2);
451
- return combine_two((Packet4i)_mm_slli_epi64(a1, b), (Packet4i)_mm_slli_epi64(a2, b));
452
- #endif
453
- }
454
-
455
- template<>
456
- EIGEN_STRONG_INLINE Packet8i psrl64<Packet8i>(const Packet8i& a, int b)
457
- {
458
- #ifdef EIGEN_VECTORIZE_AVX2
459
- return _mm256_srli_epi64(a, b);
460
- #else
461
- Packet4i a1, a2;
462
- split_two(a, a1, a2);
463
- return combine_two((Packet4i)_mm_srli_epi64(a1, b), (Packet4i)_mm_srli_epi64(a2, b));
464
- #endif
465
- }
466
-
467
- template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b)
468
- {
469
- #ifdef EIGEN_VECTORIZE_AVX2
470
- return _mm256_add_epi32(a, b);
471
- #else
472
- Packet4i a1, a2, b1, b2;
473
- split_two(a, a1, a2);
474
- split_two(b, b1, b2);
475
- return combine_two((Packet4i)_mm_add_epi32(a1, b1), (Packet4i)_mm_add_epi32(a2, b2));
476
- #endif
477
- }
478
-
479
- template<> EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(const Packet8i& a, const Packet8i& b)
480
- {
481
- #ifdef EIGEN_VECTORIZE_AVX2
482
- return _mm256_sub_epi32(a, b);
483
- #else
484
- Packet4i a1, a2, b1, b2;
485
- split_two(a, a1, a2);
486
- split_two(b, b1, b2);
487
- return combine_two((Packet4i)_mm_sub_epi32(a1, b1), (Packet4i)_mm_sub_epi32(a2, b2));
488
- #endif
489
- }
490
-
491
- template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b)
492
- {
493
- #ifdef EIGEN_VECTORIZE_AVX2
494
- return _mm256_and_si256(a, b);
495
- #else
496
- return reinterpret_to_int((Packet8f)_mm256_and_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
497
- #endif
498
- }
499
-
500
- template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b)
501
- {
502
- #ifdef EIGEN_VECTORIZE_AVX2
503
- return _mm256_andnot_si256(a, b);
504
- #else
505
- return reinterpret_to_int((Packet8f)_mm256_andnot_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
506
- #endif
507
- }
508
-
509
- template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b)
510
- {
511
- #ifdef EIGEN_VECTORIZE_AVX2
512
- return _mm256_or_si256(a, b);
513
- #else
514
- return reinterpret_to_int((Packet8f)_mm256_or_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
515
- #endif
516
- }
517
-
518
- template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b)
519
- {
520
- #ifdef EIGEN_VECTORIZE_AVX2
521
- return _mm256_xor_si256(a, b);
522
- #else
523
- return reinterpret_to_int((Packet8f)_mm256_xor_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
524
- #endif
525
- }
526
-
527
- template<>
528
- EIGEN_STRONG_INLINE Packet8i pcmplt<Packet8i>(const Packet8i& a, const Packet8i& b)
529
- {
530
- #ifdef EIGEN_VECTORIZE_AVX2
531
- return _mm256_cmpgt_epi32(b, a);
532
- #else
533
- Packet4i a1, a2, b1, b2;
534
- split_two(a, a1, a2);
535
- split_two(b, b1, b2);
536
- return combine_two((Packet4i)_mm_cmpgt_epi32(b1, a1), (Packet4i)_mm_cmpgt_epi32(b2, a2));
537
- #endif
538
- }
539
-
540
- template<>
541
- EIGEN_STRONG_INLINE Packet8f pcmplt<Packet8f>(const Packet8f& a, const Packet8f& b)
542
- {
543
- return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
544
- }
545
-
546
- template<>
547
- EIGEN_STRONG_INLINE Packet8f pcmple<Packet8f>(const Packet8f& a, const Packet8f& b)
548
- {
549
- return _mm256_cmp_ps(a, b, _CMP_LE_OQ);
550
- }
551
-
552
- template<>
553
- EIGEN_STRONG_INLINE Packet4d pcmplt<Packet4d>(const Packet4d& a, const Packet4d& b)
554
- {
555
- return _mm256_cmp_pd(a, b, _CMP_LT_OQ);
556
- }
557
-
558
- template<>
559
- EIGEN_STRONG_INLINE Packet4d pcmple<Packet4d>(const Packet4d& a, const Packet4d& b)
560
- {
561
- return _mm256_cmp_pd(a, b, _CMP_LE_OQ);
562
- }
563
-
564
- template<>
565
- EIGEN_STRONG_INLINE Packet8f pblendv(const Packet8f& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket)
566
- {
567
- return _mm256_blendv_ps(elsePacket, thenPacket, ifPacket);
568
- }
569
-
570
- template<>
571
- EIGEN_STRONG_INLINE Packet8f pblendv(const Packet8i& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket)
572
- {
573
- return pblendv(_mm256_castsi256_ps(ifPacket), thenPacket, elsePacket);
574
- }
575
-
576
- template<>
577
- EIGEN_STRONG_INLINE Packet8i pblendv(const Packet8i& ifPacket, const Packet8i& thenPacket, const Packet8i& elsePacket)
578
- {
579
- return _mm256_castps_si256(_mm256_blendv_ps(
580
- _mm256_castsi256_ps(elsePacket),
581
- _mm256_castsi256_ps(thenPacket),
582
- _mm256_castsi256_ps(ifPacket)
583
- ));
584
- }
585
-
586
- template<>
587
- EIGEN_STRONG_INLINE Packet4d pblendv(const Packet4d& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket)
588
- {
589
- return _mm256_blendv_pd(elsePacket, thenPacket, ifPacket);
590
- }
591
-
592
- template<>
593
- EIGEN_STRONG_INLINE Packet4d pblendv(const Packet8i& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket)
594
- {
595
- return pblendv(_mm256_castsi256_pd(ifPacket), thenPacket, elsePacket);
596
- }
597
-
598
- template<>
599
- EIGEN_STRONG_INLINE Packet8i pgather<Packet8i>(const int* addr, const Packet8i& index)
600
- {
601
- #ifdef EIGEN_VECTORIZE_AVX2
602
- return _mm256_i32gather_epi32(addr, index, 4);
603
- #else
604
- uint32_t u[8];
605
- _mm256_storeu_si256((Packet8i*)u, index);
606
- return _mm256_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
607
- addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
608
- #endif
609
- }
610
-
611
- template<>
612
- EIGEN_STRONG_INLINE Packet8f pgather<Packet8i>(const float *addr, const Packet8i& index)
613
- {
614
- #ifdef EIGEN_VECTORIZE_AVX2
615
- return _mm256_i32gather_ps(addr, index, 4);
616
- #else
617
- uint32_t u[8];
618
- _mm256_storeu_si256((Packet8i*)u, index);
619
- return _mm256_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
620
- addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
621
- #endif
622
- }
623
-
624
- template<>
625
- EIGEN_STRONG_INLINE Packet4d pgather<Packet8i>(const double *addr, const Packet8i& index, bool upperhalf)
626
- {
627
- #ifdef EIGEN_VECTORIZE_AVX2
628
- return _mm256_i32gather_pd(addr, _mm256_castsi256_si128(index), 8);
629
- #else
630
- uint32_t u[8];
631
- _mm256_storeu_si256((Packet8i*)u, index);
632
- if (upperhalf)
633
- {
634
- return _mm256_setr_pd(addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
635
- }
636
- else
637
- {
638
- return _mm256_setr_pd(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
639
- }
640
- #endif
371
+ /* update the sign */
372
+ s = pxor(xmm1, sign_bit_sin);
373
+ c = pxor(xmm2, sign_bit_cos);
641
374
  }
642
375
 
643
- template<>
644
- EIGEN_STRONG_INLINE int pmovemask<Packet8f>(const Packet8f& a)
376
+ template<typename Packet>
377
+ EIGEN_STRONG_INLINE typename std::enable_if<
378
+ IsDoublePacket<Packet>::value
379
+ >::type psincos(Packet x, Packet& s, Packet& c)
645
380
  {
646
- return _mm256_movemask_ps(a);
647
- }
381
+ Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
382
+ using IntPacket = decltype(reinterpret_to_int(x));
383
+ IntPacket emm0, emm2, emm4;
648
384
 
649
- template<>
650
- EIGEN_STRONG_INLINE int pmovemask<Packet4d>(const Packet4d& a)
651
- {
652
- return _mm256_movemask_pd(a);
653
- }
385
+ sign_bit_sin = x;
386
+ /* take the absolute value */
387
+ x = pabs(x);
388
+ /* extract the sign bit (upper one) */
389
+ sign_bit_sin = pext_sign(sign_bit_sin);
654
390
 
655
- template<>
656
- EIGEN_STRONG_INLINE int pmovemask<Packet8i>(const Packet8i& a)
657
- {
658
- return pmovemask(_mm256_castsi256_ps(a));
659
- }
391
+ /* scale by 4/Pi */
392
+ y = pmul(x, pset1<Packet>(1.27323954473516));
660
393
 
661
- template<>
662
- EIGEN_STRONG_INLINE Packet8f ptruncate<Packet8f>(const Packet8f& a)
663
- {
664
- return _mm256_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
665
- }
394
+ /* store the integer part of y in emm2 */
395
+ emm2 = pcast64<Packet, IntPacket>(y);
666
396
 
667
- template<>
668
- EIGEN_STRONG_INLINE Packet4d ptruncate<Packet4d>(const Packet4d& a)
669
- {
670
- return _mm256_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
671
- }
397
+ /* j=(j+1) & (~1) (see the cephes sources) */
398
+ emm2 = padd64(emm2, pseti64<IntPacket>(1));
399
+ emm2 = pand(emm2, pseti64<IntPacket>(~1ll));
400
+ y = pcast64<IntPacket, Packet>(emm2);
672
401
 
673
- template<>
674
- EIGEN_STRONG_INLINE Packet8i pcmpeq64<Packet8i>(const Packet8i& a, const Packet8i& b)
675
- {
676
- #ifdef EIGEN_VECTORIZE_AVX2
677
- return _mm256_cmpeq_epi64(a, b);
678
- #else
679
- Packet4i a1, a2, b1, b2;
680
- split_two(a, a1, a2);
681
- split_two(b, b1, b2);
682
- return combine_two((Packet4i)_mm_cmpeq_epi64(a1, b1), (Packet4i)_mm_cmpeq_epi64(a2, b2));
683
- #endif
684
- }
402
+ emm4 = emm2;
685
403
 
686
- template<>
687
- EIGEN_STRONG_INLINE Packet8i pmuluadd64<Packet8i>(const Packet8i& a, uint64_t b, uint64_t c)
688
- {
689
- uint64_t u[4];
690
- _mm256_storeu_si256((__m256i*)u, a);
691
- u[0] = u[0] * b + c;
692
- u[1] = u[1] * b + c;
693
- u[2] = u[2] * b + c;
694
- u[3] = u[3] * b + c;
695
- return _mm256_loadu_si256((__m256i*)u);
696
- }
697
- }
698
- }
699
- #endif
404
+ /* get the swap sign flag for the sine */
405
+ emm0 = pand(emm2, pseti64<IntPacket>(4));
406
+ emm0 = psll64<61>(emm0);
407
+ Packet swap_sign_bit_sin = reinterpret_to_double(emm0);
700
408
 
701
- #ifdef EIGEN_VECTORIZE_SSE2
702
- #include <xmmintrin.h>
409
+ /* get the polynom selection mask for the sine*/
410
+ emm2 = pand(emm2, pseti64<IntPacket>(2));
703
411
 
704
- namespace Eigen
705
- {
706
- namespace internal
707
- {
708
- template<>
709
- struct reinterpreter<Packet4i>
710
- {
711
- EIGEN_STRONG_INLINE Packet4f to_float(const Packet4i& x)
712
- {
713
- return _mm_castsi128_ps(x);
714
- }
412
+ emm2 = pcmpeq64(emm2, pseti64<IntPacket>(0));
413
+ Packet poly_mask = reinterpret_to_double(emm2);
715
414
 
716
- EIGEN_STRONG_INLINE Packet2d to_double(const Packet4i& x)
717
- {
718
- return _mm_castsi128_pd(x);
719
- }
415
+ /* The magic pass: "Extended precision modular arithmetic"
416
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
417
+ xmm1 = pset1<Packet>(-0.78515625);
418
+ xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
419
+ xmm3 = pset1<Packet>(-3.77489497744594108e-8);
420
+ xmm1 = pmul(y, xmm1);
421
+ xmm2 = pmul(y, xmm2);
422
+ xmm3 = pmul(y, xmm3);
423
+ x = padd(x, xmm1);
424
+ x = padd(x, xmm2);
425
+ x = padd(x, xmm3);
720
426
 
721
- EIGEN_STRONG_INLINE Packet4i to_int(const Packet4i& x)
722
- {
723
- return x;
724
- }
725
- };
427
+ emm4 = psub64(emm4, pseti64<IntPacket>(2));
428
+ #if defined(EIGEN_VECTORIZE_NEON) || defined(EIGENRAND_EIGEN_34_MODE)
429
+ emm4 = pandnot(pseti64<IntPacket>(4), emm4);
430
+ #else
431
+ emm4 = pandnot(emm4, pseti64<IntPacket>(4));
432
+ #endif
433
+ emm4 = psll64<61>(emm4);
434
+ Packet sign_bit_cos = reinterpret_to_double(emm4);
435
+ sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
726
436
 
727
- template<>
728
- struct reinterpreter<Packet4f>
729
- {
730
- EIGEN_STRONG_INLINE Packet4f to_float(const Packet4f& x)
731
- {
732
- return x;
733
- }
734
437
 
735
- EIGEN_STRONG_INLINE Packet2d to_double(const Packet4f& x)
736
- {
737
- return _mm_castps_pd(x);
738
- }
438
+ /* Evaluate the first polynom (0 <= x <= Pi/4) */
439
+ Packet z = pmul(x, x);
440
+ y = pset1<Packet>(2.443315711809948E-005);
739
441
 
740
- EIGEN_STRONG_INLINE Packet4i to_int(const Packet4f& x)
741
- {
742
- return _mm_castps_si128(x);
743
- }
744
- };
442
+ y = pmul(y, z);
443
+ y = padd(y, pset1<Packet>(-1.388731625493765E-003));
444
+ y = pmul(y, z);
445
+ y = padd(y, pset1<Packet>(4.166664568298827E-002));
446
+ y = pmul(y, z);
447
+ y = pmul(y, z);
448
+ Packet tmp = pmul(z, pset1<Packet>(0.5));
449
+ y = psub(y, tmp);
450
+ y = padd(y, pset1<Packet>(1));
745
451
 
746
- template<>
747
- struct reinterpreter<Packet2d>
748
- {
749
- EIGEN_STRONG_INLINE Packet4f to_float(const Packet2d& x)
750
- {
751
- return _mm_castpd_ps(x);
752
- }
452
+ /* Evaluate the second polynom (Pi/4 <= x <= 0) */
753
453
 
754
- EIGEN_STRONG_INLINE Packet2d to_double(const Packet2d& x)
755
- {
756
- return x;
757
- }
454
+ Packet y2 = pset1<Packet>(-1.9515295891E-4);
455
+ y2 = pmul(y2, z);
456
+ y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
457
+ y2 = pmul(y2, z);
458
+ y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
459
+ y2 = pmul(y2, z);
460
+ y2 = pmul(y2, x);
461
+ y2 = padd(y2, x);
758
462
 
759
- EIGEN_STRONG_INLINE Packet4i to_int(const Packet2d& x)
760
- {
761
- return _mm_castpd_si128(x);
762
- }
763
- };
463
+ /* select the correct result from the two polynoms */
464
+ xmm3 = poly_mask;
465
+ Packet ysin2 = pand(xmm3, y2);
466
+ #if defined(EIGEN_VECTORIZE_NEON) || defined(EIGENRAND_EIGEN_34_MODE)
467
+ Packet ysin1 = pandnot(y, xmm3);
468
+ #else
469
+ Packet ysin1 = pandnot(xmm3, y);
470
+ #endif
471
+ y2 = psub(y2, ysin2);
472
+ y = psub(y, ysin1);
764
473
 
765
- EIGEN_STRONG_INLINE void split_two(const Packet4i& x, uint64_t& a, uint64_t& b)
766
- {
767
- #ifdef EIGEN_VECTORIZE_SSE4_1
768
- a = _mm_extract_epi64(x, 0);
769
- b = _mm_extract_epi64(x, 1);
770
- #else
771
- uint64_t u[2];
772
- _mm_storeu_si128((__m128i*)u, x);
773
- a = u[0];
774
- b = u[1];
775
- #endif
776
- }
474
+ xmm1 = padd(ysin1, ysin2);
475
+ xmm2 = padd(y, y2);
777
476
 
778
- EIGEN_STRONG_INLINE Packet4i combine_low32(const Packet4i& a, const Packet4i& b)
779
- {
780
- auto sa = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
781
- auto sb = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 3, 1));
782
- sa = _mm_and_si128(sa, _mm_setr_epi32(-1, -1, 0, 0));
783
- sb = _mm_and_si128(sb, _mm_setr_epi32(0, 0, -1, -1));
784
- return _mm_or_si128(sa, sb);
477
+ /* update the sign */
478
+ s = pxor(xmm1, sign_bit_sin);
479
+ c = pxor(xmm2, sign_bit_cos);
785
480
  }
786
481
 
787
- template<>
788
- EIGEN_STRONG_INLINE Packet4i pseti64<Packet4i>(uint64_t a)
482
+ template<typename Packet>
483
+ EIGEN_STRONG_INLINE typename std::enable_if<
484
+ IsDoublePacket<Packet>::value, Packet
485
+ >::type _psin(Packet x)
789
486
  {
790
- return _mm_set1_epi64x(a);
791
- }
487
+ Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
488
+ using IntPacket = decltype(reinterpret_to_int(x));
489
+ IntPacket emm0, emm2;
792
490
 
793
- template<>
794
- EIGEN_STRONG_INLINE Packet4i pcmpeq<Packet4i>(const Packet4i& a, const Packet4i& b)
795
- {
796
- return _mm_cmpeq_epi32(a, b);
797
- }
491
+ sign_bit_sin = x;
492
+ /* take the absolute value */
493
+ x = pabs(x);
494
+ /* extract the sign bit (upper one) */
495
+ sign_bit_sin = pext_sign(sign_bit_sin);
798
496
 
799
- template<>
800
- EIGEN_STRONG_INLINE Packet4i psll<Packet4i>(const Packet4i& a, int b)
801
- {
802
- return _mm_slli_epi32(a, b);
803
- }
497
+ /* scale by 4/Pi */
498
+ y = pmul(x, pset1<Packet>(1.27323954473516));
804
499
 
805
- template<>
806
- EIGEN_STRONG_INLINE Packet4i psrl<Packet4i>(const Packet4i& a, int b)
807
- {
808
- return _mm_srli_epi32(a, b);
809
- }
500
+ /* store the integer part of y in emm2 */
501
+ emm2 = pcast64<Packet, IntPacket>(y);
810
502
 
503
+ /* j=(j+1) & (~1) (see the cephes sources) */
504
+ emm2 = padd64(emm2, pseti64<IntPacket>(1));
505
+ emm2 = pand(emm2, pseti64<IntPacket>(~1ll));
506
+ y = pcast64<IntPacket, Packet>(emm2);
811
507
 
812
- template<>
813
- EIGEN_STRONG_INLINE Packet4i psll64<Packet4i>(const Packet4i& a, int b)
814
- {
815
- return _mm_slli_epi64(a, b);
816
- }
508
+ /* get the swap sign flag for the sine */
509
+ emm0 = pand(emm2, pseti64<IntPacket>(4));
510
+ emm0 = psll64<61>(emm0);
511
+ Packet swap_sign_bit_sin = reinterpret_to_double(emm0);
817
512
 
818
- template<>
819
- EIGEN_STRONG_INLINE Packet4i psrl64<Packet4i>(const Packet4i& a, int b)
820
- {
821
- return _mm_srli_epi64(a, b);
822
- }
513
+ /* get the polynom selection mask for the sine*/
514
+ emm2 = pand(emm2, pseti64<IntPacket>(2));
823
515
 
824
- template<>
825
- EIGEN_STRONG_INLINE Packet4i pcmplt<Packet4i>(const Packet4i& a, const Packet4i& b)
826
- {
827
- return _mm_cmplt_epi32(a, b);
828
- }
516
+ emm2 = pcmpeq64(emm2, pseti64<IntPacket>(0));
517
+ Packet poly_mask = reinterpret_to_double(emm2);
829
518
 
830
- template<>
831
- EIGEN_STRONG_INLINE Packet4f pcmplt<Packet4f>(const Packet4f& a, const Packet4f& b)
832
- {
833
- return _mm_cmplt_ps(a, b);
834
- }
519
+ /* The magic pass: "Extended precision modular arithmetic"
520
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
521
+ xmm1 = pset1<Packet>(-0.78515625);
522
+ xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
523
+ xmm3 = pset1<Packet>(-3.77489497744594108e-8);
524
+ xmm1 = pmul(y, xmm1);
525
+ xmm2 = pmul(y, xmm2);
526
+ xmm3 = pmul(y, xmm3);
527
+ x = padd(x, xmm1);
528
+ x = padd(x, xmm2);
529
+ x = padd(x, xmm3);
835
530
 
836
- template<>
837
- EIGEN_STRONG_INLINE Packet4f pcmple<Packet4f>(const Packet4f& a, const Packet4f& b)
838
- {
839
- return _mm_cmple_ps(a, b);
840
- }
531
+ sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
841
532
 
842
- template<>
843
- EIGEN_STRONG_INLINE Packet2d pcmplt<Packet2d>(const Packet2d& a, const Packet2d& b)
844
- {
845
- return _mm_cmplt_pd(a, b);
846
- }
847
533
 
848
- template<>
849
- EIGEN_STRONG_INLINE Packet2d pcmple<Packet2d>(const Packet2d& a, const Packet2d& b)
850
- {
851
- return _mm_cmple_pd(a, b);
852
- }
534
+ /* Evaluate the first polynom (0 <= x <= Pi/4) */
535
+ Packet z = pmul(x, x);
536
+ y = pset1<Packet>(2.443315711809948E-005);
853
537
 
854
- template<>
855
- EIGEN_STRONG_INLINE Packet4f pblendv(const Packet4f& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket)
856
- {
857
- #ifdef EIGEN_VECTORIZE_SSE4_1
858
- return _mm_blendv_ps(elsePacket, thenPacket, ifPacket);
859
- #else
860
- return _mm_or_ps(_mm_and_ps(ifPacket, thenPacket), _mm_andnot_ps(ifPacket, elsePacket));
861
- #endif
862
- }
538
+ y = pmul(y, z);
539
+ y = padd(y, pset1<Packet>(-1.388731625493765E-003));
540
+ y = pmul(y, z);
541
+ y = padd(y, pset1<Packet>(4.166664568298827E-002));
542
+ y = pmul(y, z);
543
+ y = pmul(y, z);
544
+ Packet tmp = pmul(z, pset1<Packet>(0.5));
545
+ y = psub(y, tmp);
546
+ y = padd(y, pset1<Packet>(1));
863
547
 
864
- template<>
865
- EIGEN_STRONG_INLINE Packet4f pblendv(const Packet4i& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket)
866
- {
867
- return pblendv(_mm_castsi128_ps(ifPacket), thenPacket, elsePacket);
868
- }
548
+ /* Evaluate the second polynom (Pi/4 <= x <= 0) */
869
549
 
870
- template<>
871
- EIGEN_STRONG_INLINE Packet4i pblendv(const Packet4i& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket)
872
- {
873
- #ifdef EIGEN_VECTORIZE_SSE4_1
874
- return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(elsePacket), _mm_castsi128_ps(thenPacket), _mm_castsi128_ps(ifPacket)));
875
- #else
876
- return _mm_or_si128(_mm_and_si128(ifPacket, thenPacket), _mm_andnot_si128(ifPacket, elsePacket));
877
- #endif
878
- }
550
+ Packet y2 = pset1<Packet>(-1.9515295891E-4);
551
+ y2 = pmul(y2, z);
552
+ y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
553
+ y2 = pmul(y2, z);
554
+ y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
555
+ y2 = pmul(y2, z);
556
+ y2 = pmul(y2, x);
557
+ y2 = padd(y2, x);
879
558
 
880
- template<>
881
- EIGEN_STRONG_INLINE Packet2d pblendv(const Packet2d& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket)
882
- {
883
- #ifdef EIGEN_VECTORIZE_SSE4_1
884
- return _mm_blendv_pd(elsePacket, thenPacket, ifPacket);
885
- #else
886
- return _mm_or_pd(_mm_and_pd(ifPacket, thenPacket), _mm_andnot_pd(ifPacket, elsePacket));
887
- #endif
888
- }
559
+ /* select the correct result from the two polynoms */
560
+ xmm3 = poly_mask;
561
+ Packet ysin2 = pand(xmm3, y2);
562
+ #if defined(EIGEN_VECTORIZE_NEON) || defined(EIGENRAND_EIGEN_34_MODE)
563
+ Packet ysin1 = pandnot(y, xmm3);
564
+ #else
565
+ Packet ysin1 = pandnot(xmm3, y);
566
+ #endif
889
567
 
568
+ xmm1 = padd(ysin1, ysin2);
890
569
 
891
- template<>
892
- EIGEN_STRONG_INLINE Packet2d pblendv(const Packet4i& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket)
893
- {
894
- return pblendv(_mm_castsi128_pd(ifPacket), thenPacket, elsePacket);
570
+ /* update the sign */
571
+ return pxor(xmm1, sign_bit_sin);
895
572
  }
573
+ }
574
+ }
896
575
 
897
- template<>
898
- EIGEN_STRONG_INLINE Packet4i pgather<Packet4i>(const int* addr, const Packet4i& index)
899
- {
900
- #ifdef EIGEN_VECTORIZE_AVX2
901
- return _mm_i32gather_epi32(addr, index, 4);
902
- #else
903
- uint32_t u[4];
904
- _mm_storeu_si128((__m128i*)u, index);
905
- return _mm_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
576
+ #ifdef EIGEN_VECTORIZE_AVX
577
+ #include "arch/AVX/MorePacketMath.h"
906
578
  #endif
907
- }
908
579
 
909
- template<>
910
- EIGEN_STRONG_INLINE Packet4f pgather<Packet4i>(const float* addr, const Packet4i& index)
911
- {
912
- #ifdef EIGEN_VECTORIZE_AVX2
913
- return _mm_i32gather_ps(addr, index, 4);
914
- #else
915
- uint32_t u[4];
916
- _mm_storeu_si128((__m128i*)u, index);
917
- return _mm_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
580
+ #ifdef EIGEN_VECTORIZE_SSE2
581
+ #include "arch/SSE/MorePacketMath.h"
918
582
  #endif
919
- }
920
583
 
921
- template<>
922
- EIGEN_STRONG_INLINE Packet2d pgather<Packet4i>(const double* addr, const Packet4i& index, bool upperhalf)
923
- {
924
- #ifdef EIGEN_VECTORIZE_AVX2
925
- return _mm_i32gather_pd(addr, index, 8);
926
- #else
927
- uint32_t u[4];
928
- _mm_storeu_si128((__m128i*)u, index);
929
- if (upperhalf)
930
- {
931
- return _mm_setr_pd(addr[u[2]], addr[u[3]]);
932
- }
933
- else
934
- {
935
- return _mm_setr_pd(addr[u[0]], addr[u[1]]);
936
- }
584
+ #ifdef EIGEN_VECTORIZE_NEON
585
+ #include "arch/NEON/MorePacketMath.h"
937
586
  #endif
938
- }
939
-
940
- template<>
941
- EIGEN_STRONG_INLINE int pmovemask<Packet4f>(const Packet4f& a)
942
- {
943
- return _mm_movemask_ps(a);
944
- }
945
-
946
- template<>
947
- EIGEN_STRONG_INLINE int pmovemask<Packet2d>(const Packet2d& a)
948
- {
949
- return _mm_movemask_pd(a);
950
- }
951
587
 
952
- template<>
953
- EIGEN_STRONG_INLINE int pmovemask<Packet4i>(const Packet4i& a)
954
- {
955
- return pmovemask((Packet4f)_mm_castsi128_ps(a));
956
- }
957
-
958
- template<>
959
- EIGEN_STRONG_INLINE Packet4f ptruncate<Packet4f>(const Packet4f& a)
588
+ namespace Eigen
589
+ {
590
+ namespace internal
591
+ {
592
+ template<int b, typename Packet>
593
+ EIGEN_STRONG_INLINE Packet psll(const Packet& a)
960
594
  {
961
- #ifdef EIGEN_VECTORIZE_SSE4_1
962
- return _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
963
- #else
964
- auto round = _MM_GET_ROUNDING_MODE();
965
- _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
966
- auto ret = _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
967
- _MM_SET_ROUNDING_MODE(round);
968
- return ret;
969
- #endif
595
+ return BitShifter<Packet>{}.template sll<b>(a);
970
596
  }
971
597
 
972
- template<>
973
- EIGEN_STRONG_INLINE Packet2d ptruncate<Packet2d>(const Packet2d& a)
598
+ template<int _b, typename Packet>
599
+ EIGEN_STRONG_INLINE Packet psrl(const Packet& a, int b)
974
600
  {
975
- #ifdef EIGEN_VECTORIZE_SSE4_1
976
- return _mm_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
977
- #else
978
- auto round = _MM_GET_ROUNDING_MODE();
979
- _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
980
- auto ret = _mm_cvtepi32_pd(_mm_cvtpd_epi32(a));
981
- _MM_SET_ROUNDING_MODE(round);
982
- return ret;
983
- #endif
601
+ return BitShifter<Packet>{}.template srl<_b>(a, b);
984
602
  }
985
603
 
986
- template<>
987
- EIGEN_STRONG_INLINE Packet4i pcmpeq64<Packet4i>(const Packet4i& a, const Packet4i& b)
604
+ template<int b, typename Packet>
605
+ EIGEN_STRONG_INLINE Packet psll64(const Packet& a)
988
606
  {
989
- #ifdef EIGEN_VECTORIZE_SSE4_1
990
- return _mm_cmpeq_epi64(a, b);
991
- #else
992
- Packet4i c = _mm_cmpeq_epi32(a, b);
993
- return pand(c, (Packet4i)_mm_shuffle_epi32(c, _MM_SHUFFLE(2, 3, 0, 1)));
994
- #endif
607
+ return BitShifter<Packet>{}.template sll64<b>(a);
995
608
  }
996
609
 
997
- template<>
998
- EIGEN_STRONG_INLINE Packet4i pmuluadd64<Packet4i>(const Packet4i& a, uint64_t b, uint64_t c)
610
+ template<int b, typename Packet>
611
+ EIGEN_STRONG_INLINE Packet psrl64(const Packet& a)
999
612
  {
1000
- uint64_t u[2];
1001
- _mm_storeu_si128((__m128i*)u, a);
1002
- u[0] = u[0] * b + c;
1003
- u[1] = u[1] * b + c;
1004
- return _mm_loadu_si128((__m128i*)u);
613
+ return BitShifter<Packet>{}.template srl64<b>(a);
1005
614
  }
1006
615
  }
1007
616
  }
1008
- #endif
1009
617
 
1010
- #endif
618
+ #endif