tomoto 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (420) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +123 -0
  5. data/ext/tomoto/ext.cpp +245 -0
  6. data/ext/tomoto/extconf.rb +28 -0
  7. data/lib/tomoto.rb +12 -0
  8. data/lib/tomoto/ct.rb +11 -0
  9. data/lib/tomoto/hdp.rb +11 -0
  10. data/lib/tomoto/lda.rb +67 -0
  11. data/lib/tomoto/version.rb +3 -0
  12. data/vendor/EigenRand/EigenRand/Core.h +1139 -0
  13. data/vendor/EigenRand/EigenRand/Dists/Basic.h +111 -0
  14. data/vendor/EigenRand/EigenRand/Dists/Discrete.h +877 -0
  15. data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +108 -0
  16. data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +626 -0
  17. data/vendor/EigenRand/EigenRand/EigenRand +19 -0
  18. data/vendor/EigenRand/EigenRand/Macro.h +24 -0
  19. data/vendor/EigenRand/EigenRand/MorePacketMath.h +978 -0
  20. data/vendor/EigenRand/EigenRand/PacketFilter.h +286 -0
  21. data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +624 -0
  22. data/vendor/EigenRand/EigenRand/RandUtils.h +413 -0
  23. data/vendor/EigenRand/EigenRand/doc.h +220 -0
  24. data/vendor/EigenRand/LICENSE +21 -0
  25. data/vendor/EigenRand/README.md +288 -0
  26. data/vendor/eigen/COPYING.BSD +26 -0
  27. data/vendor/eigen/COPYING.GPL +674 -0
  28. data/vendor/eigen/COPYING.LGPL +502 -0
  29. data/vendor/eigen/COPYING.MINPACK +52 -0
  30. data/vendor/eigen/COPYING.MPL2 +373 -0
  31. data/vendor/eigen/COPYING.README +18 -0
  32. data/vendor/eigen/Eigen/CMakeLists.txt +19 -0
  33. data/vendor/eigen/Eigen/Cholesky +46 -0
  34. data/vendor/eigen/Eigen/CholmodSupport +48 -0
  35. data/vendor/eigen/Eigen/Core +537 -0
  36. data/vendor/eigen/Eigen/Dense +7 -0
  37. data/vendor/eigen/Eigen/Eigen +2 -0
  38. data/vendor/eigen/Eigen/Eigenvalues +61 -0
  39. data/vendor/eigen/Eigen/Geometry +62 -0
  40. data/vendor/eigen/Eigen/Householder +30 -0
  41. data/vendor/eigen/Eigen/IterativeLinearSolvers +48 -0
  42. data/vendor/eigen/Eigen/Jacobi +33 -0
  43. data/vendor/eigen/Eigen/LU +50 -0
  44. data/vendor/eigen/Eigen/MetisSupport +35 -0
  45. data/vendor/eigen/Eigen/OrderingMethods +73 -0
  46. data/vendor/eigen/Eigen/PaStiXSupport +48 -0
  47. data/vendor/eigen/Eigen/PardisoSupport +35 -0
  48. data/vendor/eigen/Eigen/QR +51 -0
  49. data/vendor/eigen/Eigen/QtAlignedMalloc +40 -0
  50. data/vendor/eigen/Eigen/SPQRSupport +34 -0
  51. data/vendor/eigen/Eigen/SVD +51 -0
  52. data/vendor/eigen/Eigen/Sparse +36 -0
  53. data/vendor/eigen/Eigen/SparseCholesky +45 -0
  54. data/vendor/eigen/Eigen/SparseCore +69 -0
  55. data/vendor/eigen/Eigen/SparseLU +46 -0
  56. data/vendor/eigen/Eigen/SparseQR +37 -0
  57. data/vendor/eigen/Eigen/StdDeque +27 -0
  58. data/vendor/eigen/Eigen/StdList +26 -0
  59. data/vendor/eigen/Eigen/StdVector +27 -0
  60. data/vendor/eigen/Eigen/SuperLUSupport +64 -0
  61. data/vendor/eigen/Eigen/UmfPackSupport +40 -0
  62. data/vendor/eigen/Eigen/src/Cholesky/LDLT.h +673 -0
  63. data/vendor/eigen/Eigen/src/Cholesky/LLT.h +542 -0
  64. data/vendor/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +99 -0
  65. data/vendor/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +639 -0
  66. data/vendor/eigen/Eigen/src/Core/Array.h +329 -0
  67. data/vendor/eigen/Eigen/src/Core/ArrayBase.h +226 -0
  68. data/vendor/eigen/Eigen/src/Core/ArrayWrapper.h +209 -0
  69. data/vendor/eigen/Eigen/src/Core/Assign.h +90 -0
  70. data/vendor/eigen/Eigen/src/Core/AssignEvaluator.h +935 -0
  71. data/vendor/eigen/Eigen/src/Core/Assign_MKL.h +178 -0
  72. data/vendor/eigen/Eigen/src/Core/BandMatrix.h +353 -0
  73. data/vendor/eigen/Eigen/src/Core/Block.h +452 -0
  74. data/vendor/eigen/Eigen/src/Core/BooleanRedux.h +164 -0
  75. data/vendor/eigen/Eigen/src/Core/CommaInitializer.h +160 -0
  76. data/vendor/eigen/Eigen/src/Core/ConditionEstimator.h +175 -0
  77. data/vendor/eigen/Eigen/src/Core/CoreEvaluators.h +1688 -0
  78. data/vendor/eigen/Eigen/src/Core/CoreIterators.h +127 -0
  79. data/vendor/eigen/Eigen/src/Core/CwiseBinaryOp.h +184 -0
  80. data/vendor/eigen/Eigen/src/Core/CwiseNullaryOp.h +866 -0
  81. data/vendor/eigen/Eigen/src/Core/CwiseTernaryOp.h +197 -0
  82. data/vendor/eigen/Eigen/src/Core/CwiseUnaryOp.h +103 -0
  83. data/vendor/eigen/Eigen/src/Core/CwiseUnaryView.h +128 -0
  84. data/vendor/eigen/Eigen/src/Core/DenseBase.h +611 -0
  85. data/vendor/eigen/Eigen/src/Core/DenseCoeffsBase.h +681 -0
  86. data/vendor/eigen/Eigen/src/Core/DenseStorage.h +570 -0
  87. data/vendor/eigen/Eigen/src/Core/Diagonal.h +260 -0
  88. data/vendor/eigen/Eigen/src/Core/DiagonalMatrix.h +343 -0
  89. data/vendor/eigen/Eigen/src/Core/DiagonalProduct.h +28 -0
  90. data/vendor/eigen/Eigen/src/Core/Dot.h +318 -0
  91. data/vendor/eigen/Eigen/src/Core/EigenBase.h +159 -0
  92. data/vendor/eigen/Eigen/src/Core/ForceAlignedAccess.h +146 -0
  93. data/vendor/eigen/Eigen/src/Core/Fuzzy.h +155 -0
  94. data/vendor/eigen/Eigen/src/Core/GeneralProduct.h +455 -0
  95. data/vendor/eigen/Eigen/src/Core/GenericPacketMath.h +593 -0
  96. data/vendor/eigen/Eigen/src/Core/GlobalFunctions.h +187 -0
  97. data/vendor/eigen/Eigen/src/Core/IO.h +225 -0
  98. data/vendor/eigen/Eigen/src/Core/Inverse.h +118 -0
  99. data/vendor/eigen/Eigen/src/Core/Map.h +171 -0
  100. data/vendor/eigen/Eigen/src/Core/MapBase.h +303 -0
  101. data/vendor/eigen/Eigen/src/Core/MathFunctions.h +1415 -0
  102. data/vendor/eigen/Eigen/src/Core/MathFunctionsImpl.h +101 -0
  103. data/vendor/eigen/Eigen/src/Core/Matrix.h +459 -0
  104. data/vendor/eigen/Eigen/src/Core/MatrixBase.h +529 -0
  105. data/vendor/eigen/Eigen/src/Core/NestByValue.h +110 -0
  106. data/vendor/eigen/Eigen/src/Core/NoAlias.h +108 -0
  107. data/vendor/eigen/Eigen/src/Core/NumTraits.h +248 -0
  108. data/vendor/eigen/Eigen/src/Core/PermutationMatrix.h +633 -0
  109. data/vendor/eigen/Eigen/src/Core/PlainObjectBase.h +1035 -0
  110. data/vendor/eigen/Eigen/src/Core/Product.h +186 -0
  111. data/vendor/eigen/Eigen/src/Core/ProductEvaluators.h +1112 -0
  112. data/vendor/eigen/Eigen/src/Core/Random.h +182 -0
  113. data/vendor/eigen/Eigen/src/Core/Redux.h +505 -0
  114. data/vendor/eigen/Eigen/src/Core/Ref.h +283 -0
  115. data/vendor/eigen/Eigen/src/Core/Replicate.h +142 -0
  116. data/vendor/eigen/Eigen/src/Core/ReturnByValue.h +117 -0
  117. data/vendor/eigen/Eigen/src/Core/Reverse.h +211 -0
  118. data/vendor/eigen/Eigen/src/Core/Select.h +162 -0
  119. data/vendor/eigen/Eigen/src/Core/SelfAdjointView.h +352 -0
  120. data/vendor/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +47 -0
  121. data/vendor/eigen/Eigen/src/Core/Solve.h +188 -0
  122. data/vendor/eigen/Eigen/src/Core/SolveTriangular.h +235 -0
  123. data/vendor/eigen/Eigen/src/Core/SolverBase.h +130 -0
  124. data/vendor/eigen/Eigen/src/Core/StableNorm.h +221 -0
  125. data/vendor/eigen/Eigen/src/Core/Stride.h +111 -0
  126. data/vendor/eigen/Eigen/src/Core/Swap.h +67 -0
  127. data/vendor/eigen/Eigen/src/Core/Transpose.h +403 -0
  128. data/vendor/eigen/Eigen/src/Core/Transpositions.h +407 -0
  129. data/vendor/eigen/Eigen/src/Core/TriangularMatrix.h +983 -0
  130. data/vendor/eigen/Eigen/src/Core/VectorBlock.h +96 -0
  131. data/vendor/eigen/Eigen/src/Core/VectorwiseOp.h +695 -0
  132. data/vendor/eigen/Eigen/src/Core/Visitor.h +273 -0
  133. data/vendor/eigen/Eigen/src/Core/arch/AVX/Complex.h +451 -0
  134. data/vendor/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +439 -0
  135. data/vendor/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +637 -0
  136. data/vendor/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +51 -0
  137. data/vendor/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +391 -0
  138. data/vendor/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1316 -0
  139. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +430 -0
  140. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +322 -0
  141. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +1061 -0
  142. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Complex.h +103 -0
  143. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Half.h +674 -0
  144. data/vendor/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +91 -0
  145. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +333 -0
  146. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +1124 -0
  147. data/vendor/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +212 -0
  148. data/vendor/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +29 -0
  149. data/vendor/eigen/Eigen/src/Core/arch/Default/Settings.h +49 -0
  150. data/vendor/eigen/Eigen/src/Core/arch/NEON/Complex.h +490 -0
  151. data/vendor/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +91 -0
  152. data/vendor/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +760 -0
  153. data/vendor/eigen/Eigen/src/Core/arch/SSE/Complex.h +471 -0
  154. data/vendor/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +562 -0
  155. data/vendor/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +895 -0
  156. data/vendor/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +77 -0
  157. data/vendor/eigen/Eigen/src/Core/arch/ZVector/Complex.h +397 -0
  158. data/vendor/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +137 -0
  159. data/vendor/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +945 -0
  160. data/vendor/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +168 -0
  161. data/vendor/eigen/Eigen/src/Core/functors/BinaryFunctors.h +475 -0
  162. data/vendor/eigen/Eigen/src/Core/functors/NullaryFunctors.h +188 -0
  163. data/vendor/eigen/Eigen/src/Core/functors/StlFunctors.h +136 -0
  164. data/vendor/eigen/Eigen/src/Core/functors/TernaryFunctors.h +25 -0
  165. data/vendor/eigen/Eigen/src/Core/functors/UnaryFunctors.h +792 -0
  166. data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2156 -0
  167. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +492 -0
  168. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +311 -0
  169. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +145 -0
  170. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +122 -0
  171. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +619 -0
  172. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +136 -0
  173. data/vendor/eigen/Eigen/src/Core/products/Parallelizer.h +163 -0
  174. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +521 -0
  175. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +287 -0
  176. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +260 -0
  177. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +118 -0
  178. data/vendor/eigen/Eigen/src/Core/products/SelfadjointProduct.h +133 -0
  179. data/vendor/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +93 -0
  180. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +466 -0
  181. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +315 -0
  182. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +350 -0
  183. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +255 -0
  184. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +335 -0
  185. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +163 -0
  186. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverVector.h +145 -0
  187. data/vendor/eigen/Eigen/src/Core/util/BlasUtil.h +398 -0
  188. data/vendor/eigen/Eigen/src/Core/util/Constants.h +547 -0
  189. data/vendor/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +83 -0
  190. data/vendor/eigen/Eigen/src/Core/util/ForwardDeclarations.h +302 -0
  191. data/vendor/eigen/Eigen/src/Core/util/MKL_support.h +130 -0
  192. data/vendor/eigen/Eigen/src/Core/util/Macros.h +1001 -0
  193. data/vendor/eigen/Eigen/src/Core/util/Memory.h +993 -0
  194. data/vendor/eigen/Eigen/src/Core/util/Meta.h +534 -0
  195. data/vendor/eigen/Eigen/src/Core/util/NonMPL2.h +3 -0
  196. data/vendor/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +27 -0
  197. data/vendor/eigen/Eigen/src/Core/util/StaticAssert.h +218 -0
  198. data/vendor/eigen/Eigen/src/Core/util/XprHelper.h +821 -0
  199. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +346 -0
  200. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +459 -0
  201. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +91 -0
  202. data/vendor/eigen/Eigen/src/Eigenvalues/EigenSolver.h +622 -0
  203. data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +418 -0
  204. data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +226 -0
  205. data/vendor/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +374 -0
  206. data/vendor/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +158 -0
  207. data/vendor/eigen/Eigen/src/Eigenvalues/RealQZ.h +654 -0
  208. data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur.h +546 -0
  209. data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +77 -0
  210. data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +870 -0
  211. data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +87 -0
  212. data/vendor/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +556 -0
  213. data/vendor/eigen/Eigen/src/Geometry/AlignedBox.h +392 -0
  214. data/vendor/eigen/Eigen/src/Geometry/AngleAxis.h +247 -0
  215. data/vendor/eigen/Eigen/src/Geometry/EulerAngles.h +114 -0
  216. data/vendor/eigen/Eigen/src/Geometry/Homogeneous.h +497 -0
  217. data/vendor/eigen/Eigen/src/Geometry/Hyperplane.h +282 -0
  218. data/vendor/eigen/Eigen/src/Geometry/OrthoMethods.h +234 -0
  219. data/vendor/eigen/Eigen/src/Geometry/ParametrizedLine.h +195 -0
  220. data/vendor/eigen/Eigen/src/Geometry/Quaternion.h +814 -0
  221. data/vendor/eigen/Eigen/src/Geometry/Rotation2D.h +199 -0
  222. data/vendor/eigen/Eigen/src/Geometry/RotationBase.h +206 -0
  223. data/vendor/eigen/Eigen/src/Geometry/Scaling.h +170 -0
  224. data/vendor/eigen/Eigen/src/Geometry/Transform.h +1542 -0
  225. data/vendor/eigen/Eigen/src/Geometry/Translation.h +208 -0
  226. data/vendor/eigen/Eigen/src/Geometry/Umeyama.h +166 -0
  227. data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +161 -0
  228. data/vendor/eigen/Eigen/src/Householder/BlockHouseholder.h +103 -0
  229. data/vendor/eigen/Eigen/src/Householder/Householder.h +172 -0
  230. data/vendor/eigen/Eigen/src/Householder/HouseholderSequence.h +470 -0
  231. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +226 -0
  232. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +228 -0
  233. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +246 -0
  234. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +400 -0
  235. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +462 -0
  236. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +394 -0
  237. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +216 -0
  238. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +115 -0
  239. data/vendor/eigen/Eigen/src/Jacobi/Jacobi.h +462 -0
  240. data/vendor/eigen/Eigen/src/LU/Determinant.h +101 -0
  241. data/vendor/eigen/Eigen/src/LU/FullPivLU.h +891 -0
  242. data/vendor/eigen/Eigen/src/LU/InverseImpl.h +415 -0
  243. data/vendor/eigen/Eigen/src/LU/PartialPivLU.h +611 -0
  244. data/vendor/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +83 -0
  245. data/vendor/eigen/Eigen/src/LU/arch/Inverse_SSE.h +338 -0
  246. data/vendor/eigen/Eigen/src/MetisSupport/MetisSupport.h +137 -0
  247. data/vendor/eigen/Eigen/src/OrderingMethods/Amd.h +445 -0
  248. data/vendor/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +1843 -0
  249. data/vendor/eigen/Eigen/src/OrderingMethods/Ordering.h +157 -0
  250. data/vendor/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +678 -0
  251. data/vendor/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +543 -0
  252. data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR.h +653 -0
  253. data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +97 -0
  254. data/vendor/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +562 -0
  255. data/vendor/eigen/Eigen/src/QR/FullPivHouseholderQR.h +676 -0
  256. data/vendor/eigen/Eigen/src/QR/HouseholderQR.h +409 -0
  257. data/vendor/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +68 -0
  258. data/vendor/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +313 -0
  259. data/vendor/eigen/Eigen/src/SVD/BDCSVD.h +1246 -0
  260. data/vendor/eigen/Eigen/src/SVD/JacobiSVD.h +804 -0
  261. data/vendor/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +91 -0
  262. data/vendor/eigen/Eigen/src/SVD/SVDBase.h +315 -0
  263. data/vendor/eigen/Eigen/src/SVD/UpperBidiagonalization.h +414 -0
  264. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +689 -0
  265. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +199 -0
  266. data/vendor/eigen/Eigen/src/SparseCore/AmbiVector.h +377 -0
  267. data/vendor/eigen/Eigen/src/SparseCore/CompressedStorage.h +258 -0
  268. data/vendor/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +352 -0
  269. data/vendor/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +67 -0
  270. data/vendor/eigen/Eigen/src/SparseCore/SparseAssign.h +216 -0
  271. data/vendor/eigen/Eigen/src/SparseCore/SparseBlock.h +603 -0
  272. data/vendor/eigen/Eigen/src/SparseCore/SparseColEtree.h +206 -0
  273. data/vendor/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +341 -0
  274. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +726 -0
  275. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +148 -0
  276. data/vendor/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +320 -0
  277. data/vendor/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +138 -0
  278. data/vendor/eigen/Eigen/src/SparseCore/SparseDot.h +98 -0
  279. data/vendor/eigen/Eigen/src/SparseCore/SparseFuzzy.h +29 -0
  280. data/vendor/eigen/Eigen/src/SparseCore/SparseMap.h +305 -0
  281. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrix.h +1403 -0
  282. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +405 -0
  283. data/vendor/eigen/Eigen/src/SparseCore/SparsePermutation.h +178 -0
  284. data/vendor/eigen/Eigen/src/SparseCore/SparseProduct.h +169 -0
  285. data/vendor/eigen/Eigen/src/SparseCore/SparseRedux.h +49 -0
  286. data/vendor/eigen/Eigen/src/SparseCore/SparseRef.h +397 -0
  287. data/vendor/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +656 -0
  288. data/vendor/eigen/Eigen/src/SparseCore/SparseSolverBase.h +124 -0
  289. data/vendor/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +198 -0
  290. data/vendor/eigen/Eigen/src/SparseCore/SparseTranspose.h +92 -0
  291. data/vendor/eigen/Eigen/src/SparseCore/SparseTriangularView.h +189 -0
  292. data/vendor/eigen/Eigen/src/SparseCore/SparseUtil.h +178 -0
  293. data/vendor/eigen/Eigen/src/SparseCore/SparseVector.h +478 -0
  294. data/vendor/eigen/Eigen/src/SparseCore/SparseView.h +253 -0
  295. data/vendor/eigen/Eigen/src/SparseCore/TriangularSolver.h +315 -0
  296. data/vendor/eigen/Eigen/src/SparseLU/SparseLU.h +773 -0
  297. data/vendor/eigen/Eigen/src/SparseLU/SparseLUImpl.h +66 -0
  298. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +226 -0
  299. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +110 -0
  300. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +301 -0
  301. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +80 -0
  302. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +181 -0
  303. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +179 -0
  304. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +107 -0
  305. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +280 -0
  306. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +126 -0
  307. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +130 -0
  308. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +223 -0
  309. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +258 -0
  310. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +137 -0
  311. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +136 -0
  312. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +83 -0
  313. data/vendor/eigen/Eigen/src/SparseQR/SparseQR.h +745 -0
  314. data/vendor/eigen/Eigen/src/StlSupport/StdDeque.h +126 -0
  315. data/vendor/eigen/Eigen/src/StlSupport/StdList.h +106 -0
  316. data/vendor/eigen/Eigen/src/StlSupport/StdVector.h +131 -0
  317. data/vendor/eigen/Eigen/src/StlSupport/details.h +84 -0
  318. data/vendor/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +1027 -0
  319. data/vendor/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +506 -0
  320. data/vendor/eigen/Eigen/src/misc/Image.h +82 -0
  321. data/vendor/eigen/Eigen/src/misc/Kernel.h +79 -0
  322. data/vendor/eigen/Eigen/src/misc/RealSvd2x2.h +55 -0
  323. data/vendor/eigen/Eigen/src/misc/blas.h +440 -0
  324. data/vendor/eigen/Eigen/src/misc/lapack.h +152 -0
  325. data/vendor/eigen/Eigen/src/misc/lapacke.h +16291 -0
  326. data/vendor/eigen/Eigen/src/misc/lapacke_mangling.h +17 -0
  327. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +332 -0
  328. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +552 -0
  329. data/vendor/eigen/Eigen/src/plugins/BlockMethods.h +1058 -0
  330. data/vendor/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +115 -0
  331. data/vendor/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +163 -0
  332. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +152 -0
  333. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +85 -0
  334. data/vendor/eigen/README.md +3 -0
  335. data/vendor/eigen/bench/README.txt +55 -0
  336. data/vendor/eigen/bench/btl/COPYING +340 -0
  337. data/vendor/eigen/bench/btl/README +154 -0
  338. data/vendor/eigen/bench/tensors/README +21 -0
  339. data/vendor/eigen/blas/README.txt +6 -0
  340. data/vendor/eigen/demos/mandelbrot/README +10 -0
  341. data/vendor/eigen/demos/mix_eigen_and_c/README +9 -0
  342. data/vendor/eigen/demos/opengl/README +13 -0
  343. data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +1760 -0
  344. data/vendor/eigen/unsupported/README.txt +50 -0
  345. data/vendor/tomotopy/LICENSE +21 -0
  346. data/vendor/tomotopy/README.kr.rst +375 -0
  347. data/vendor/tomotopy/README.rst +382 -0
  348. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +362 -0
  349. data/vendor/tomotopy/src/Labeling/FoRelevance.h +88 -0
  350. data/vendor/tomotopy/src/Labeling/Labeler.h +50 -0
  351. data/vendor/tomotopy/src/TopicModel/CT.h +37 -0
  352. data/vendor/tomotopy/src/TopicModel/CTModel.cpp +13 -0
  353. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +293 -0
  354. data/vendor/tomotopy/src/TopicModel/DMR.h +51 -0
  355. data/vendor/tomotopy/src/TopicModel/DMRModel.cpp +13 -0
  356. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +374 -0
  357. data/vendor/tomotopy/src/TopicModel/DT.h +65 -0
  358. data/vendor/tomotopy/src/TopicModel/DTM.h +22 -0
  359. data/vendor/tomotopy/src/TopicModel/DTModel.cpp +15 -0
  360. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +572 -0
  361. data/vendor/tomotopy/src/TopicModel/GDMR.h +37 -0
  362. data/vendor/tomotopy/src/TopicModel/GDMRModel.cpp +14 -0
  363. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +485 -0
  364. data/vendor/tomotopy/src/TopicModel/HDP.h +74 -0
  365. data/vendor/tomotopy/src/TopicModel/HDPModel.cpp +13 -0
  366. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +592 -0
  367. data/vendor/tomotopy/src/TopicModel/HLDA.h +40 -0
  368. data/vendor/tomotopy/src/TopicModel/HLDAModel.cpp +13 -0
  369. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +681 -0
  370. data/vendor/tomotopy/src/TopicModel/HPA.h +27 -0
  371. data/vendor/tomotopy/src/TopicModel/HPAModel.cpp +21 -0
  372. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +588 -0
  373. data/vendor/tomotopy/src/TopicModel/LDA.h +144 -0
  374. data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +442 -0
  375. data/vendor/tomotopy/src/TopicModel/LDAModel.cpp +13 -0
  376. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +1058 -0
  377. data/vendor/tomotopy/src/TopicModel/LLDA.h +45 -0
  378. data/vendor/tomotopy/src/TopicModel/LLDAModel.cpp +13 -0
  379. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +203 -0
  380. data/vendor/tomotopy/src/TopicModel/MGLDA.h +63 -0
  381. data/vendor/tomotopy/src/TopicModel/MGLDAModel.cpp +17 -0
  382. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +558 -0
  383. data/vendor/tomotopy/src/TopicModel/PA.h +43 -0
  384. data/vendor/tomotopy/src/TopicModel/PAModel.cpp +13 -0
  385. data/vendor/tomotopy/src/TopicModel/PAModel.hpp +467 -0
  386. data/vendor/tomotopy/src/TopicModel/PLDA.h +17 -0
  387. data/vendor/tomotopy/src/TopicModel/PLDAModel.cpp +13 -0
  388. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +214 -0
  389. data/vendor/tomotopy/src/TopicModel/SLDA.h +54 -0
  390. data/vendor/tomotopy/src/TopicModel/SLDAModel.cpp +17 -0
  391. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +456 -0
  392. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +692 -0
  393. data/vendor/tomotopy/src/Utils/AliasMethod.hpp +169 -0
  394. data/vendor/tomotopy/src/Utils/Dictionary.h +80 -0
  395. data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +181 -0
  396. data/vendor/tomotopy/src/Utils/LBFGS.h +202 -0
  397. data/vendor/tomotopy/src/Utils/LBFGS/LineSearchBacktracking.h +120 -0
  398. data/vendor/tomotopy/src/Utils/LBFGS/LineSearchBracketing.h +122 -0
  399. data/vendor/tomotopy/src/Utils/LBFGS/Param.h +213 -0
  400. data/vendor/tomotopy/src/Utils/LUT.hpp +82 -0
  401. data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +69 -0
  402. data/vendor/tomotopy/src/Utils/PolyaGamma.hpp +200 -0
  403. data/vendor/tomotopy/src/Utils/PolyaGammaHybrid.hpp +672 -0
  404. data/vendor/tomotopy/src/Utils/ThreadPool.hpp +150 -0
  405. data/vendor/tomotopy/src/Utils/Trie.hpp +220 -0
  406. data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +94 -0
  407. data/vendor/tomotopy/src/Utils/Utils.hpp +337 -0
  408. data/vendor/tomotopy/src/Utils/avx_gamma.h +46 -0
  409. data/vendor/tomotopy/src/Utils/avx_mathfun.h +736 -0
  410. data/vendor/tomotopy/src/Utils/exception.h +28 -0
  411. data/vendor/tomotopy/src/Utils/math.h +281 -0
  412. data/vendor/tomotopy/src/Utils/rtnorm.hpp +2690 -0
  413. data/vendor/tomotopy/src/Utils/sample.hpp +192 -0
  414. data/vendor/tomotopy/src/Utils/serializer.hpp +695 -0
  415. data/vendor/tomotopy/src/Utils/slp.hpp +131 -0
  416. data/vendor/tomotopy/src/Utils/sse_gamma.h +48 -0
  417. data/vendor/tomotopy/src/Utils/sse_mathfun.h +710 -0
  418. data/vendor/tomotopy/src/Utils/text.hpp +49 -0
  419. data/vendor/tomotopy/src/Utils/tvector.hpp +543 -0
  420. metadata +531 -0
@@ -0,0 +1,91 @@
1
+ // This file is part of Eigen, a lightweight C++ template library
2
+ // for linear algebra.
3
+ //
4
+ // This Source Code Form is subject to the terms of the Mozilla
5
+ // Public License v. 2.0. If a copy of the MPL was not distributed
6
+ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
7
+
8
+ /* The sin, cos, exp, and log functions of this file come from
9
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
10
+ */
11
+
12
+ #ifndef EIGEN_MATH_FUNCTIONS_NEON_H
13
+ #define EIGEN_MATH_FUNCTIONS_NEON_H
14
+
15
+ namespace Eigen {
16
+
17
+ namespace internal {
18
+ // pexp specialization for Packet4f: approximates exp() per lane by clamping the input, splitting it as g + n*ln(2), evaluating a polynomial in g, and scaling the result by 2^n.
19
+ template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
20
+ Packet4f pexp<Packet4f>(const Packet4f& _x)
21
+ {
22
+ Packet4f x = _x;
23
+ Packet4f tmp, fx;
24
+ // The declaration macros are defined outside this view; the body below refers to the resulting constants as p4f_<name> / p4i_<name>.
25
+ _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
26
+ _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
27
+ _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); // added to n before the shift when building 2^n
28
+ _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); // upper clamp bound for the input
29
+ _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); // lower clamp bound for the input
30
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); // log2(e)
31
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); // C1 + C2 ~= ln(2); NOTE(review): presumably split in two to limit rounding error - confirm
32
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
33
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); // p0..p5: polynomial coefficients, highest order first
34
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
35
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
36
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
37
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
38
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
39
+ // Clamp the input to [exp_lo, exp_hi].
40
+ x = vminq_f32(x, p4f_exp_hi);
41
+ x = vmaxq_f32(x, p4f_exp_lo);
42
+
43
+ /* express exp(x) as exp(g + n*log(2)) */
44
+ fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF); // fx = 0.5 + x * log2(e)
45
+
46
+ /* perform a floorf */
47
+ tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); // float -> int32 -> float round trip drops the fractional part
48
+
49
+ /* if greater, subtract 1 */
50
+ Packet4ui mask = vcgtq_f32(tmp, fx); // selects the lanes where tmp > fx
51
+ mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1)); // bit pattern of 1.0f in the selected lanes, 0 elsewhere
52
+
53
+ fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); // fx now holds n, the floor of the previous fx
54
+ // Range reduction: subtract n*C1 and then n*C2 from x, leaving the remainder g in x.
55
+ tmp = vmulq_f32(fx, p4f_cephes_exp_C1);
56
+ Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2);
57
+ x = vsubq_f32(x, tmp);
58
+ x = vsubq_f32(x, z);
59
+ // Horner-style evaluation of the p0..p5 polynomial in x; z is reused to hold x*x.
60
+ Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x);
61
+ z = vmulq_f32(x, x);
62
+ y = vaddq_f32(y, p4f_cephes_exp_p1);
63
+ y = vmulq_f32(y, x);
64
+ y = vaddq_f32(y, p4f_cephes_exp_p2);
65
+ y = vmulq_f32(y, x);
66
+ y = vaddq_f32(y, p4f_cephes_exp_p3);
67
+ y = vmulq_f32(y, x);
68
+ y = vaddq_f32(y, p4f_cephes_exp_p4);
69
+ y = vmulq_f32(y, x);
70
+ y = vaddq_f32(y, p4f_cephes_exp_p5);
71
+ // Combine: y = P(x) * x^2 + x + 1.
72
+ y = vmulq_f32(y, z);
73
+ y = vaddq_f32(y, x);
74
+ y = vaddq_f32(y, p4f_1);
75
+
76
+ /* build 2^n */
77
+ int32x4_t mm;
78
+ mm = vcvtq_s32_f32(fx);
79
+ mm = vaddq_s32(mm, p4i_0x7f); // n + 0x7f (127)
80
+ mm = vshlq_n_s32(mm, 23); // shift left by 23 bits so the value lands in the float exponent bits
81
+ Packet4f pow2n = vreinterpretq_f32_s32(mm);
82
+ // Scale the polynomial result by 2^n.
83
+ y = vmulq_f32(y, pow2n);
84
+ return y;
85
+ }
86
+
87
+ } // end namespace internal
88
+
89
+ } // end namespace Eigen
90
+
91
+ #endif // EIGEN_MATH_FUNCTIONS_NEON_H
@@ -0,0 +1,760 @@
1
+ // This file is part of Eigen, a lightweight C++ template library
2
+ // for linear algebra.
3
+ //
4
+ // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5
+ // Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
6
+ // Heavily based on Gael's SSE version.
7
+ //
8
+ // This Source Code Form is subject to the terms of the Mozilla
9
+ // Public License v. 2.0. If a copy of the MPL was not distributed
10
+ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
11
+
12
+ #ifndef EIGEN_PACKET_MATH_NEON_H
13
+ #define EIGEN_PACKET_MATH_NEON_H
14
+
15
+ namespace Eigen {
16
+
17
+ namespace internal {
18
+
19
+ #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
20
+ #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
21
+ #endif
22
+
23
+ #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
24
+ #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
25
+ #endif
26
+
27
+ #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
28
+ #define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
29
+ #endif
30
+
31
+ #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
32
+ #if EIGEN_ARCH_ARM64
33
+ #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
34
+ #else
35
+ #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
36
+ #endif
37
+ #endif
38
+
39
+ #if EIGEN_COMP_MSVC
40
+
41
+ // In MSVC's arm_neon.h header file, all NEON vector types
42
+ // are aliases to the same underlying type __n128.
43
+ // We thus have to wrap them to make them different C++ types.
44
+ // (See also bug 1428)
45
+
46
// Wraps a value of type T in a distinct C++ type. The unique_id parameter is
// not used by the body; it only makes two wrappers around the same T different
// types. The wrapper converts implicitly to and from T so it can be passed
// wherever a T is expected.
template<typename T,int unique_id>
struct eigen_packet_wrapper
{
  T m_val;

  // Leaves m_val uninitialized, like a plain T would be.
  eigen_packet_wrapper() {}

  // Implicit on purpose: a raw T must convert to the wrapper.
  eigen_packet_wrapper(const T &value) : m_val(value) {}

  eigen_packet_wrapper& operator=(const T &value) {
    m_val = value;
    return *this;
  }

  // Implicit access to the wrapped value.
  operator const T&() const { return m_val; }
  operator T&() { return m_val; }
};
60
+ typedef eigen_packet_wrapper<float32x2_t,0> Packet2f;
61
+ typedef eigen_packet_wrapper<float32x4_t,1> Packet4f;
62
+ typedef eigen_packet_wrapper<int32x4_t ,2> Packet4i;
63
+ typedef eigen_packet_wrapper<int32x2_t ,3> Packet2i;
64
+ typedef eigen_packet_wrapper<uint32x4_t ,4> Packet4ui;
65
+
66
+ #else
67
+
68
+ typedef float32x2_t Packet2f;
69
+ typedef float32x4_t Packet4f;
70
+ typedef int32x4_t Packet4i;
71
+ typedef int32x2_t Packet2i;
72
+ typedef uint32x4_t Packet4ui;
73
+
74
+ #endif // EIGEN_COMP_MSVC
75
+
76
+ #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
77
+ const Packet4f p4f_##NAME = pset1<Packet4f>(X)
78
+
79
// Declares a constant Packet4f named p4f_NAME whose four lanes all carry the
// bit pattern of the 32-bit integer X (a bit-cast, not a numeric conversion).
// X is first broadcast into a Packet4i with pset1<Packet4i>, then reinterpreted
// with the signed-integer cast so the argument is a vector of the type the
// intrinsic expects. (pset1<int32_t> would produce a scalar, which cannot be
// passed to a vreinterpretq_* intrinsic.)
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
  const Packet4f p4f_##NAME = vreinterpretq_f32_s32(pset1<Packet4i>(X))
81
+
82
+ #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
83
+ const Packet4i p4i_##NAME = pset1<Packet4i>(X)
84
+
85
+ #if EIGEN_ARCH_ARM64
86
+ // __builtin_prefetch tends to do nothing on ARM64 compilers because the
87
+ // prefetch instructions there are too detailed for __builtin_prefetch to map
88
+ // meaningfully to them.
89
+ #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
90
+ #elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
91
+ #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
92
+ #elif defined __pld
93
+ #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
94
+ #elif EIGEN_ARCH_ARM32
95
+ #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
96
+ #else
97
+ // by default no explicit prefetching
98
+ #define EIGEN_ARM_PREFETCH(ADDR)
99
+ #endif
100
+
101
+ template<> struct packet_traits<float> : default_packet_traits
102
+ {
103
+ typedef Packet4f type;
104
+ typedef Packet4f half; // Packet2f intrinsics not implemented yet
105
+ enum {
106
+ Vectorizable = 1,
107
+ AlignedOnScalar = 1,
108
+ size = 4,
109
+ HasHalfPacket=0, // Packet2f intrinsics not implemented yet
110
+
111
+ HasDiv = 1,
112
+ // FIXME check the Has*
113
+ HasSin = 0,
114
+ HasCos = 0,
115
+ HasLog = 0,
116
+ HasExp = 1,
117
+ HasSqrt = 0
118
+ };
119
+ };
120
+ template<> struct packet_traits<int32_t> : default_packet_traits
121
+ {
122
+ typedef Packet4i type;
123
+ typedef Packet4i half; // Packet2i intrinsics not implemented yet
124
+ enum {
125
+ Vectorizable = 1,
126
+ AlignedOnScalar = 1,
127
+ size=4,
128
+ HasHalfPacket=0 // Packet2i intrinsics not implemented yet
129
+ // FIXME check the Has*
130
+ };
131
+ };
132
+
133
+ #if EIGEN_GNUC_AT_MOST(4,4) && !EIGEN_COMP_LLVM
134
+ // workaround gcc 4.2, 4.3 and 4.4 compilation issue
135
+ EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
136
+ EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); }
137
+ EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); }
138
+ EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
139
+ EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
140
+ #endif
141
+
142
+ template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
143
+ template<> struct unpacket_traits<Packet4i> { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
144
+
145
+ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
146
+ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { return vdupq_n_s32(from); }
147
+
148
+ template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
149
+ {
150
+ const float f[] = {0, 1, 2, 3};
151
+ Packet4f countdown = vld1q_f32(f);
152
+ return vaddq_f32(pset1<Packet4f>(a), countdown);
153
+ }
154
+ template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a)
155
+ {
156
+ const int32_t i[] = {0, 1, 2, 3};
157
+ Packet4i countdown = vld1q_s32(i);
158
+ return vaddq_s32(pset1<Packet4i>(a), countdown);
159
+ }
160
+
161
+ template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
162
+ template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }
163
+
164
+ template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
165
+ template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }
166
+
167
+ template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
168
+ template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); }
169
+
170
+ template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
171
+ template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
172
+
173
+ template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
174
+ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
175
+
176
+ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
177
+ {
178
+ #if EIGEN_ARCH_ARM64
179
+ return vdivq_f32(a,b);
180
+ #else
181
+ Packet4f inv, restep, div;
182
+
183
+ // NEON does not offer a divide instruction, we have to do a reciprocal approximation
184
+ // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
185
+ // a reciprocal estimate AND a reciprocal step -which saves a few instructions
186
+ // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
187
+ // Newton-Raphson and vrecpsq_f32()
188
+ inv = vrecpeq_f32(b);
189
+
190
+ // This returns a differential, by which we will have to multiply inv to get a better
191
+ // approximation of 1/b.
192
+ restep = vrecpsq_f32(b, inv);
193
+ inv = vmulq_f32(restep, inv);
194
+
195
+ // Finally, multiply a by 1/b and get the wanted result of the division.
196
+ div = vmulq_f32(a, inv);
197
+
198
+ return div;
199
+ #endif
200
+ }
201
+
202
+ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
203
+ { eigen_assert(false && "packet integer division are not supported by NEON");
204
+ return pset1<Packet4i>(0);
205
+ }
206
+
207
+ // Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available,
208
+ // then implements a slow software scalar fallback calling fmaf()!
209
+ // Filed LLVM bug:
210
+ // https://llvm.org/bugs/show_bug.cgi?id=27216
211
+ #if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM)
212
+ // See bug 936.
213
+ // FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4.
214
+ // FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding.
215
+ // MLA is not fused i.e. does 2 roundings.
216
+ // In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4):
217
+ // MLA: 10 GFlop/s ; FMA: 12 GFlops/s.
218
+ template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); }
219
+ #else
220
+ template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
221
+ #if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM
222
+ // Clang/ARM will replace VMLA by VMUL+VADD at least for some values of -mcpu,
223
+ // at least -mcpu=cortex-a8 and -mcpu=cortex-a7. Since the former is the default on
224
+ // -march=armv7-a, that is a very common case.
225
+ // See e.g. this thread:
226
+ // http://lists.llvm.org/pipermail/llvm-dev/2013-December/068806.html
227
+ // Filed LLVM bug:
228
+ // https://llvm.org/bugs/show_bug.cgi?id=27219
229
+ Packet4f r = c;
230
+ asm volatile(
231
+ "vmla.f32 %q[r], %q[a], %q[b]"
232
+ : [r] "+w" (r)
233
+ : [a] "w" (a),
234
+ [b] "w" (b)
235
+ : );
236
+ return r;
237
+ #else
238
+ return vmlaq_f32(c,a,b);
239
+ #endif
240
+ }
241
+ #endif
242
+
243
+ // No FMA instruction for int, so use MLA unconditionally.
244
+ template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); }
245
+
246
+ template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
247
+ template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }
248
+
249
+ template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
250
+ template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
251
+
252
+ // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
253
+ template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
254
+ {
255
+ return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
256
+ }
257
+ template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }
258
+
259
+ template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
260
+ {
261
+ return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
262
+ }
263
+ template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }
264
+
265
+ template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
266
+ {
267
+ return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
268
+ }
269
+ template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }
270
+
271
+ template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
272
+ {
273
+ return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
274
+ }
275
+ template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }
276
+
277
+ template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
278
+ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
279
+
280
+ template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
281
+ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
282
+
283
+ template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
284
+ {
285
+ float32x2_t lo, hi;
286
+ lo = vld1_dup_f32(from);
287
+ hi = vld1_dup_f32(from+1);
288
+ return vcombine_f32(lo, hi);
289
+ }
290
+ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from)
291
+ {
292
+ int32x2_t lo, hi;
293
+ lo = vld1_dup_s32(from);
294
+ hi = vld1_dup_s32(from+1);
295
+ return vcombine_s32(lo, hi);
296
+ }
297
+
298
+ template<> EIGEN_STRONG_INLINE void pstore<float> (float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
299
+ template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
300
+
301
+ template<> EIGEN_STRONG_INLINE void pstoreu<float> (float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
302
+ template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
303
+
304
+ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
305
+ {
306
+ Packet4f res = pset1<Packet4f>(0.f);
307
+ res = vsetq_lane_f32(from[0*stride], res, 0);
308
+ res = vsetq_lane_f32(from[1*stride], res, 1);
309
+ res = vsetq_lane_f32(from[2*stride], res, 2);
310
+ res = vsetq_lane_f32(from[3*stride], res, 3);
311
+ return res;
312
+ }
313
+ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
314
+ {
315
+ Packet4i res = pset1<Packet4i>(0);
316
+ res = vsetq_lane_s32(from[0*stride], res, 0);
317
+ res = vsetq_lane_s32(from[1*stride], res, 1);
318
+ res = vsetq_lane_s32(from[2*stride], res, 2);
319
+ res = vsetq_lane_s32(from[3*stride], res, 3);
320
+ return res;
321
+ }
322
+
323
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
324
+ {
325
+ to[stride*0] = vgetq_lane_f32(from, 0);
326
+ to[stride*1] = vgetq_lane_f32(from, 1);
327
+ to[stride*2] = vgetq_lane_f32(from, 2);
328
+ to[stride*3] = vgetq_lane_f32(from, 3);
329
+ }
330
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
331
+ {
332
+ to[stride*0] = vgetq_lane_s32(from, 0);
333
+ to[stride*1] = vgetq_lane_s32(from, 1);
334
+ to[stride*2] = vgetq_lane_s32(from, 2);
335
+ to[stride*3] = vgetq_lane_s32(from, 3);
336
+ }
337
+
338
+ template<> EIGEN_STRONG_INLINE void prefetch<float> (const float* addr) { EIGEN_ARM_PREFETCH(addr); }
339
+ template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
340
+
341
+ // FIXME only store the 2 first elements ?
342
+ template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
343
+ template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
344
+
345
+ template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
346
+ float32x2_t a_lo, a_hi;
347
+ Packet4f a_r64;
348
+
349
+ a_r64 = vrev64q_f32(a);
350
+ a_lo = vget_low_f32(a_r64);
351
+ a_hi = vget_high_f32(a_r64);
352
+ return vcombine_f32(a_hi, a_lo);
353
+ }
354
+ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
355
+ int32x2_t a_lo, a_hi;
356
+ Packet4i a_r64;
357
+
358
+ a_r64 = vrev64q_s32(a);
359
+ a_lo = vget_low_s32(a_r64);
360
+ a_hi = vget_high_s32(a_r64);
361
+ return vcombine_s32(a_hi, a_lo);
362
+ }
363
+
364
+ template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
365
+ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
366
+
367
+ template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
368
+ {
369
+ float32x2_t a_lo, a_hi, sum;
370
+
371
+ a_lo = vget_low_f32(a);
372
+ a_hi = vget_high_f32(a);
373
+ sum = vpadd_f32(a_lo, a_hi);
374
+ sum = vpadd_f32(sum, sum);
375
+ return vget_lane_f32(sum, 0);
376
+ }
377
+
378
+ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
379
+ {
380
+ float32x4x2_t vtrn1, vtrn2, res1, res2;
381
+ Packet4f sum1, sum2, sum;
382
+
383
+ // NEON zip performs interleaving of the supplied vectors.
384
+ // We perform two interleaves in a row to acquire the transposed vector
385
+ vtrn1 = vzipq_f32(vecs[0], vecs[2]);
386
+ vtrn2 = vzipq_f32(vecs[1], vecs[3]);
387
+ res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]);
388
+ res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]);
389
+
390
+ // Do the addition of the resulting vectors
391
+ sum1 = vaddq_f32(res1.val[0], res1.val[1]);
392
+ sum2 = vaddq_f32(res2.val[0], res2.val[1]);
393
+ sum = vaddq_f32(sum1, sum2);
394
+
395
+ return sum;
396
+ }
397
+
398
+ template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a)
399
+ {
400
+ int32x2_t a_lo, a_hi, sum;
401
+
402
+ a_lo = vget_low_s32(a);
403
+ a_hi = vget_high_s32(a);
404
+ sum = vpadd_s32(a_lo, a_hi);
405
+ sum = vpadd_s32(sum, sum);
406
+ return vget_lane_s32(sum, 0);
407
+ }
408
+
409
+ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
410
+ {
411
+ int32x4x2_t vtrn1, vtrn2, res1, res2;
412
+ Packet4i sum1, sum2, sum;
413
+
414
+ // NEON zip performs interleaving of the supplied vectors.
415
+ // We perform two interleaves in a row to acquire the transposed vector
416
+ vtrn1 = vzipq_s32(vecs[0], vecs[2]);
417
+ vtrn2 = vzipq_s32(vecs[1], vecs[3]);
418
+ res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]);
419
+ res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]);
420
+
421
+ // Do the addition of the resulting vectors
422
+ sum1 = vaddq_s32(res1.val[0], res1.val[1]);
423
+ sum2 = vaddq_s32(res2.val[0], res2.val[1]);
424
+ sum = vaddq_s32(sum1, sum2);
425
+
426
+ return sum;
427
+ }
428
+
429
+ // Other reduction functions:
430
+ // mul
431
+ template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
432
+ {
433
+ float32x2_t a_lo, a_hi, prod;
434
+
435
+ // Get a_lo = |a1|a2| and a_hi = |a3|a4|
436
+ a_lo = vget_low_f32(a);
437
+ a_hi = vget_high_f32(a);
438
+ // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
439
+ prod = vmul_f32(a_lo, a_hi);
440
+ // Multiply prod with its swapped value |a2*a4|a1*a3|
441
+ prod = vmul_f32(prod, vrev64_f32(prod));
442
+
443
+ return vget_lane_f32(prod, 0);
444
+ }
445
+ template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
446
+ {
447
+ int32x2_t a_lo, a_hi, prod;
448
+
449
+ // Get a_lo = |a1|a2| and a_hi = |a3|a4|
450
+ a_lo = vget_low_s32(a);
451
+ a_hi = vget_high_s32(a);
452
+ // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
453
+ prod = vmul_s32(a_lo, a_hi);
454
+ // Multiply prod with its swapped value |a2*a4|a1*a3|
455
+ prod = vmul_s32(prod, vrev64_s32(prod));
456
+
457
+ return vget_lane_s32(prod, 0);
458
+ }
459
+
460
+ // min
461
+ template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
462
+ {
463
+ float32x2_t a_lo, a_hi, min;
464
+
465
+ a_lo = vget_low_f32(a);
466
+ a_hi = vget_high_f32(a);
467
+ min = vpmin_f32(a_lo, a_hi);
468
+ min = vpmin_f32(min, min);
469
+
470
+ return vget_lane_f32(min, 0);
471
+ }
472
+
473
+ template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
474
+ {
475
+ int32x2_t a_lo, a_hi, min;
476
+
477
+ a_lo = vget_low_s32(a);
478
+ a_hi = vget_high_s32(a);
479
+ min = vpmin_s32(a_lo, a_hi);
480
+ min = vpmin_s32(min, min);
481
+
482
+ return vget_lane_s32(min, 0);
483
+ }
484
+
485
+ // max
486
+ template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
487
+ {
488
+ float32x2_t a_lo, a_hi, max;
489
+
490
+ a_lo = vget_low_f32(a);
491
+ a_hi = vget_high_f32(a);
492
+ max = vpmax_f32(a_lo, a_hi);
493
+ max = vpmax_f32(max, max);
494
+
495
+ return vget_lane_f32(max, 0);
496
+ }
497
+
498
+ template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
499
+ {
500
+ int32x2_t a_lo, a_hi, max;
501
+
502
+ a_lo = vget_low_s32(a);
503
+ a_hi = vget_high_s32(a);
504
+ max = vpmax_s32(a_lo, a_hi);
505
+ max = vpmax_s32(max, max);
506
+
507
+ return vget_lane_s32(max, 0);
508
+ }
509
+
510
+ // this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
511
+ // see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
512
+ #define PALIGN_NEON(Offset,Type,Command) \
513
+ template<>\
514
+ struct palign_impl<Offset,Type>\
515
+ {\
516
+ EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
517
+ {\
518
+ if (Offset!=0)\
519
+ first = Command(first, second, Offset);\
520
+ }\
521
+ };\
522
+
523
+ PALIGN_NEON(0,Packet4f,vextq_f32)
524
+ PALIGN_NEON(1,Packet4f,vextq_f32)
525
+ PALIGN_NEON(2,Packet4f,vextq_f32)
526
+ PALIGN_NEON(3,Packet4f,vextq_f32)
527
+ PALIGN_NEON(0,Packet4i,vextq_s32)
528
+ PALIGN_NEON(1,Packet4i,vextq_s32)
529
+ PALIGN_NEON(2,Packet4i,vextq_s32)
530
+ PALIGN_NEON(3,Packet4i,vextq_s32)
531
+
532
+ #undef PALIGN_NEON
533
+
534
+ EIGEN_DEVICE_FUNC inline void
535
+ ptranspose(PacketBlock<Packet4f,4>& kernel) {
536
+ float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]);
537
+ float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]);
538
+
539
+ kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0]));
540
+ kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0]));
541
+ kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1]));
542
+ kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1]));
543
+ }
544
+
545
+ EIGEN_DEVICE_FUNC inline void
546
+ ptranspose(PacketBlock<Packet4i,4>& kernel) {
547
+ int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
548
+ int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]);
549
+ kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0]));
550
+ kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0]));
551
+ kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1]));
552
+ kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
553
+ }
554
+
555
+ //---------- double ----------
556
+
557
+ // Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrinsics for double.
558
+ // Confirmed at least with __apple_build_version__ = 6000054.
559
+ #ifdef __apple_build_version__
560
+ // Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
561
+ // https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with
562
+ // major toolchain updates.
563
+ #define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000)
564
+ #else
565
+ #define EIGEN_APPLE_DOUBLE_NEON_BUG 0
566
+ #endif
567
+
568
+ #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
569
+
570
+ // Bug 907: workaround missing declarations of the following two functions in the ADK
571
+ // Defining these functions as templates ensures that if these intrinsics are
572
+ // already defined in arm_neon.h, then our workaround doesn't cause a conflict
573
+ // and has lower priority in overload resolution.
574
+ template <typename T>
575
+ uint64x2_t vreinterpretq_u64_f64(T a)
576
+ {
577
+ return (uint64x2_t) a;
578
+ }
579
+
580
+ template <typename T>
581
+ float64x2_t vreinterpretq_f64_u64(T a)
582
+ {
583
+ return (float64x2_t) a;
584
+ }
585
+
586
+ typedef float64x2_t Packet2d;
587
+ typedef float64x1_t Packet1d;
588
+
589
+ template<> struct packet_traits<double> : default_packet_traits
590
+ {
591
+ typedef Packet2d type;
592
+ typedef Packet2d half;
593
+ enum {
594
+ Vectorizable = 1,
595
+ AlignedOnScalar = 1,
596
+ size = 2,
597
+ HasHalfPacket=0,
598
+
599
+ HasDiv = 1,
600
+ // FIXME check the Has*
601
+ HasSin = 0,
602
+ HasCos = 0,
603
+ HasLog = 0,
604
+ HasExp = 0,
605
+ HasSqrt = 0
606
+ };
607
+ };
608
+
609
+ template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
610
+
611
+ template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return vdupq_n_f64(from); }
612
+
613
+ template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
614
+ {
615
+ const double countdown_raw[] = {0.0,1.0};
616
+ const Packet2d countdown = vld1q_f64(countdown_raw);
617
+ return vaddq_f64(pset1<Packet2d>(a), countdown);
618
+ }
619
+ template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }
620
+
621
+ template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }
622
+
623
+ template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }
624
+
625
+ template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
626
+
627
+ template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); }
628
+
629
+ template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
630
+
631
+ #ifdef __ARM_FEATURE_FMA
632
+ // See bug 936. See above comment about FMA for float.
633
+ template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); }
634
+ #else
635
+ template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); }
636
+ #endif
637
+
638
+ template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }
639
+
640
+ template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }
641
+
642
+ // Logical operations are not supported for double, so we have to use reinterpret casts via NEON intrinsics
643
+ template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
644
+ {
645
+ return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
646
+ }
647
+
648
+ template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
649
+ {
650
+ return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
651
+ }
652
+
653
+ template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
654
+ {
655
+ return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
656
+ }
657
+
658
+ template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
659
+ {
660
+ return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
661
+ }
662
+
663
+ template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
664
+
665
+ template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
666
+
667
+ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
668
+ {
669
+ return vld1q_dup_f64(from);
670
+ }
671
+ template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); }
672
+
673
+ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); }
674
+
675
+ template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
676
+ {
677
+ Packet2d res = pset1<Packet2d>(0.0);
678
+ res = vsetq_lane_f64(from[0*stride], res, 0);
679
+ res = vsetq_lane_f64(from[1*stride], res, 1);
680
+ return res;
681
+ }
682
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
683
+ {
684
+ to[stride*0] = vgetq_lane_f64(from, 0);
685
+ to[stride*1] = vgetq_lane_f64(from, 1);
686
+ }
687
+ template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); }
688
+
689
+ // FIXME only store the 2 first elements ?
690
+ template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a, 0); }
691
+
692
+ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
693
+
694
+ template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
695
+
696
+ #if EIGEN_COMP_CLANG && defined(__apple_build_version__)
697
+ // workaround ICE, see bug 907
698
+ template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; }
699
+ #else
700
+ template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
701
+ #endif
702
+
703
+ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
704
+ {
705
+ float64x2_t trn1, trn2;
706
+
707
+ // NEON zip performs interleaving of the supplied vectors.
708
+ // We perform two interleaves in a row to acquire the transposed vector
709
+ trn1 = vzip1q_f64(vecs[0], vecs[1]);
710
+ trn2 = vzip2q_f64(vecs[0], vecs[1]);
711
+
712
+ // Do the addition of the resulting vectors
713
+ return vaddq_f64(trn1, trn2);
714
+ }
715
+ // Other reduction functions:
716
+ // mul
717
+ #if EIGEN_COMP_CLANG && defined(__apple_build_version__)
718
+ template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; }
719
+ #else
720
+ template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
721
+ #endif
722
+
723
+ // min
724
+ template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); }
725
+
726
+ // max
727
+ template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a, a), 0); }
728
+
729
+ // this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
730
+ // see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
731
+ #define PALIGN_NEON(Offset,Type,Command) \
732
+ template<>\
733
+ struct palign_impl<Offset,Type>\
734
+ {\
735
+ EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
736
+ {\
737
+ if (Offset!=0)\
738
+ first = Command(first, second, Offset);\
739
+ }\
740
+ };\
741
+
742
+ PALIGN_NEON(0,Packet2d,vextq_f64)
743
+ PALIGN_NEON(1,Packet2d,vextq_f64)
744
+ #undef PALIGN_NEON
745
+
746
+ EIGEN_DEVICE_FUNC inline void
747
+ ptranspose(PacketBlock<Packet2d,2>& kernel) {
748
+ float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
749
+ float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
750
+
751
+ kernel.packet[0] = trn1;
752
+ kernel.packet[1] = trn2;
753
+ }
754
+ #endif // EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
755
+
756
+ } // end namespace internal
757
+
758
+ } // end namespace Eigen
759
+
760
+ #endif // EIGEN_PACKET_MATH_NEON_H